diff --git a/searx/engines/tonline.py b/searx/engines/tonline.py new file mode 100644 index 000000000..cfb1dbb35 --- /dev/null +++ b/searx/engines/tonline.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""T-Online_ is a German news portal, which is powered by Ströer, a German +advertising company, not by Deutsche Telekom (contrary to its name). + +It gets its web results from Google, image results from Flickr and videos +results from YouTube. + +.. _T-Online: https://www.t-online.de/ + +""" + +import typing as t +from urllib.parse import urlencode + +from lxml import html + +from searx.utils import eval_xpath_list, eval_xpath, extract_text, get_embeded_stream_url, ElementType +from searx.result_types import EngineResults + +if t.TYPE_CHECKING: + from searx.extended_types import SXNG_Response + from searx.search.processors import OnlineParams + +about = { + "website": "https://www.t-online.de", + "wikidata_id": "Q590940", + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": "HTML", + "language": "de", +} + +paging = True +time_range_support = True + +base_url = "https://suche.t-online.de" +tonline_categ = "web" +"""Supported categories are ``web``, ``videos``, ``news`` and ``images``.""" + +time_range_map = {"day": "d", "week": "w", "month": "m", "year": "y"} + +# result provider has to be specified during pagination, pagination can alternatively +# use "tonline" to only search for results from t-online news articles +tonline_channel_map = {"images": "flickr", "videos": "yt"} + + +def init(_): + if tonline_categ not in ("web", "images", "videos", "news"): + raise ValueError("invalid category: %s" % tonline_categ) + + +def request(query: str, params: "OnlineParams") -> None: + # "mandant", "dia" and "ptl" are not needed, but this might reduce changes of captchas + args = {"q": query, "mandant": "toi", "dia": "suche", "ptl": "std"} + if params["time_range"]: + args["age"] = time_range_map[params["time_range"]] + + if params["pageno"] > 1 and tonline_categ in tonline_channel_map: + ch = tonline_channel_map[tonline_categ] + args["ch"] = ch + args[f"{ch}_page"] = str(params["pageno"]) + else: + args["page"] = str(params["pageno"]) + + params["url"] = f"{base_url}/{tonline_categ}?{urlencode(args)}" + + +def _general_results(doc: ElementType, res: EngineResults): + result: ElementType + for result in eval_xpath_list(doc, "//div[@id='google_re']/div[contains(@class, 'doc')]"): + ( + res.add( + res.types.MainResult( + url=extract_text(eval_xpath(result, "./a/@href") or ""), + title=extract_text(eval_xpath(result, ".//span[contains(@class, 'tMMReshl')]") or "") or "", + content=extract_text(eval_xpath(result, ".//div[contains(@class, 'tMMRest')]") or "") or "", + ), + ) + ) + suggestion: ElementType + for suggestion in eval_xpath_list(doc, "//div[starts-with(@class, 'rsbl')]/a"): + res.add(res.types.LegacyResult({"suggestion": extract_text(suggestion)})) + + +def _image_results(doc: ElementType, res: EngineResults): + result: ElementType + for result in eval_xpath_list(doc, "//div[@class='doc']"): + ( + res.add( + res.types.Image( + url=extract_text(eval_xpath(result, "./a/@href") or ""), + title=extract_text(eval_xpath(result, ".//div[contains(@class, 'doc_info')]") or "") or "", + thumbnail_src=extract_text(eval_xpath(result, ".//img/@src") or "") or "", + ), + ) + ) + + +def _news_results(doc: ElementType, res: EngineResults): + result: ElementType + title_parts: list[ElementType] + for result in eval_xpath_list(doc, "//div[@id='portal_re']/div[contains(@class, 'doc')]"): + title_parts = eval_xpath(result, ".//a[starts-with(@class, 'tMMReshl')]") + ( + res.add( + res.types.MainResult( + url=extract_text(eval_xpath(result, "(./a/@href)[1]") or ""), + title=" - ".join(extract_text(part) or "" for part in title_parts), + content=extract_text(eval_xpath(result, ".//div[contains(@class, 'tMMRest')]") or "") or "", + thumbnail=extract_text(eval_xpath(result, ".//img[contains(@class, 'desk')]/@src") or "") or "", + ), + ) + ) + + +def _video_results(doc: ElementType, res: EngineResults): + result: ElementType + for result in eval_xpath_list(doc, "//div[@class='doc']"): + url: str | None = extract_text(eval_xpath(result, "./a/@href") or "") + if url is None: + continue + title_parts: list[ElementType] = eval_xpath(result, ".//a[starts-with(@class, 'tMMReshl')]") + res.add( + res.types.LegacyResult( + template="videos.html", + url=url, + title=" - ".join(extract_text(part) or "" for part in title_parts), + thumbnail=extract_text(eval_xpath(result, ".//img/@src") or "") or "", + iframe_src=get_embeded_stream_url(url) or "", + ) + ) + + +def response(resp: "SXNG_Response") -> EngineResults: + doc = html.fromstring(resp.text) + res = EngineResults() + match tonline_categ: + case "web": + _general_results(doc, res) + case "news": + _news_results(doc, res) + case "images": + _image_results(doc, res) + case "videos": + _video_results(doc, res) + case _: + raise ValueError("invalid category: %s" % tonline_categ) + return res diff --git a/searx/settings.yml b/searx/settings.yml index 3c6e08db7..87196fa39 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -2313,6 +2313,36 @@ engines: shortcut: tm disabled: true + - name: tonline + engine: tonline + shortcut: tol + disabled: true + inactive: true + + - name: tonline images + engine: tonline + categories: images + tonline_categ: images + shortcut: toli + disabled: true + inactive: true + + - name: tonline videos + engine: tonline + categories: videos + tonline_categ: videos + shortcut: tolv + disabled: true + inactive: true + + - name: tonline news + engine: tonline + categories: news + tonline_categ: news + shortcut: toln + disabled: true + inactive: true + # Requires Tor - name: torch engine: xpath