From 31a8a22aa60e7a2fffbe4d5ba7cd4da5fee73785 Mon Sep 17 00:00:00 2001
From: Bnyro
Date: Sun, 14 Jun 2026 08:46:07 +0200
Subject: [PATCH] [feat] engines: add German tonline engine (general, news, images, videos) (#6250)

T-Online_ is a German news portal. It gets its web results from Google, image
results from Flickr and video results from YouTube.

For images and videos, it additionally returns results from its news catalog.
However, for pagination we have to specify the result type (e.g. either videos
from YouTube or from T-Online), so we use flickr/youtube there instead of
tonline because the tonline results are usually irrelevant.
---
 searx/engines/tonline.py | 163 +++++++++++++++++++++++++++++++++++++++
 searx/settings.yml       |  30 +++++++
 2 files changed, 193 insertions(+)
 create mode 100644 searx/engines/tonline.py

diff --git a/searx/engines/tonline.py b/searx/engines/tonline.py
new file mode 100644
index 000000000..cfb1dbb35
--- /dev/null
+++ b/searx/engines/tonline.py
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""T-Online_ is a German news portal, which is powered by Ströer, a German
+advertising company, not by Deutsche Telekom (contrary to its name).
+
+It gets its web results from Google, image results from Flickr and video
+results from YouTube.
+
+.. _T-Online: https://www.t-online.de/
+
+"""
+
+import typing as t
+from urllib.parse import urlencode
+
+from lxml import html
+
+from searx.utils import eval_xpath_list, eval_xpath, extract_text, get_embeded_stream_url, ElementType
+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
+
+about = {
+    "website": "https://www.t-online.de",
+    "wikidata_id": "Q590940",
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+    "language": "de",
+}
+
+paging = True
+time_range_support = True
+
+base_url = "https://suche.t-online.de"
+tonline_categ = "web"
+"""Supported categories are ``web``, ``videos``, ``news`` and ``images``."""
+
+time_range_map = {"day": "d", "week": "w", "month": "m", "year": "y"}
+
+# result provider has to be specified during pagination, pagination can alternatively
+# use "tonline" to only search for results from t-online news articles
+tonline_channel_map = {"images": "flickr", "videos": "yt"}
+
+
+def init(_):
+    """Check that ``tonline_categ`` is one of the supported categories.
+
+    :raises ValueError: if ``tonline_categ`` is not supported
+    """
+    if tonline_categ not in ("web", "images", "videos", "news"):
+        raise ValueError(f"invalid category: {tonline_categ}")
+
+
+def request(query: str, params: "OnlineParams") -> None:
+    """Build the search URL for ``query`` and store it in ``params["url"]``."""
+    # "mandant", "dia" and "ptl" are not needed, but this might reduce chances of captchas
+    args = {"q": query, "mandant": "toi", "dia": "suche", "ptl": "std"}
+    if params["time_range"]:
+        args["age"] = time_range_map[params["time_range"]]
+
+    if params["pageno"] > 1 and tonline_categ in tonline_channel_map:
+        # on follow-up pages the channel has its own page argument, e.g. ``yt_page``
+        ch = tonline_channel_map[tonline_categ]
+        args["ch"] = ch
+        args[f"{ch}_page"] = str(params["pageno"])
+    else:
+        args["page"] = str(params["pageno"])
+
+    params["url"] = f"{base_url}/{tonline_categ}?{urlencode(args)}"
+
+
+def _text(node: ElementType, xpath: str) -> str:
+    """Return the text matched by ``xpath`` below ``node``, or an empty string if nothing matches."""
+    return extract_text(eval_xpath(node, xpath) or "") or ""
+
+
+def _general_results(doc: ElementType, res: EngineResults):
+    """Add the web results and the suggestions found in ``doc`` to ``res``."""
+    result: ElementType
+    for result in eval_xpath_list(doc, "//div[@id='google_re']/div[contains(@class, 'doc')]"):
+        res.add(
+            res.types.MainResult(
+                url=_text(result, "./a/@href"),
+                title=_text(result, ".//span[contains(@class, 'tMMReshl')]"),
+                content=_text(result, ".//div[contains(@class, 'tMMRest')]"),
+            )
+        )
+    suggestion: ElementType
+    for suggestion in eval_xpath_list(doc, "//div[starts-with(@class, 'rsbl')]/a"):
+        res.add(res.types.LegacyResult({"suggestion": extract_text(suggestion)}))
+
+
+def _image_results(doc: ElementType, res: EngineResults):
+    """Add the image results found in ``doc`` to ``res``."""
+    result: ElementType
+    for result in eval_xpath_list(doc, "//div[@class='doc']"):
+        res.add(
+            res.types.Image(
+                url=_text(result, "./a/@href"),
+                title=_text(result, ".//div[contains(@class, 'doc_info')]"),
+                thumbnail_src=_text(result, ".//img/@src"),
+            )
+        )
+
+
+def _news_results(doc: ElementType, res: EngineResults):
+    """Add the news results found in ``doc`` to ``res``."""
+    result: ElementType
+    title_parts: list[ElementType]
+    for result in eval_xpath_list(doc, "//div[@id='portal_re']/div[contains(@class, 'doc')]"):
+        title_parts = eval_xpath(result, ".//a[starts-with(@class, 'tMMReshl')]")
+        res.add(
+            res.types.MainResult(
+                url=_text(result, "(./a/@href)[1]"),
+                title=" - ".join(extract_text(part) or "" for part in title_parts),
+                content=_text(result, ".//div[contains(@class, 'tMMRest')]"),
+                thumbnail=_text(result, ".//img[contains(@class, 'desk')]/@src"),
+            )
+        )
+
+
+def _video_results(doc: ElementType, res: EngineResults):
+    """Add the video results found in ``doc`` to ``res``, skipping entries without a link."""
+    result: ElementType
+    for result in eval_xpath_list(doc, "//div[@class='doc']"):
+        url = _text(result, "./a/@href")
+        # a missing link yields an empty string (never ``None``), so test for falsiness
+        if not url:
+            continue
+        title_parts: list[ElementType] = eval_xpath(result, ".//a[starts-with(@class, 'tMMReshl')]")
+        res.add(
+            res.types.LegacyResult(
+                template="videos.html",
+                url=url,
+                title=" - ".join(extract_text(part) or "" for part in title_parts),
+                thumbnail=_text(result, ".//img/@src"),
+                iframe_src=get_embeded_stream_url(url) or "",
+            )
+        )
+
+
+def response(resp: "SXNG_Response") -> EngineResults:
+    """Parse the HTML in ``resp`` with the parser that belongs to ``tonline_categ``.
+
+    :raises ValueError: if ``tonline_categ`` is not supported
+    """
+    doc = html.fromstring(resp.text)
+    res = EngineResults()
+    match tonline_categ:
+        case "web":
+            _general_results(doc, res)
+        case "news":
+            _news_results(doc, res)
+        case "images":
+            _image_results(doc, res)
+        case "videos":
+            _video_results(doc, res)
+        case _:
+            raise ValueError(f"invalid category: {tonline_categ}")
+    return res
diff --git a/searx/settings.yml b/searx/settings.yml
index 3c6e08db7..87196fa39 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -2313,6 +2313,36 @@ engines:
     shortcut: tm
     disabled: true
 
+  - name: tonline
+    engine: tonline
+    shortcut: tol
+    disabled: true
+    inactive: true
+
+  - name: tonline images
+    engine: tonline
+    categories: images
+    tonline_categ: images
+    shortcut: toli
+    disabled: true
+    inactive: true
+
+  - name: tonline videos
+    engine: tonline
+    categories: videos
+    tonline_categ: videos
+    shortcut: tolv
+    disabled: true
+    inactive: true
+
+  - name: tonline news
+    engine: tonline
+    categories: news
+    tonline_categ: news
+    shortcut: toln
+    disabled: true
+    inactive: true
+
   # Requires Tor
   - name: torch
     engine: xpath