From 031747f29ed06c1a67d75538662eae72f199e35c Mon Sep 17 00:00:00 2001
From: Bnyro
Date: Sat, 13 Jun 2026 13:52:01 +0200
Subject: [PATCH] [feat] engines: add chatnoir general engine (#6183)

Chatnoir is an open source search engine developed by universities,
based on CommonCrawl (and others).

It's commented out by default - we don't want to overload the universities
with bot traffic that targets SearXNG (sad truth why we can't have nice
things anymore)
---
Review notes (text between "---" and the diffstat is ignored by git-am):
the csrftoken cookie now carries the CSRF token instead of the session id,
session scraping reports missing cookies / tokens as SearxEngineAPIException,
and the line counts below were updated accordingly.  The blob id in the
"index" line still refers to the originally submitted file.

 searx/engines/chatnoir.py | 147 ++++++++++++++++++++++++++++++++++++++
 searx/settings.yml        |   6 ++
 2 files changed, 153 insertions(+)
 create mode 100644 searx/engines/chatnoir.py

diff --git a/searx/engines/chatnoir.py b/searx/engines/chatnoir.py
new file mode 100644
index 000000000..838bdb725
--- /dev/null
+++ b/searx/engines/chatnoir.py
@@ -0,0 +1,147 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Chatnoir is an open source search engine developed by Webis, a network of
+researchers from the universities of Weimar, Halle and Leipzig. It supports
+different text corpora as indexes, e.g. CommonCrawl. See its
+`announcement`_ for more information.
+
+.. _announcement : https://groups.google.com/g/common-crawl/c/3o2dOHpeRxo/m/H2Osqz9dAAAJ
+"""
+
+import typing as t
+
+from searx.exceptions import SearxEngineAPIException
+from searx.extended_types import SXNG_Response
+from searx.network import get, post
+from searx.result_types import EngineResults
+from searx.utils import html_to_text
+from searx.enginelib import EngineCache
+
+if t.TYPE_CHECKING:
+    from searx.search.processors import OnlineParams
+
+about = {
+    "website": "https://www.chatnoir.eu",
+    "official_api_documentation": "https://www.chatnoir.eu/docs/api-general",
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": "JSON",
+}
+
+base_url = "https://www.chatnoir.eu"
+categories = ["general"]
+
+paging = True
+page_size = 10
+
+api_key = ""
+"""You can optionally provide your own API key here.  This one will then be used
+instead of scraping an API key."""
+
+search_index = "cw22"
+"""Search index to browse in. See `the API documentation
+<https://www.chatnoir.eu/docs/api-general>`_ for a full list."""
+
+
+CACHE: EngineCache
+"""Cache to store session info (i.e. api key, csrf token, session id)."""
+
+
+def setup(engine_settings: dict[str, t.Any]) -> bool:
+    """Create the engine cache, named after the configured engine name."""
+    global CACHE  # pylint: disable=global-statement
+    CACHE = EngineCache(engine_settings["name"])
+    return True
+
+
+def _obtain_api_key() -> tuple[str, str, str]:
+    """Return the ``(csrf_token, session_id, api_key)`` triple of a session.
+
+    A session stored in :py:obj:`CACHE` is reused; otherwise a new one is
+    scraped from the website and cached.
+
+    :raises SearxEngineAPIException: if a request fails or the responses do
+        not contain the expected cookies / token.
+    """
+    cached_session = CACHE.get("session")
+    if cached_session:
+        # the API key is the last field, so split on the first two "|" only
+        csrf_token, session_id, scraped_api_key = cached_session.split("|", 2)
+        return csrf_token, session_id, scraped_api_key
+
+    home_resp = get(base_url)
+    csrf_token = home_resp.cookies.get("csrftoken")
+    if not home_resp.ok or not csrf_token:
+        raise SearxEngineAPIException("failed to obtain api key")
+
+    token_resp = post(
+        f"{base_url}/?init",
+        headers={
+            "Referer": f"{base_url}/",
+            "X-Requested-With": "XMLHttpRequest",
+            "X-Csrf-Token": csrf_token,
+        },
+        cookies=home_resp.cookies,
+    )
+    if not token_resp.ok:
+        raise SearxEngineAPIException("failed to obtain api key")
+
+    session_id = token_resp.cookies.get("sessionid")
+    try:
+        scraped_api_key = token_resp.json()["token"]["token"]
+    except (KeyError, TypeError, ValueError) as exc:
+        raise SearxEngineAPIException("failed to obtain api key") from exc
+    if not session_id or not scraped_api_key:
+        raise SearxEngineAPIException("failed to obtain api key")
+
+    # session keys seem to become rate-limited very fast, so only remembering
+    # for 1 minute here
+    CACHE.set("session", f"{csrf_token}|{session_id}|{scraped_api_key}", expire=60)
+
+    return csrf_token, session_id, scraped_api_key
+
+
+def request(query: str, params: "OnlineParams") -> None:
+    """Fill ``params`` with the POST request for ChatNoir's search API.
+
+    The configured :py:obj:`api_key` is used when set; otherwise the request is
+    sent with the credentials of a scraped session.
+    """
+    if api_key:
+        # use user-provided API key instead of scraping one
+        params["headers"]["Authorization"] = f"Bearer {api_key}"
+    else:
+        csrf_token, session_id, scraped_api_key = _obtain_api_key()
+        params["headers"].update(
+            {
+                "Authorization": f"Bearer {scraped_api_key}",
+                "X-Csrf-Token": csrf_token,
+            }
+        )
+        # the csrftoken cookie carries the CSRF token, not the session id
+        params["cookies"] = {"csrftoken": csrf_token, "sessionid": session_id}
+
+    params["url"] = f"{base_url}/api/v1/_search"
+    params["method"] = "POST"
+    params["json"] = {
+        "query": query,
+        "index": [search_index],
+        "from": (params["pageno"] - 1) * page_size,
+        "size": page_size,
+        "_extended_meta": True,
+    }
+
+
+def response(resp: "SXNG_Response") -> EngineResults:
+    """Convert the JSON answer of the search API into main results."""
+    res = EngineResults()
+
+    for result in resp.json()["results"]:
+        res.add(
+            res.types.MainResult(
+                url=result["target_uri"],
+                title=html_to_text(result["title"]),
+                content=html_to_text(result["snippet"]),
+            )
+        )
+
+    return res
diff --git a/searx/settings.yml b/searx/settings.yml
index 97573f241..9e2fd9804 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -609,6 +609,12 @@ engines:
     shortcut: ca
     disabled: true
 
+  # - name: chatnoir
+  #   engine: chatnoir
+  #   shortcut: cha
+  #   search_index: cw22
+  #   disabled: true
+
   - name: chefkoch
     engine: chefkoch
     shortcut: chef