mirror of
https://github.com/searxng/searxng.git
synced 2026-06-14 22:06:52 +02:00
[feat] engines: add chatnoir general engine (#6183)
Chatnoir is an open source search engine developed by universities, based on CommonCrawl (and others). It's commented out by default - we don't want to overload the universities with bot traffic that targets SearXNG (sad truth why we can't have nice things anymore)
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Chatnoir is an open source search engine developed by Webis, a network of
|
||||
researchers from the universities of Weimar, Halle and Leipzig. It supports
|
||||
different text corpora as indexes, e.g. CommonCrawl. See its
|
||||
`announcement`_ for more information.
|
||||
|
||||
.. _announcement : https://groups.google.com/g/common-crawl/c/3o2dOHpeRxo/m/H2Osqz9dAAAJ
|
||||
"""
|
||||
|
||||
import typing as t
|
||||
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.extended_types import SXNG_Response
|
||||
from searx.network import get, post
|
||||
from searx.result_types import EngineResults
|
||||
from searx.utils import html_to_text
|
||||
from searx.enginelib import EngineCache
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from searx.search.processors import OnlineParams
|
||||
|
||||
# Engine metadata.
about = {
    "website": "https://www.chatnoir.eu",
    "official_api_documentation": "https://www.chatnoir.eu/docs/api-general",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}

# Root URL of the Chatnoir instance; the search endpoint is derived from it.
base_url = "https://www.chatnoir.eu"
categories = ["general"]

paging = True
# Number of hits requested per page; also used to compute the result offset.
page_size = 10

api_key = ""
"""Optional API key.  When set, it is sent as bearer token and no API key is
scraped from the website."""

search_index = "cw22"
"""Name of the index to search in.  The `API documentation
<https://www.chatnoir.eu/docs/api-general>`_ lists the available indexes."""


CACHE: EngineCache
"""Engine cache holding the session info (API key, CSRF token and session
id)."""
|
||||
|
||||
|
||||
def setup(engine_settings: dict[str, t.Any]) -> bool:
    """Create the module-level ``CACHE`` for this engine.

    The cache is an ``EngineCache`` named after the ``name`` entry of
    ``engine_settings``.  Always returns ``True``.
    """
    global CACHE  # pylint: disable=global-statement

    engine_name = engine_settings["name"]
    CACHE = EngineCache(engine_name)
    return True
|
||||
|
||||
|
||||
def _obtain_api_key() -> tuple[str, str, str]:
    """Return ``(csrf_token, session_id, api_key)`` of an anonymous session.

    A session stored in ``CACHE`` under the key ``"session"`` is reused when
    present and well-formed.  Otherwise the home page at ``base_url`` is
    fetched to get the ``csrftoken`` cookie, and ``{base_url}/?init`` is then
    POSTed to get the ``sessionid`` cookie and the API token.

    :raises SearxEngineAPIException: if one of the two requests fails or the
        expected cookie / token is missing from the response.
    """
    cached_session = CACHE.get("session")
    if cached_session:
        parts = cached_session.split("|")
        # Only trust the cache entry if it has exactly the three fields that
        # are written below; anything else is treated as a cache miss.
        if len(parts) == 3:
            return parts[0], parts[1], parts[2]

    home_resp = get(base_url)
    if not home_resp.ok:
        raise SearxEngineAPIException("failed to obtain api key")
    try:
        csrf_token = home_resp.cookies["csrftoken"]
    except KeyError as exc:
        raise SearxEngineAPIException("failed to obtain api key") from exc

    token_resp = post(
        # derived from base_url, so a customized base_url is honored here too
        f"{base_url}/?init",
        headers={
            "Referer": f"{base_url}/",
            "X-Requested-With": "XMLHttpRequest",
            "X-Csrf-Token": csrf_token,
        },
        cookies=home_resp.cookies,
    )
    if not token_resp.ok:
        raise SearxEngineAPIException("failed to obtain api key")
    try:
        session_id = token_resp.cookies["sessionid"]
        scraped_api_key = token_resp.json()["token"]["token"]
    except (KeyError, TypeError, ValueError) as exc:
        # missing cookie, unexpected JSON structure or undecodable body
        raise SearxEngineAPIException("failed to obtain api key") from exc

    # session keys seem to become rate-limited very fast, so only remembering
    # for 1 minute here
    CACHE.set("session", f"{csrf_token}|{session_id}|{scraped_api_key}", expire=60)

    return csrf_token, session_id, scraped_api_key
|
||||
|
||||
|
||||
def request(query: str, params: "OnlineParams"):
    """Fill ``params`` with the POST request for the Chatnoir search API.

    If ``api_key`` is configured it is sent as bearer token.  Otherwise a
    session is obtained via ``_obtain_api_key()`` and its scraped API key,
    CSRF token and session id are sent as headers and cookies.

    The request body asks for ``page_size`` hits from ``search_index``,
    starting at the offset that corresponds to ``params["pageno"]``.
    """
    if api_key:
        # use user-provided API key instead of scraping one
        params["headers"].update({"Authorization": f"Bearer {api_key}"})
    else:
        csrf_token, session_id, scraped_api_key = _obtain_api_key()

        params["headers"].update(
            {
                "Authorization": f"Bearer {scraped_api_key}",
                "X-Csrf-Token": csrf_token,
            }
        )
        # The csrftoken cookie must carry the CSRF token (the same value as
        # the X-Csrf-Token header), not the session id.
        params["cookies"] = {"csrftoken": csrf_token, "sessionid": session_id}

    params["url"] = f"{base_url}/api/v1/_search"
    params["method"] = "POST"
    params["json"] = {
        "query": query,
        "index": [
            search_index,
        ],
        # pageno is 1-based, the API offset is 0-based
        "from": (params["pageno"] - 1) * page_size,
        "size": page_size,
        "_extended_meta": True,
    }
|
||||
|
||||
|
||||
def response(resp: "SXNG_Response") -> EngineResults:
    """Turn the JSON search response into main results.

    Each entry of the response's ``results`` list becomes one ``MainResult``:
    the URL is taken from ``target_uri``, while ``title`` and ``snippet`` are
    passed through ``html_to_text`` first.
    """
    engine_results = EngineResults()

    for hit in resp.json()["results"]:
        item = engine_results.types.MainResult(
            url=hit["target_uri"],
            title=html_to_text(hit["title"]),
            content=html_to_text(hit["snippet"]),
        )
        engine_results.add(item)

    return engine_results
|
||||
@@ -609,6 +609,12 @@ engines:
|
||||
shortcut: ca
|
||||
disabled: true
|
||||
|
||||
# - name: chatnoir
|
||||
# engine: chatnoir
|
||||
# shortcut: cha
|
||||
# search_index: cw22
|
||||
# disabled: true
|
||||
|
||||
- name: chefkoch
|
||||
engine: chefkoch
|
||||
shortcut: chef
|
||||
|
||||
Reference in New Issue
Block a user