mirror of
https://github.com/searxng/searxng.git
synced 2026-06-14 22:06:52 +02:00
031747f29e
Chatnoir is an open source search engine developed by universities, based on CommonCrawl (and others). It's uncommented by default - we don't want to overload the universities with bot traffic that targets SearXNG (sad truth why we can't have nice things anymore)
135 lines
3.8 KiB
Python
135 lines
3.8 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""Chatnoir is an open source search engine developed by Webis, a network of
|
|
researchers from the universities of Weimar, Halle and Leipzig. It supports
|
|
different text corpora as indexes, e.g. CommonCrawl. See its
|
|
`announcement`_ for more information.
|
|
|
|
.. _announcement : https://groups.google.com/g/common-crawl/c/3o2dOHpeRxo/m/H2Osqz9dAAAJ
|
|
"""
|
|
|
|
import typing as t
|
|
|
|
from searx.exceptions import SearxEngineAPIException
|
|
from searx.extended_types import SXNG_Response
|
|
from searx.network import get, post
|
|
from searx.result_types import EngineResults
|
|
from searx.utils import html_to_text
|
|
from searx.enginelib import EngineCache
|
|
|
|
if t.TYPE_CHECKING:
|
|
from searx.search.processors import OnlineParams
|
|
|
|
# Engine metadata.
about = {
    "website": "https://www.chatnoir.eu",
    "official_api_documentation": "https://www.chatnoir.eu/docs/api-general",
    "use_official_api": True,
    # a key is scraped from the website when ``api_key`` is left empty
    "require_api_key": False,
    "results": "JSON",
}

# Engine traits.
categories = ["general"]
paging = True
# number of hits requested per result page
page_size = 10

# Root URL the search endpoint is derived from.
base_url = "https://www.chatnoir.eu"

api_key = ""
"""Optional API key of your own.  When set, it is sent as bearer token and no
API key is scraped from the ChatNoir website."""

search_index = "cw22"
"""Name of the index to search in.  The `API documentation
<https://www.chatnoir.eu/docs/api-general>`_ lists all available indexes."""
|
|
|
|
|
|
CACHE: EngineCache
"""Engine cache holding the scraped session (CSRF token, session id and API
key); it is created in :py:obj:`setup`."""


def setup(engine_settings: dict[str, t.Any]) -> bool:
    """Create the engine's cache, keyed by the configured engine name.

    Always returns ``True``.
    """
    global CACHE  # pylint: disable=global-statement

    engine_name = engine_settings["name"]
    CACHE = EngineCache(engine_name)
    return True
|
|
|
|
|
|
def _obtain_api_key() -> tuple[str, str, str]:
    """Return ``(csrf_token, session_id, api_key)`` for an anonymous session.

    A session stored in :py:obj:`CACHE` under the key ``"session"`` is reused
    if present.  Otherwise a new one is scraped in two steps:

    1. ``GET`` the start page to receive the ``csrftoken`` cookie.
    2. ``POST`` to ``<base_url>/?init`` with that token to receive the
       ``sessionid`` cookie and the API key (``token.token`` in the JSON body).

    :raises SearxEngineAPIException: if one of the requests fails or a response
        lacks the expected cookie / JSON field.
    """
    cached_session = CACHE.get("session")
    if cached_session:
        parts = cached_session.split("|")
        # a malformed cache entry is ignored and a fresh session is scraped
        if len(parts) == 3:
            return parts[0], parts[1], parts[2]

    home_resp = get(base_url)
    if not home_resp.ok:
        raise SearxEngineAPIException("failed to obtain api key")
    try:
        csrf_token = home_resp.cookies["csrftoken"]
    except KeyError as exc:
        raise SearxEngineAPIException("failed to obtain api key") from exc

    token_resp = post(
        # derived from base_url so that a customized base_url is honored
        f"{base_url}/?init",
        headers={
            "Referer": f"{base_url}/",
            "X-Requested-With": "XMLHttpRequest",
            "X-Csrf-Token": csrf_token,
        },
        cookies=home_resp.cookies,
    )
    if not token_resp.ok:
        raise SearxEngineAPIException("failed to obtain api key")
    try:
        session_id = token_resp.cookies["sessionid"]
        scraped_api_key = token_resp.json()["token"]["token"]
    except (KeyError, TypeError, ValueError) as exc:
        # missing cookie, unexpected JSON structure or a body that isn't JSON
        raise SearxEngineAPIException("failed to obtain api key") from exc

    # session keys seem to become rate-limited very fast, so only remembering
    # for 1 minute here
    CACHE.set("session", f"{csrf_token}|{session_id}|{scraped_api_key}", expire=60)

    return csrf_token, session_id, scraped_api_key
|
|
|
|
|
|
def request(query: str, params: "OnlineParams"):
    """Assemble the ``POST`` request sent to ChatNoir's search API.

    ``params`` is modified in place:

    - ``headers`` gets the ``Authorization`` bearer token.  Without a
      configured :py:obj:`api_key` a scraped session is used instead, which
      additionally sets the ``X-Csrf-Token`` header and the ``csrftoken`` /
      ``sessionid`` cookies.
    - ``url`` and ``method`` point to ``<base_url>/api/v1/_search`` (``POST``).
    - ``json`` carries the query, the index to search, and the paging window
      derived from ``params["pageno"]`` and :py:obj:`page_size`.
    """
    if api_key:
        # use user-provided API key instead of scraping one
        bearer_token = api_key
    else:
        csrf_token, session_id, bearer_token = _obtain_api_key()
        params["headers"]["X-Csrf-Token"] = csrf_token
        # the csrftoken cookie has to carry the CSRF token (not the session
        # id), so that it matches the X-Csrf-Token header
        params["cookies"] = {"csrftoken": csrf_token, "sessionid": session_id}

    params["headers"]["Authorization"] = f"Bearer {bearer_token}"

    params["url"] = f"{base_url}/api/v1/_search"
    params["method"] = "POST"
    params["json"] = {
        "query": query,
        "index": [
            search_index,
        ],
        # pageno is 1-based, the API expects a 0-based offset
        "from": (params["pageno"] - 1) * page_size,
        "size": page_size,
        "_extended_meta": True,
    }
|
|
|
|
|
|
def response(resp: "SXNG_Response") -> EngineResults:
    """Turn the ``results`` list of ChatNoir's JSON reply into main results.

    Title and snippet are passed through ``html_to_text`` before they are
    stored in the result.
    """
    res = EngineResults()

    for hit in resp.json()["results"]:
        main_result = res.types.MainResult(
            url=hit["target_uri"],
            title=html_to_text(hit["title"]),
            content=html_to_text(hit["snippet"]),
        )
        res.add(main_result)

    return res
|