From 92abd98a554f54fe63d8a96590db8719a2052285 Mon Sep 17 00:00:00 2001
From: Bnyro
Date: Mon, 22 Jun 2026 09:40:32 +0200
Subject: [PATCH] [feat] engines: add tusksearch (web, news, videos, images) (#6267)

The code that reads the value of variable `x` from `embed.js`, decodes it
to ASCII and based on that sets `window["tuskheader"]` and
`window["tuskkey"]` is attached below. The only real way to figure out
what this is doing is by stepping through it with the debugger, otherwise
it's almost hopeless.

```js
function fe() {
    const B = pe => pe.map(_e => String.fromCharCode(_e)).join(''),
        ae = window,
        o = ae.x;
    if (o?.length) {
        const pe = o.length / 2;
        for (let _e = 0; _e < pe; _e++)
            ae[B(o[_e])] = B(o[pe + _e]);
        ae.x = void 0
    }
}
```

Minimal script for testing the engine:

```py
import random
from json import loads

import requests

resp = requests.get("https://api.tusksearch.com/revcontent/embed.js")
data = loads(resp.text[6:])


def _decode(text: list[int]) -> str:
    return "".join([chr(x) for x in text])


header = _decode(data[3])
value = _decode(data[4])

resp = requests.get(
    "https://api.tusksearch.com/Search/Web?q=test&p=1&l=center&nextArgs=&prevArgs=",
    # "https://api.tusksearch.com/Search/Image?q=test&p=1&l=center",
    headers={
        header: value,
        'x-lon': str(random.random() * 90),
        'x-lat': str(random.random() * 90),
    },
)

print(resp.text)
```
---
 searx/engines/tusksearch.py | 192 ++++++++++++++++++++++++++++++++++++
 searx/settings.yml          |  29 ++++++
 2 files changed, 221 insertions(+)
 create mode 100644 searx/engines/tusksearch.py

diff --git a/searx/engines/tusksearch.py b/searx/engines/tusksearch.py
new file mode 100644
index 000000000..f85e17c12
--- /dev/null
+++ b/searx/engines/tusksearch.py
@@ -0,0 +1,192 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Tusksearch_ is an American search engine that claims to fight censorship.
+Its search results are (at least partially) from Brave.
+
+.. _Tusksearch: https://tusksearch.com/about
+"""
+
+from datetime import datetime
+from json import loads
+import random
+import typing as t
+from urllib.parse import urlencode
+from dateutil import parser
+
+from searx.exceptions import SearxEngineAPIException
+from searx.network import get
+from searx.utils import html_to_text
+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
+
+about = {
+    "website": "https://tusksearch.com",
+    "wikidata_id": None,
+    "official_api_documentation": None,
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "JSON",
+}
+
+paging = True
+
+categories = ["general"]
+tusk_categ = "web"
+"""Category to search in. Can be either "web", "images", "videos" or "news"."""
+
+
+api_url = "https://api.tusksearch.com"
+
+# Highest page number served per category; categories that are not listed
+# here (i.e. "web") are not limited by this engine.
+_max_pageno = {"images": 1, "news": 2, "videos": 2}
+
+
+def init(_):
+    """Reject an unknown ``tusk_categ`` when the engine is loaded."""
+    if tusk_categ not in ("web", "images", "videos", "news"):
+        raise ValueError(f"invalid search type: {tusk_categ}")
+
+
+def _obtain_x_sid() -> tuple[str, str]:
+    """Fetch a fresh session header (name, value) for one search request.
+
+    The session ID ("sid") is encoded as a byte array in ``embed.js``.
+    It is only valid for exactly one request, so we can't cache it.
+
+    The header key is usually called ``x-sid-{UUIDv4}``, and the value is
+    usually a plain UUIDv4 (but a different one than in the header key).
+
+    :raises SearxEngineAPIException: if ``embed.js`` can't be fetched or
+        its content doesn't have the expected layout.
+    """
+    resp = get(f"{api_url}/revcontent/embed.js")
+    if not resp.ok:
+        raise SearxEngineAPIException("failed to obtain request x-sid token")
+
+    def _byte_array_to_ascii(text: list[int]) -> str:
+        """
+        Converts a byte array (e.g. [83, 101, 97, 114, 88, 78, 71]) to the ASCII
+        string representation (e.g. "SearXNG").
+        """
+        return "".join([chr(x) for x in text])
+
+    # The payload is a JS assignment (``var x=[...]``): the JSON array starts
+    # after the first "="; a trailing ";" (if present) is not part of it.
+    payload = resp.text.partition("=")[2].strip().rstrip(";")
+    try:
+        data_array = loads(payload)
+        x_sid_header = _byte_array_to_ascii(data_array[3])
+        x_sid_value = _byte_array_to_ascii(data_array[4])
+    except (ValueError, IndexError, KeyError, TypeError) as exc:
+        raise SearxEngineAPIException("unexpected format of the x-sid token") from exc
+    return x_sid_header, x_sid_value
+
+
+def request(query: str, params: "OnlineParams") -> None:
+    """Set URL and headers of the search request in ``params``.
+
+    ``params["url"]`` is set to ``None`` (no request is sent) when the page
+    number is beyond what the category supports.
+    """
+    # images don't support pagination, news and videos only support two pages
+    max_pageno = _max_pageno.get(tusk_categ)
+    if max_pageno is not None and params["pageno"] > max_pageno:
+        params["url"] = None
+        return
+
+    args = {
+        "q": query,
+        "p": params["pageno"],
+        "l": "center",  # political direction: "left", "center" or "right"
+    }
+    if tusk_categ == "images":
+        params["url"] = f"{api_url}/Search/Image?{urlencode(args)}"
+    else:
+        # web response also contains news and videos
+        params["url"] = f"{api_url}/Search/Web?{urlencode(args)}"
+
+    x_sid_header, x_sid_value = _obtain_x_sid()
+    params["headers"] = {
+        x_sid_header: x_sid_value,
+        # required - we send a random longitude and latitude instead of the actual user location
+        'x-lon': str(random.random() * 90),
+        'x-lat': str(random.random() * 90),
+    }
+
+
+def _section_results(json_resp: dict[str, t.Any], section: str) -> list[dict[str, t.Any]]:
+    """Return the items of a response section (``web``, ``news`` or ``videos``)."""
+    return (json_resp.get(section) or {}).get("results") or []
+
+
+def _thumbnail_src(result: dict[str, t.Any]) -> str:
+    """Return the thumbnail URL of a result, an empty string if it has none."""
+    return (result.get("thumbnail") or {}).get("src") or ""
+
+
+def _parse_age(result: dict[str, t.Any]) -> t.Optional[datetime]:
+    """Parse the ``age`` field of a result, ``None`` if missing or unparsable."""
+    age = result.get("age")
+    if not age:
+        return None
+    try:
+        return parser.parse(age)
+    except (parser.ParserError, OverflowError, TypeError):
+        return None
+
+
+def response(resp: "SXNG_Response"):
+    """Convert the JSON response into results of the configured ``tusk_categ``."""
+    res = EngineResults()
+
+    json_resp = resp.json()["results"]
+
+    if tusk_categ == "web":
+        for result in _section_results(json_resp, "web"):
+            res.add(
+                res.types.MainResult(
+                    url=result["url"],
+                    title=html_to_text(result["title"]),
+                    content=html_to_text(result["description"]),
+                    thumbnail=_thumbnail_src(result),
+                )
+            )
+    elif tusk_categ == "news":
+        for result in _section_results(json_resp, "news"):
+            res.add(
+                res.types.MainResult(
+                    url=result["url"],
+                    title=html_to_text(result["title"]),
+                    content=html_to_text(result["description"]),
+                    thumbnail=_thumbnail_src(result),
+                    publishedDate=_parse_age(result),
+                )
+            )
+    elif tusk_categ == "videos":
+        for result in _section_results(json_resp, "videos"):
+            res.add(
+                res.types.LegacyResult(
+                    template="videos.html",
+                    url=result["url"],
+                    title=html_to_text(result["title"]),
+                    content=html_to_text(result["description"]),
+                    thumbnail=_thumbnail_src(result),
+                    publishedDate=_parse_age(result),
+                    length=(result.get("video") or {}).get("duration"),
+                )
+            )
+    elif tusk_categ == "images":
+        for result in json_resp:
+            res.add(
+                res.types.Image(
+                    url=result["url"],
+                    title=html_to_text(result["title"]),
+                    img_src=result["properties"]["url"],
+                    thumbnail_src=_thumbnail_src(result),
+                )
+            )
+
+    return res
diff --git a/searx/settings.yml b/searx/settings.yml
index 9fd9b984a..0261c0b7d 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -2383,6 +2383,35 @@ engines:
       - 5000
     inactive: true
 
+  - name: tusksearch
+    engine: tusksearch
+    shortcut: tu
+    tusk_categ: web
+    categories: general
+    disabled: true
+
+  - name: tusksearch images
+    engine: tusksearch
+    shortcut: tui
+    paging: false
+    tusk_categ: images
+    categories: images
+    disabled: true
+
+  - name: tusksearch videos
+    engine: tusksearch
+    shortcut: tuv
+    tusk_categ: videos
+    categories: videos
+    disabled: true
+
+  - name: tusksearch news
+    engine: tusksearch
+    shortcut: tun
+    tusk_categ: news
+    categories: news
+    disabled: true
+
   # tmp suspended - too slow, too many errors
   # - name: urbandictionary
   #   engine : xpath