Files
searxng/searx/engines/tusksearch.py
T
Bnyro 92abd98a55 [feat] engines: add tusksearch (web, news, videos, images) (#6267)
The code that reads the value of the variable `x` from `embed.js`, decodes
it to ASCII, and uses the result to set `window["tuskheader"]` and `window["tuskkey"]`
is attached below. The only practical way to understand what it does is
to step through it with a debugger; reading it statically is almost hopeless.

```js
// Expands the global array `window.x` into individual `window` properties
// and then clears `window.x`.
function fe() {
  // B turns an array of character codes into the string they spell.
  const B = pe => pe.map(_e => String.fromCharCode(_e)).join(''),
  ae = window,
  o = ae.x;
  // Only run when `x` exists and is non-empty.
  if (o?.length) {
    // The first half of the array holds property names, the second half
    // holds the matching values at the same offset.
    const pe = o.length / 2;
    for (let _e = 0; _e < pe; _e++) ae[B(o[_e])] = B(o[pe + _e]);
    // Drop the raw array once it has been consumed.
    ae.x = void 0
  }
}
```

Minimal script for testing the engine:

```py
import random
from json import loads
import requests


def _decode(text: list[int]) -> str:
    """Turn a list of character codes into the string they spell."""
    return "".join(map(chr, text))


# Fetch the script carrying the one-time token and parse everything after
# its first six characters as JSON.
embed = requests.get("https://api.tusksearch.com/revcontent/embed.js")
data = loads(embed.text[6:])

# Entries 3 and 4 hold the header name and the header value.
header = _decode(data[3])
value = _decode(data[4])

request_headers = {
    header: value,
    'x-lon': str(random.random() * 90),
    'x-lat': str(random.random() * 90),
}

resp = requests.get(
    "https://api.tusksearch.com/Search/Web?q=test&p=1&l=center&nextArgs=&prevArgs=",
    # "https://api.tusksearch.com/Search/Image?q=test&p=1&l=center",
    headers=request_headers,
)
print(resp.text)
```
2026-06-22 09:40:32 +02:00

163 lines
5.3 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Tusksearch_ is an American search engine that claims to fight censorship.
Its search results are (at least partially) from Brave.

.. _Tusksearch: https://tusksearch.com/about
"""
from json import loads
import random
import typing as t
from urllib.parse import urlencode
from dateutil import parser
from searx.exceptions import SearxEngineAPIException
from searx.network import get
from searx.utils import html_to_text
from searx.result_types import EngineResults
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Engine metadata.
about = dict(
    website="https://tusksearch.com",
    wikidata_id=None,
    official_api_documentation=None,
    use_official_api=False,
    require_api_key=False,
    results="JSON",
)

# Base URL shared by all API endpoints used by this engine.
api_url = "https://api.tusksearch.com"

# Engine capabilities and default category assignment.
categories = ["general"]
paging = True

tusk_categ = "web"
"""Category to search in. Can be either "web", "images", "videos" or "news"."""
def init(_):
    """Check the engine configuration.

    Raises ``ValueError`` when ``tusk_categ`` is not one of the categories
    this engine knows how to query.
    """
    if tusk_categ in ("web", "images", "videos", "news"):
        return
    raise ValueError("invalid search type: %s" % tusk_categ)
def _decode_x_sid(script: str) -> tuple[str, str]:
    """Extract the session header name and value from the ``embed.js`` body.

    The script is a single assignment (``var x=[...]``) whose right-hand side
    is a JSON array of byte arrays.  Entry 3 spells the header name and
    entry 4 spells the header value.

    Raises ``ValueError``, ``IndexError``, ``KeyError``, ``TypeError`` or
    ``OverflowError`` when the script does not have that layout.
    """
    # Locate the array literal itself instead of skipping a fixed number of
    # characters, so a different prefix or a trailing ';' does not break
    # the JSON parsing.
    start, end = script.find("["), script.rfind("]")
    if start == -1 or end < start:
        raise ValueError("no array literal found in embed.js")
    data_array = loads(script[start : end + 1])

    def _byte_array_to_ascii(text: list[int]) -> str:
        """
        Converts a byte array (e.g. [83, 101, 97, 114, 88, 78, 71]) to the ASCII
        string representation (e.g. "SearXNG").
        """
        return "".join(chr(x) for x in text)

    x_sid_header = _byte_array_to_ascii(data_array[3])
    x_sid_value = _byte_array_to_ascii(data_array[4])
    if not (x_sid_header and x_sid_value):
        raise ValueError("empty x-sid header name or value")
    return x_sid_header, x_sid_value


def _obtain_x_sid() -> tuple[str, str]:
    """
    The session ID ("sid") is encoded as a byte array in ``embed.js``.
    It is only valid for exactly one request, so we can't cache it.
    The header key is usually called `x-sid-{UUIDv4}`, and the value is
    usually a plain UUIDv4 (but a different one than in the header key).

    Returns the tuple ``(header name, header value)``.

    Raises ``SearxEngineAPIException`` when ``embed.js`` cannot be fetched or
    when its content cannot be decoded.
    """
    resp = get(f"{api_url}/revcontent/embed.js")
    if not resp.ok:
        raise SearxEngineAPIException("failed to obtain request x-sid token")

    try:
        return _decode_x_sid(resp.text)
    except (ValueError, IndexError, KeyError, TypeError, OverflowError) as exc:
        # Report a changed payload format as an engine API error rather than
        # leaking a raw JSON/index error to the caller.
        raise SearxEngineAPIException("failed to decode request x-sid token") from exc
def request(query: str, params: "OnlineParams") -> None:
    """Fill ``params`` with the URL and headers of the Tusksearch API call.

    ``params["url"]`` is set to ``None`` (and nothing else is done) when the
    requested page is beyond what the configured category supports.
    """
    page = params["pageno"]

    # images don't support pagination, news and videos only support two pages
    last_page = {"images": 1, "news": 2, "videos": 2}.get(tusk_categ)
    if last_page is not None and page > last_page:
        params["url"] = None
        return

    encoded = urlencode(
        {
            "q": query,
            "p": page,
            "l": "center",  # political direction: "left", "center" or "right"
        }
    )
    if tusk_categ != "images":
        # web response also contains news and videos
        params["url"] = f"{api_url}/Search/Web?{encoded}"
    else:
        params["url"] = f"{api_url}/Search/Image?{encoded}"

    sid_name, sid_value = _obtain_x_sid()

    # required - we send a random longitude and latitude instead of the actual user location
    fake_lon = str(random.random() * 90)
    fake_lat = str(random.random() * 90)

    # NOTE(review): this assignment replaces whatever headers were already
    # present in params - confirm that is intended.
    params["headers"] = {
        sid_name: sid_value,
        'x-lon': fake_lon,
        'x-lat': fake_lat,
    }
def _thumbnail_src(result: dict[str, t.Any]) -> str:
    """Return the thumbnail URL of ``result`` or ``""`` if it has none.

    Tolerates a missing ``thumbnail`` key as well as a ``null`` thumbnail.
    """
    return (result.get("thumbnail") or {}).get("src") or ""


def _published_date(result: dict[str, t.Any]):
    """Return the ``age`` field of ``result`` parsed into a ``datetime``.

    Returns ``None`` when the field is absent, empty or cannot be parsed.
    """
    age = result.get("age")
    if not age:
        return None
    try:
        return parser.parse(age)
    except (parser.ParserError, OverflowError, TypeError):
        # a result without a usable date is still worth showing
        return None


def _section_results(json_resp: dict[str, t.Any], section: str) -> list[dict[str, t.Any]]:
    """Return the result list of ``section`` or ``[]`` if the section is absent or ``null``."""
    return (json_resp.get(section) or {}).get("results") or []


def response(resp: "SXNG_Response"):
    """Convert the JSON answer of the Tusksearch API into engine results.

    Which part of the answer is read depends on ``tusk_categ``: the web
    endpoint groups its results under the keys ``web``, ``news`` and
    ``videos``, while the image endpoint returns a flat list.  Optional
    fields (description, thumbnail, age, video duration) may be missing or
    ``null`` without aborting the whole result page.
    """
    res = EngineResults()
    json_resp = resp.json()["results"]

    if tusk_categ == "web":
        for result in _section_results(json_resp, "web"):
            res.add(
                res.types.MainResult(
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    content=html_to_text(result.get("description") or ""),
                    thumbnail=_thumbnail_src(result),
                )
            )

    elif tusk_categ == "news":
        for result in _section_results(json_resp, "news"):
            res.add(
                res.types.MainResult(
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    content=html_to_text(result.get("description") or ""),
                    thumbnail=_thumbnail_src(result),
                    publishedDate=_published_date(result),
                )
            )

    elif tusk_categ == "videos":
        for result in _section_results(json_resp, "videos"):
            res.add(
                res.types.LegacyResult(
                    template="videos.html",
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    content=html_to_text(result.get("description") or ""),
                    thumbnail=_thumbnail_src(result),
                    publishedDate=_published_date(result),
                    length=(result.get("video") or {}).get("duration"),
                )
            )

    elif tusk_categ == "images":
        for result in json_resp or []:
            img_src = result["properties"]["url"]
            res.add(
                res.types.Image(
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    img_src=img_src,
                    # fall back to the full image when no thumbnail is given
                    thumbnail_src=_thumbnail_src(result) or img_src,
                )
            )

    return res