mirror of
https://github.com/searxng/searxng.git
synced 2026-06-22 17:48:33 +02:00
[feat] engines: add tusksearch (web, news, videos, images) (#6267)
The code that reads the value of variable `x` from `embed.js`, decodes
it to ASCII and based on that sets `window["tuskheader"]` and `window["tuskkey"]`
is attached below. The only practical way to understand what it does is
to step through it with a debugger; otherwise it is almost hopeless.
```js
function fe() {
const B = pe => pe.map(_e => String.fromCharCode(_e)).join(''),
ae = window,
o = ae.x;
if (o?.length) {
const pe = o.length / 2;
for (let _e = 0; _e < pe; _e++) ae[B(o[_e])] = B(o[pe + _e]);
ae.x = void 0
}
}
```
Minimal script for testing the engine:
```py
import random
from json import loads
import requests
resp = requests.get("https://api.tusksearch.com/revcontent/embed.js")
data = loads(resp.text[6:])
def _decode(text: list[int]) -> str:
return "".join([chr(x) for x in text])
header = _decode(data[3])
value = _decode(data[4])
resp = requests.get(
"https://api.tusksearch.com/Search/Web?q=test&p=1&l=center&nextArgs=&prevArgs=",
# "https://api.tusksearch.com/Search/Image?q=test&p=1&l=center",
headers={
header: value,
'x-lon': str(random.random() * 90),
'x-lat': str(random.random() * 90),
},
)
print(resp.text)
```
This commit is contained in:
@@ -0,0 +1,162 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Tusksearch_ is an American search engine that claims to fight censorship.
|
||||
Its search results are (at least partially) from Brave.
|
||||
|
||||
.. _Tusksearch: https://tusksearch.com/about
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
import random
|
||||
import typing as t
|
||||
from urllib.parse import urlencode
|
||||
from dateutil import parser
|
||||
|
||||
from searx.exceptions import SearxEngineAPIException
|
||||
from searx.network import get
|
||||
from searx.utils import html_to_text
|
||||
from searx.result_types import EngineResults
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from searx.extended_types import SXNG_Response
|
||||
from searx.search.processors import OnlineParams
|
||||
|
||||
# Engine metadata: no Wikidata entry and no official API documentation are
# recorded; the unofficial JSON endpoints are queried without an API key.
about = {
    "website": "https://tusksearch.com",
    "wikidata_id": None,
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

# The page number is forwarded to the API as the ``p`` query argument.
paging = True

categories = ["general"]
tusk_categ = "web"
"""Category to search in. Can be either "web", "images", "videos" or "news"."""

# Base URL shared by the search endpoints and the ``embed.js`` token script.
api_url = "https://api.tusksearch.com"
|
||||
|
||||
|
||||
def init(_):
    """Validate the configured ``tusk_categ``.

    :raises ValueError: if ``tusk_categ`` is not one of ``"web"``,
        ``"images"``, ``"videos"`` or ``"news"``.
    """
    supported = ("web", "images", "videos", "news")
    if tusk_categ in supported:
        return
    raise ValueError("invalid search type: %s" % tusk_categ)
|
||||
|
||||
|
||||
def _obtain_x_sid() -> tuple[str, str]:
    """Fetch a fresh session header (name and value) from ``embed.js``.

    The session ID ("sid") is encoded as a byte array in ``embed.js``.
    It is only valid for exactly one request, so we can't cache it.

    The header key is usually called `x-sid-{UUIDv4}`, and the value is
    usually a plain UUIDv4 (but a different one than in the header key).

    :return: tuple ``(header_name, header_value)`` to send with the search
        request.
    :raises SearxEngineAPIException: if ``embed.js`` can't be downloaded or
        its payload doesn't have the expected structure.
    """
    resp = get(f"{api_url}/revcontent/embed.js")
    if not resp.ok:
        raise SearxEngineAPIException("failed to obtain request x-sid token")

    def _byte_array_to_ascii(text: list[int]) -> str:
        """
        Converts a byte array (e.g. [83, 101, 97, 114, 88, 78, 71]) to the ASCII
        string representation (e.g. "SearXNG").
        """
        return "".join(chr(x) for x in text)

    try:
        # data is prefixed by 'var x='
        data_array = loads(resp.text[6:])
        # entries 3 and 4 hold the header name and the header value
        x_sid_header = _byte_array_to_ascii(data_array[3])
        x_sid_value = _byte_array_to_ascii(data_array[4])
    except (ValueError, TypeError, IndexError, KeyError) as exc:
        # A changed/invalid payload is an upstream API problem; report it the
        # same way as a failed download instead of leaking a parsing error.
        raise SearxEngineAPIException("failed to decode request x-sid token") from exc

    return x_sid_header, x_sid_value
|
||||
|
||||
|
||||
def request(query: str, params: "OnlineParams") -> None:
    """Fill ``params`` with the URL and headers for a Tusksearch query.

    ``params["url"]`` is set to ``None`` (and nothing else is done) when the
    requested page is beyond what the configured category supports.
    """
    page = params["pageno"]

    # images don't support pagination, news and videos only support two pages
    if tusk_categ == "images":
        out_of_range = page > 1
    elif tusk_categ in ("news", "videos"):
        out_of_range = page > 2
    else:
        out_of_range = False

    if out_of_range:
        params["url"] = None
        return

    query_string = urlencode(
        {
            "q": query,
            "p": page,
            "l": "center",  # political direction: "left", "center" or "right"
        }
    )
    if tusk_categ != "images":
        # web response also contains news and videos
        params["url"] = f"{api_url}/Search/Web?{query_string}"
    else:
        params["url"] = f"{api_url}/Search/Image?{query_string}"

    sid_name, sid_token = _obtain_x_sid()
    params["headers"] = {
        sid_name: sid_token,
        # required - we send a random longitude and latitude instead of the actual user location
        "x-lon": str(random.random() * 90),
        "x-lat": str(random.random() * 90),
    }
|
||||
|
||||
|
||||
def _thumbnail_src(result: dict[str, t.Any]) -> str:
    """Return the thumbnail URL of *result*, or ``""`` if it has none."""
    return (result.get("thumbnail") or {}).get("src") or ""


def _published_date(result: dict[str, t.Any]):
    """Parse the ``age`` field of *result*; ``None`` if missing or unparsable."""
    age = result.get("age")
    if not age:
        return None
    try:
        return parser.parse(age)
    except (parser.ParserError, OverflowError, TypeError):
        # best effort: a result without a usable date is still a valid result
        return None


def _description(result: dict[str, t.Any]) -> str:
    """Return the plain-text description of *result* (``""`` if absent)."""
    return html_to_text(result.get("description") or "")


def response(resp: "SXNG_Response"):
    """Convert the JSON answer of Tusksearch into engine results.

    Which part of the answer is read depends on ``tusk_categ``: the web
    endpoint returns an object with ``web``, ``news`` and ``videos`` sections,
    while the image endpoint returns a plain list of results.

    Optional fields (thumbnail, date, description, video metadata) that are
    missing or ``null`` are tolerated in every category.

    :return: the :py:obj:`EngineResults` collected from the response.
    """
    res = EngineResults()

    json_resp = resp.json()["results"]

    if tusk_categ == "web":
        for result in (json_resp.get("web") or {}).get("results") or []:
            res.add(
                res.types.MainResult(
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    content=_description(result),
                    thumbnail=_thumbnail_src(result),
                )
            )
    elif tusk_categ == "news":
        for result in (json_resp.get("news") or {}).get("results") or []:
            res.add(
                res.types.MainResult(
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    content=_description(result),
                    thumbnail=_thumbnail_src(result),
                    publishedDate=_published_date(result),
                )
            )
    elif tusk_categ == "videos":
        for result in (json_resp.get("videos") or {}).get("results") or []:
            res.add(
                res.types.LegacyResult(
                    template="videos.html",
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    content=_description(result),
                    thumbnail=_thumbnail_src(result),
                    publishedDate=_published_date(result),
                    length=(result.get("video") or {}).get("duration"),
                )
            )
    elif tusk_categ == "images":
        # the image endpoint returns the result list directly
        for result in json_resp or []:
            res.add(
                res.types.Image(
                    url=result["url"],
                    title=html_to_text(result["title"]),
                    img_src=result["properties"]["url"],
                    thumbnail_src=_thumbnail_src(result),
                )
            )

    return res
|
||||
@@ -2383,6 +2383,35 @@ engines:
|
||||
- 5000
|
||||
inactive: true
|
||||
|
||||
- name: tusksearch
|
||||
engine: tusksearch
|
||||
shortcut: tu
|
||||
tusk_categ: web
|
||||
categories: general
|
||||
disabled: true
|
||||
|
||||
- name: tusksearch images
|
||||
engine: tusksearch
|
||||
shortcut: tui
|
||||
paging: false
|
||||
tusk_categ: images
|
||||
categories: images
|
||||
disabled: true
|
||||
|
||||
- name: tusksearch videos
|
||||
engine: tusksearch
|
||||
shortcut: tuv
|
||||
tusk_categ: videos
|
||||
categories: videos
|
||||
disabled: true
|
||||
|
||||
- name: tusksearch news
|
||||
engine: tusksearch
|
||||
shortcut: tun
|
||||
tusk_categ: news
|
||||
categories: news
|
||||
disabled: true
|
||||
|
||||
# tmp suspended - too slow, too many errors
|
||||
# - name: urbandictionary
|
||||
# engine : xpath
|
||||
|
||||
Reference in New Issue
Block a user