Files
searxng/searx/engines/swisscows_extra.py
T

194 lines
6.1 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=invalid-name
"""Swisscows (images, videos)"""
import base64
import codecs
import hashlib
import json
import random
from datetime import datetime
from urllib.parse import urlencode
import typing as t
from searx.result_types import EngineResults
from searx.utils import humanize_number, html_to_text
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Engine metadata describing the upstream service.
about = {
    "website": "https://swisscows.com",
    "wikidata_id": "Q22937452",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

# Engine settings (presumably overridable per engine instance -- confirm
# against the engine loader).
categories = ["videos"]
swisscows_category = "videos"  # possible: "videos", "images"
paging = True
# NOTE(review): init() rejects results_per_page > 10 when swisscows_category is
# "videos", so the defaults in this module (videos / 50) fail that validation
# unless one of them is overridden elsewhere -- confirm this is intended.
results_per_page = 50
base_url = "https://api.swisscows.com"

# Alphabet that the Caesar shift used for request signing rotates over.
CAESAR_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Characters a nonce may consist of: upper- and lowercase ASCII letters,
# decimal digits and the four punctuation characters "-", ".", "_" and "~".
NONCE_ALPHABET = CAESAR_ALPHABET + CAESAR_ALPHABET.lower() + "0123456789" + "-._~"
def generate_nonce(length: int = 32) -> str:
    """
    Build a random string of ``length`` characters, each one picked
    independently from :py:obj:`NONCE_ALPHABET`.

    The characters come from the :py:mod:`random` module.  A ``length`` of zero
    or less yields an empty string.
    """
    picks = (random.choice(NONCE_ALPHABET) for _ in range(length))
    return "".join(picks)
def caesar_shift_with_switch_case(s: str, offset: int = 13) -> str:
    """
    Caesar shift by :py:obj:`offset` that additionally inverts the casing of the
    shifted letters (i.e. from lowercase to uppercase and vice versa).

    Only the ASCII letters ``A-Z`` / ``a-z`` are shifted; every other character
    (digits, punctuation, non-ASCII letters) is copied to the output unchanged.

    :param s: text to transform
    :param offset: number of positions to rotate within
        :py:obj:`CAESAR_ALPHABET`; negative and large values wrap around
    :return: the transformed text, same length as ``s``
    """
    transformed: list[str] = []
    for c in s:
        # Restrict the shift to single ASCII letters.  Testing membership via
        # ``c.upper() in CAESAR_ALPHABET`` is a substring test and misfires for
        # characters whose uppercase form is an ASCII string: e.g. U+FB06
        # upper-cases to "ST" (and then breaks ``ord``), U+0131 to "I".
        if not (c.isascii() and c.isalpha()):
            transformed.append(c)
            continue

        alphabet_index = ord(c.upper()) - ord("A")
        shifted = CAESAR_ALPHABET[(alphabet_index + offset) % len(CAESAR_ALPHABET)]
        # CAESAR_ALPHABET is uppercase, so lower-casing flips an uppercase input
        # and leaving it as is flips a lowercase input.
        transformed.append(shifted.lower() if c.isupper() else shifted.upper())
    return "".join(transformed)
def sha256_hash_b64_url(s: str) -> str:
    """
    Calculate the SHA256 hash of ``s`` and return it base64 URL-encoded.

    :param s: text to hash; it is encoded as UTF-8 before hashing
    :return: the 32 byte digest in the URL-safe base64 alphabet (``-`` and
        ``_`` instead of ``+`` and ``/``) with the ``=`` padding removed
    """
    digest = hashlib.sha256(s.encode()).digest()
    # urlsafe_b64encode already uses the "-" / "_" alphabet and emits no line
    # breaks, so only the trailing padding has to be removed.
    return base64.urlsafe_b64encode(digest).decode("ascii").rstrip("=")
def generate_nonce_and_signature(url_path: str) -> tuple[str, str]:
    """
    Create the values for the "X-Request-Nonce" and "X-Request-Signature"
    headers that are sent along with Swisscows API requests (reverse engineered
    from their official website).

    The signature is the base64 URL-encoded SHA256 hash of ``url_path``
    followed by the nonce after a Caesar shift of 13 with inverted casing.

    :param url_path: path and query string of the request being signed
    :return: tuple ``(nonce, signature)``
    """
    nonce = generate_nonce()
    signed_text = url_path + caesar_shift_with_switch_case(nonce, 13)
    return nonce, sha256_hash_b64_url(signed_text)
def init(_):
    """
    Validate the engine configuration.

    The single argument is ignored.

    :raises ValueError: if ``swisscows_category`` is neither "videos" nor
        "images", or if it is "videos" while ``results_per_page`` exceeds 10
    """
    supported_categories = ("videos", "images")
    if swisscows_category not in supported_categories:
        raise ValueError("illegal swisscows category: %s" % swisscows_category)

    # the page size limit is only enforced for videos
    if swisscows_category != "videos":
        return
    if results_per_page > 10:
        raise ValueError("results_per_page for swisscows videos can be at most 10")
def request(query: str, params: "OnlineParams") -> None:
    """
    Fill ``params`` with the URL and the signature headers of a Swisscows
    search request for the configured ``swisscows_category``.

    :param query: the search terms
    :param params: request parameters; ``params["pageno"]`` is read,
        ``params["url"]`` is set and ``params["headers"]`` is extended in place

    For images, pages beyond the second one are not requested:
    ``params["url"]`` is set to ``None`` and nothing else is changed.
    """
    page = params["pageno"]
    wants_images = swisscows_category == "images"

    # swisscows images only supports 2 pages
    if wants_images and page > 2:
        params["url"] = None
        return

    # images and videos use different endpoints and a different name for the
    # language parameter, everything else is shared
    if wants_images:
        endpoint = "/v5/images/search"
        language_key = "locale"
    else:
        endpoint = "/v2/videos/search"
        language_key = "region"

    args = {
        "itemsCount": results_per_page,
        "offset": (page - 1) * results_per_page,
        "query": query,
        "spellcheck": True,
        language_key: "en-US",
    }
    # the keys have to be sorted in alphabetic order,
    # otherwise the generated signature won't be accepted!
    url_path = f"{endpoint}?{urlencode(sorted(args.items()))}"

    nonce, signature = generate_nonce_and_signature(url_path)
    signature_headers = {
        "X-Request-Nonce": nonce,
        "X-Request-Signature": signature,
    }
    params["headers"].update(signature_headers)
    params["url"] = base_url + url_path
def response(resp: "SXNG_Response"):
    """
    Convert the JSON body of a Swisscows response into engine results.

    When the body has a "payload" key, its value is treated as three
    "."-separated parts whose middle part is URL-safe base64 encoded JSON; that
    decoded JSON replaces the body.  Each entry of ``items`` then becomes one
    image or one video result, depending on ``swisscows_category``.

    :param resp: the HTTP response; only ``resp.json()`` is used
    :return: the collected results
    """
    results = EngineResults()
    data = resp.json()

    # only appears to be the case for images, for videos the data doesn't seem to be encoded
    # payload is encoded as a JSON web token -> 3 parts, separated by "."
    # the actual data is in the center of the encoded string
    if "payload" in data:
        token_body = data["payload"].split(".")[1]
        # pad with '=' to be valid base64; note that an already aligned string
        # gets four extra '=' appended by this expression
        token_body = token_body + '=' * (4 - len(token_body) % 4)
        data = json.loads(base64.urlsafe_b64decode(token_body).decode())

    wants_images = swisscows_category == "images"
    for item in data["items"]:
        if wants_images:
            fields = {
                "template": "images.html",
                "url": item["url"],
                "thumbnail_src": item["thumbnail"]["url"],
                "img_src": item["contentUrl"],
                "title": item["name"],
            }
        else:
            # an empty / missing date value results in no publication date
            raw_date = item["datePublished"]
            published_date = datetime.fromisoformat(raw_date) if raw_date else None
            fields = {
                "template": "videos.html",
                "url": item["url"],
                "title": html_to_text(item["title"]),
                "content": item["description"],
                "thumbnail": item["thumbnailUrl"],
                "length": item["duration"],
                "iframe_src": item["embedUrl"],
                "publishedDate": published_date,
                "views": humanize_number(item["viewCount"]),
            }
        results.add(results.types.LegacyResult(fields))

    return results