searxng/searx/engines/s1search.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Search engines by System1 (general).

System1 is an advertising company, and provides all its search engines as a
subdomain of ``s1search.co``.  As a result, it has more than 1000 subdomains, of
which some work, and some don't.

Some of the engines get their results from Google, others get them from Yahoo.
"""

import typing as t
from urllib.parse import urlencode, urlparse, parse_qs

from lxml import html

from searx.result_types import EngineResults
from searx.enginelib import EngineCache
from searx.utils import eval_xpath_list, eval_xpath, extract_text

if t.TYPE_CHECKING:
    from searx.search.processors import OnlineParams
    from searx.extended_types import SXNG_Response

# Static engine metadata: results are scraped from the HTML pages of the
# website below; no official API and no API key are used.
about = {
    "website": "https://s1search.co",
    "official_api_documentation": None,
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# Base URL of the search site to query.  It is empty by default and has to be
# configured: init() raises a ValueError while it is empty.  request() appends
# "/serp?..." to this value as is.
# NOTE(review): the "search.gmx.net" hint below does not look like a
# s1search.co subdomain -- confirm that it is a valid value for this engine.
base_url = ""  # alternatively: search.gmx.net
categories = ["general"]

# request() passes the page number in the "page" URL argument.
paging = True

# Declared here without a value; setup() assigns the EngineCache instance.
CACHE: EngineCache
"""Cache to store verification tokens for pagination."""


def init(_):
    """Check the engine configuration.

    The single argument is ignored.

    :raises ValueError: if the module level ``base_url`` is empty.
    """
    if base_url:
        return
    raise ValueError("base_url must be set")


def setup(engine_settings: dict[str, t.Any]) -> bool:
    """Create the module level :py:obj:`CACHE` for this engine.

    The cache is an ``EngineCache`` constructed with the value of the ``name``
    key of ``engine_settings``.

    :return: always ``True``.
    :raises KeyError: if ``engine_settings`` has no ``name`` key.
    """
    global CACHE  # pylint: disable=global-statement
    engine_name = engine_settings["name"]
    CACHE = EngineCache(engine_name)
    return True


def _cache_key(query: str, pageno: int) -> str:
    """Return the cache key for page ``pageno`` of ``query``: ``<query>|<pageno>``."""
    return "|".join((query, str(pageno)))


def request(query: str, params: "OnlineParams"):
    """Set ``params["url"]`` to the URL of the result page to fetch.

    The first page only needs the query and the page number.  Every further
    page additionally needs the ``sc`` token that is looked up in
    :py:obj:`CACHE`; when no token is cached for the requested page,
    ``params["url"]`` is set to ``None`` and nothing else is done.
    """
    pageno = params["pageno"]
    needs_token = pageno > 1
    token = CACHE.get(_cache_key(query, pageno)) if needs_token else None

    # sc is required for pagination to avoid rate-limits
    if needs_token and not token:
        params["url"] = None
        return

    url_args = {"q": query, "page": pageno}
    if token:
        url_args["sc"] = token

    query_string = urlencode(url_args)
    params["url"] = f"{base_url}/serp?{query_string}"


def response(resp: "SXNG_Response") -> EngineResults:
    """Parse a result page into suggestions and main results.

    As a side effect, the ``sc`` tokens found in the pagination links of the
    page are stored in :py:obj:`CACHE` (keyed by query and page number), where
    :py:obj:`request` looks them up to build the URL of further pages.

    Pagination links that carry no usable ``page`` / ``sc`` arguments are
    skipped, so that a single odd link does not discard the results that have
    already been parsed.

    :param resp: response of the request, its ``text`` is the HTML of the
        result page and its ``search_params["query"]`` the search term.
    :return: the suggestions and main results found in the page.
    """
    res = EngineResults()

    doc = html.fromstring(resp.text)

    for suggestion in eval_xpath_list(doc, "//div[@class='aylf-yahoo-bottom' or @class='aylf-yahoo-sidebar']/div"):
        res.add(res.types.LegacyResult({"suggestion": extract_text(suggestion)}))

    for result in eval_xpath_list(
        doc, "//div[contains(@class, 'web-yahoo') or contains(@class, 'web-google')]/div[contains(@class, '__result')]"
    ):
        res.add(
            res.types.MainResult(
                url=extract_text(eval_xpath(result, ".//a[contains(@class, 'title')]/@href")),
                title=extract_text(eval_xpath(result, ".//a[contains(@class, 'title')]")),
                content=extract_text(eval_xpath(result, ".//span[contains(@class, 'description') or @class='']")),
            )
        )

    # store pagination keys to be able to access next pages
    for page_link in eval_xpath_list(doc, "//a[contains(@class, 'pagination__num')]"):
        # href looks like "/serp?q=test&page=2&sc=RVlBPMDPVhWR20"
        href = extract_text(eval_xpath(page_link, "./@href")) or ""
        href_args = parse_qs(urlparse(href).query)
        try:
            pageno = int(href_args["page"][0])
            sc = href_args["sc"][0]
        except (KeyError, ValueError):
            # parse_qs() omits arguments that are missing or blank and the
            # page number may not be numeric: there is no token to store
            continue
        CACHE.set(_cache_key(resp.search_params["query"], pageno), sc)

    return res