From 290d3e0c6a16e943f4a6e6770c1f82474a2041f3 Mon Sep 17 00:00:00 2001 From: Bnyro Date: Sat, 6 Jun 2026 20:55:37 +0200 Subject: [PATCH] [feat] engines: add privacywall engine (#6211) - add https://privacywall.org support - the engine seems to use the Bing index, but not 100% sure - it claims to be privacy friendly, but it's not really by itself [1] [1]: https://discuss.privacyguides.net/t/how-is-privacy-wall-search-engine/29486 --- searx/engines/privacywall.py | 217 +++++++++++++++++++++++++++++++++++ searx/settings.yml | 22 ++++ 2 files changed, 239 insertions(+) create mode 100644 searx/engines/privacywall.py diff --git a/searx/engines/privacywall.py b/searx/engines/privacywall.py new file mode 100644 index 000000000..ac492d71a --- /dev/null +++ b/searx/engines/privacywall.py @@ -0,0 +1,217 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Privacywall_ claims to be a "privacy-friendly" search engine, +but according to a `Privacyguides discussion`_ it's sharing private +user information with Microsoft and Amazon. + +.. _Privacywall : https://www.privacywall.org +.. _`Privacyguides discussion` : https://discuss.privacyguides.net/t/how-is-privacy-wall-search-engine/29486 +""" + +import typing as t +from urllib.parse import urlencode, unquote_plus + +from lxml import html +import babel + +from searx.enginelib.traits import EngineTraits +from searx.utils import eval_xpath_list, eval_xpath, extract_text, get_embeded_stream_url, extr +from searx.locales import region_tag +from searx.result_types import EngineResults + + +if t.TYPE_CHECKING: + from lxml.etree import ElementBase + from searx.extended_types import SXNG_Response + from searx.search.processors import OnlineParams + +about = { + "website": "https://privacywall.org", + "wikidata_id": None, + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": "HTML", +} + +paging = True +safesearch = True +time_range_support = True + +base_url = "https://www.privacywall.org" +privacywall_category = "general" +"""Supported categories are ``general``, ``videos`` and ``images``.""" + + +# corresponds to the "k" query param +safesearch_map = {0: "off", 1: "on", 2: "on"} + +# page number sent for videos (is independent of the query) - certainly there's +# a pattern in this, but for our use case it's enough to just support the first +# 10 pages by hardcoding the page "numbers" +video_page_map = { + 2: "CAoQAA", + 3: "CBQQAA", + 4: "CB4QAA", + 5: "CCgQAA", + 6: "CDIQAA", + 7: "CDwQAA", + 8: "CEYQAA", + 9: "CFAQAA", + 10: "CFoQAA", +} + + +def init(_): + if privacywall_category not in ("general", "images", "videos"): + raise ValueError("invalid category: %s" % privacywall_category) + + +def request(query: str, params: "OnlineParams") -> None: + if params["pageno"] > 10: + params["url"] = None + return + + args = {"q": query, "safesearch": safesearch_map[params["safesearch"]]} + if params["searxng_locale"] != "all": + args["cc"] = traits.get_region(params["searxng_locale"]) or "US" + if params["time_range"]: + # time range uses the same "day", "week", "month", "year" naming scheme as SearXNG + args["time"] = params["time_range"] + + if params["pageno"] > 1: + if privacywall_category == "images": + args["page"] = str(params["pageno"]) + elif privacywall_category == "videos": + args["page"] = video_page_map[params["pageno"]] + else: + raise ValueError("general engine does not support pagination") + + if privacywall_category == "general": + params["url"] = f"{base_url}/search/secure/?{urlencode(args)}" + else: + params["url"] = f"{base_url}/{privacywall_category}/?{urlencode(args)}" + + +def _general_results(doc: "ElementBase") -> EngineResults: + res = EngineResults() + for result in eval_xpath_list(doc, "//div[@id='pw-results-main']/div[contains(@class, 'result-card')]"): + ( + res.add( + res.types.MainResult( + url=extract_text(eval_xpath(result, ".//a[contains(@class, 'result-url-anchor')]/@href")) or "", + title=extract_text(eval_xpath(result, ".//div[contains(@class, 'result_title')]")) or "", + content=extract_text(eval_xpath(result, ".//div[contains(@class, 'result-description')]")) or "", + ), + ) + ) + return res + + +def _extract_thumbnail_url(url: str) -> str: + """ + Get the URL from strings like "/videos/video.php?id=". + """ + url_start = url.find("?id=") + len("?id=") + thumbnail = unquote_plus(url[url_start:]) + return thumbnail + + +def _image_results(doc: "ElementBase") -> EngineResults: + res = EngineResults() + for result in eval_xpath_list(doc, "//div[@id='container']/div[contains(@class, 'imgcontainer')]"): + ( + res.add( + res.types.Image( + url=extract_text(eval_xpath(result, "./a/@href")) or "", + content=extract_text(eval_xpath(result, "./a/@alt")) or "", + thumbnail_src=_extract_thumbnail_url(extract_text(eval_xpath(result, ".//img/@src")) or ""), + source=extract_text(eval_xpath(result, ".//div[contains(@class, 'image-source-badge')]")) or "", + ), + ) + ) + return res + + +def _video_results(doc: "ElementBase") -> EngineResults: + res = EngineResults() + for result in eval_xpath_list( + doc, "//div[contains(@class, 'video-container')]/div[contains(@class, 'video-card')]" + ): + url = extract_text(eval_xpath(result, "./a/@href")) or "" + if not url: + continue + + thumbnail = None + # looks like
+ thumbnail_style = extract_text(eval_xpath(result, ".//div[contains(@class, 'video-img')]/@style")) + if thumbnail_style: + thumbnail = _extract_thumbnail_url(extr(thumbnail_style, ":url(", ")")) + + res.add( + res.types.LegacyResult( + template="videos.html", + url=url, + title=extract_text(eval_xpath(result, ".//h2[contains(@class, 'video-card-title')]")) or "", + content=extract_text(eval_xpath(result, ".//p")) or "", + thumbnail=thumbnail or "", + iframe_src=get_embeded_stream_url(url) or "", + ) + ) + + return res + + +def response(resp: "SXNG_Response") -> EngineResults: + doc = html.fromstring(resp.text) + match privacywall_category: + case "general": + return _general_results(doc) + case "images": + return _image_results(doc) + case "videos": + return _video_results(doc) + case _: + raise ValueError("invalid category: %s" % privacywall_category) + + +def fetch_traits(engine_traits: EngineTraits) -> None: + """Fetch regions from Bing-Web.""" + # pylint: disable=import-outside-toplevel + + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + from searx.utils import gen_useragent + + headers = { + "User-Agent": gen_useragent(), + } + + resp = get(base_url, headers=headers) + if not resp.ok: + raise RuntimeError("Response from Privacywall is not OK.") + + dom = html.fromstring(resp.text) + + # + for onclick_listener in eval_xpath( + dom, "//div[contains(@class, 'lang-menu')]//div[contains(@class, 'dropdown-option')]/@onclick" + ): + # this is either a normal lang-country tag (e.g. cs-cz) or only a country code (e.g. de, at, ...) + country_tag = extr(onclick_listener, "(\"", "\")") + + # the locale tag is only a country tag, so we get languages the from the list of official languages + # of the country + lang_tag: str + for lang_tag in babel.languages.get_official_languages(country_tag, de_facto=True): # pyright: ignore + try: + sxng_tag = region_tag(babel.Locale.parse(f"{lang_tag}_{country_tag.upper()}")) + except babel.UnknownLocaleError: + # silently ignore unknown languages + continue + + conflict = engine_traits.regions.get(sxng_tag) + if conflict: + if conflict != sxng_tag: + print("CONFLICT: babel %s --> %s" % (sxng_tag, conflict)) + continue + + engine_traits.regions[sxng_tag] = country_tag diff --git a/searx/settings.yml b/searx/settings.yml index c567af1c9..ee453b694 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -2081,6 +2081,28 @@ engines: base_url: 'https://discourse.pi-hole.net' disabled: true + - name: privacywall + engine: privacywall + categories: general + privacywall_category: general + paging: false # only images and videos support pagination + shortcut: pw + disabled: true + + - name: privacywall images + engine: privacywall + categories: images + privacywall_category: images + shortcut: pwi + disabled: true + + - name: privacywall videos + engine: privacywall + categories: videos + privacywall_category: videos + shortcut: pwv + disabled: true + # - name: searx # engine: searx_engine # shortcut: se