From 6cee4b8947f6ed6286b54bfdf247efcbf2bf4406 Mon Sep 17 00:00:00 2001 From: Bnyro Date: Mon, 11 May 2026 22:23:06 +0200 Subject: [PATCH] [feat] yep: add support for selecting search language (#6075) --- searx/engines/yep.py | 82 +++++++++++++++++++- searxng_extra/update/update_engine_traits.py | 2 +- 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/searx/engines/yep.py b/searx/engines/yep.py index 37ee458c4..e344b4866 100644 --- a/searx/engines/yep.py +++ b/searx/engines/yep.py @@ -1,14 +1,17 @@ # SPDX-License-Identifier: AGPL-3.0-or-later """Yep (general, images, news)""" +import re + import typing as t from urllib.parse import urlencode from searx.result_types import EngineResults -from searx.utils import html_to_text +from searx.utils import html_to_text, eval_xpath_getindex, extract_text if t.TYPE_CHECKING: + from searx.enginelib.traits import EngineTraits from searx.extended_types import SXNG_Response from searx.search.processors import OnlineParams @@ -29,9 +32,17 @@ enable_http2 = False results_per_page = 20 +_IMPORT_RE = re.compile(r"import\"(.*?)\";") +_LANGUAGE_RE = re.compile(r"\{english:\".*?\",code_string:\"(.*?)\",code:\".*?\"\}") + def request(query: str, params: 'OnlineParams') -> None: args = {'query': query, 'safeSearch': safesearch_map[params['safesearch']], 'limit': results_per_page} + + engine_language: str = traits.get_language(params["searxng_locale"]) + if engine_language: + args["hl"] = engine_language + params['url'] = f"{base_url}/fs/2/search?{urlencode(args)}" params['headers']['Referer'] = 'https://yep.com/' params['headers']['Origin'] = 'https://yep.com' @@ -50,3 +61,72 @@ def response(resp: 'SXNG_Response') -> EngineResults: ) return res + + +def fetch_traits(engine_traits: 'EngineTraits'): + """Fetch :ref:`languages ` and :ref:`regions ` from Yep. + + The language options are very well hidden on Yep. To get it, we have to do the following: + - Load the yep.com mainpage and extract the URL of the JavaScript app + - Load the JavaScript source code and extract the URL of all imported modules from it + - Load the imported modules to search for the right one that contains the languages + """ + + # pylint: disable=import-outside-toplevel, too-many-branches + + from lxml import html + import babel + + from searx.locales import language_tag + from searx.network import get # see https://github.com/searxng/searxng/issues/762 + + from searx.utils import gen_useragent + + web_base_url = "https://yep.com" + + headers = { + "User-Agent": gen_useragent(), + "Referer": f"{web_base_url}/", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "same-origin", + } + + resp = get(web_base_url, headers=headers, timeout=5) + if not resp.ok: + raise RuntimeError("Response from Yep languages is not OK.") + + doc = html.fromstring(resp.text) + url = eval_xpath_getindex(doc, "//script[contains(@src, 'PageApp')]/@src", index=0) + + resp = get("https:" + extract_text(url), headers=headers, timeout=5) + if not resp.ok: + raise RuntimeError("Response from Yep languages is not OK.") + + language_codes = [] + for script_path in _IMPORT_RE.findall(resp.text): + resp = get(f"{web_base_url}{script_path}", headers=headers, timeout=5) + if not resp.ok: + raise RuntimeError("Response from Yep languages is not OK.") + + for match in _LANGUAGE_RE.findall(resp.text): + language_codes.append(match) + + if language_codes: + break + + for language_code in language_codes: + try: + sxng_tag = language_tag(babel.Locale.parse(language_code, sep="-")) + except babel.UnknownLocaleError: + # silently ignore unknown languages + continue + # print("%-20s: %s <-- %s" % (extract_text(option), country_tag, sxng_tag)) + + conflict = engine_traits.languages.get(sxng_tag) + if conflict: + if conflict != sxng_tag: + print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, language_code)) + continue + engine_traits.languages[sxng_tag] = language_code diff --git a/searxng_extra/update/update_engine_traits.py b/searxng_extra/update/update_engine_traits.py index 34e5ae6ab..e955c29c9 100755 --- a/searxng_extra/update/update_engine_traits.py +++ b/searxng_extra/update/update_engine_traits.py @@ -132,7 +132,7 @@ def filter_locales(traits_map: EngineTraitsMap) -> set[str]: """Filter language & region tags by a threshold.""" min_eng_per_region = 18 - min_eng_per_lang = 22 + min_eng_per_lang = 23 _: dict[str, int] = {} for eng in traits_map.values():