[enh] data: traits population

Job failing since October 2025. enh: always raise and reuse data. fix: brave unknown locale. fix: startpage add "brazilian".
2026-05-30 06:34:11 +02:00 · 2026-03-01 11:33:06 +01:00
parent a9f3baefe6
commit 2b03a61832
16 changed files with 946 additions and 864 deletions
@@ -11,40 +11,45 @@ engines:

 """

-import typing as t
-
-import re
 import random
+import re
 import string
 import time
-from urllib.parse import urlencode, unquote
-from lxml import html
+import typing as t
+from urllib.parse import unquote, urlencode
+
 import babel
 import babel.core
 import babel.languages
+from lxml import html

-from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex, gen_gsa_useragent
-from searx.locales import language_tag, region_tag, get_official_locales
-from searx.network import get  # see https://github.com/searxng/searxng/issues/762
-from searx.exceptions import SearxEngineCaptchaException
 from searx.enginelib.traits import EngineTraits
+from searx.exceptions import SearxEngineCaptchaException
+from searx.locales import get_official_locales, language_tag, region_tag
 from searx.result_types import EngineResults
+from searx.utils import (
+    eval_xpath,
+    eval_xpath_getindex,
+    eval_xpath_list,
+    extract_text,
+    gen_gsa_useragent,
+)

 if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams

 about = {
-    "website": 'https://www.google.com',
-    "wikidata_id": 'Q9366',
-    "official_api_documentation": 'https://developers.google.com/custom-search/',
+    "website": "https://www.google.com",
+    "wikidata_id": "Q9366",
+    "official_api_documentation": "https://developers.google.com/custom-search/",
    "use_official_api": False,
    "require_api_key": False,
-    "results": 'HTML',
+    "results": "HTML",
 }

 # engine dependent config
-categories = ['general', 'web']
+categories = ["general", "web"]
 paging = True
 max_page = 50
 """`Google max 50 pages`_
@@ -54,10 +59,10 @@ max_page = 50
 time_range_support = True
 safesearch = True

-time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
+time_range_dict = {"day": "d", "week": "w", "month": "m", "year": "y"}

 # Filter results. 0: None, 1: Moderate, 2: Strict
-filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
+filter_mapping = {0: "off", 1: "medium", 2: "high"}

 # specific xpath variables
 # ------------------------
@@ -87,7 +92,7 @@ def ui_async(start: int) -> str:

    # create a new random arc_id every hour
    if not _arcid_random or (int(time.time()) - _arcid_random[1]) > 3600:
-        _arcid_random = (''.join(random.choices(_arcid_range, k=23)), int(time.time()))
+        _arcid_random = ("".join(random.choices(_arcid_range, k=23)), int(time.time()))
    arc_id = f"arc_id:srp_{_arcid_random[0]}_1{start:02}"

    return ",".join([arc_id, use_ac, _fmt])
@@ -149,23 +154,23 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
    """

    ret_val: dict[str, t.Any] = {
-        'language': None,
-        'country': None,
-        'subdomain': None,
-        'params': {},
-        'headers': {},
-        'cookies': {},
-        'locale': None,
+        "language": None,
+        "country": None,
+        "subdomain": None,
+        "params": {},
+        "headers": {},
+        "cookies": {},
+        "locale": None,
    }

-    sxng_locale = params.get('searxng_locale', 'all')
+    sxng_locale = params.get("searxng_locale", "all")
    try:
-        locale = babel.Locale.parse(sxng_locale, sep='-')
+        locale = babel.Locale.parse(sxng_locale, sep="-")
    except babel.core.UnknownLocaleError:
        locale = None

-    eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
-    lang_code = eng_lang.split('_')[-1]  # lang_zh-TW --> zh-TW / lang_en --> en
+    eng_lang = eng_traits.get_language(sxng_locale, "lang_en")
+    lang_code = eng_lang.split("_")[-1]  # lang_zh-TW --> zh-TW / lang_en --> en
    country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)

    # Test zh_hans & zh_hant --> in the topmost links in the result list of list
@@ -176,10 +181,10 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
    # '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
    # '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5

-    ret_val['language'] = eng_lang
-    ret_val['country'] = country
-    ret_val['locale'] = locale
-    ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
+    ret_val["language"] = eng_lang
+    ret_val["country"] = country
+    ret_val["locale"] = locale
+    ret_val["subdomain"] = eng_traits.custom["supported_domains"].get(country.upper(), "www.google.com")

    # hl parameter:
    #   The hl parameter specifies the interface language (host language) of
@@ -191,7 +196,7 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
    #   https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages

    # https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
-    ret_val['params']['hl'] = f'{lang_code}-{country}'
+    ret_val["params"]["hl"] = f"{lang_code}-{country}"

    # lr parameter:
    #   The lr (language restrict) parameter restricts search results to
@@ -207,9 +212,9 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
    # By example: &lr=lang_zh-TW%7Clang_de selects articles written in
    # traditional chinese OR german language.

-    ret_val['params']['lr'] = eng_lang
-    if sxng_locale == 'all':
-        ret_val['params']['lr'] = ''
+    ret_val["params"]["lr"] = eng_lang
+    if sxng_locale == "all":
+        ret_val["params"]["lr"] = ""

    # cr parameter:
    #   The cr parameter restricts search results to documents originating in a
@@ -218,9 +223,9 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st

    # specify a region (country) only if a region is given in the selected
    # locale --> https://github.com/searxng/searxng/issues/2672
-    ret_val['params']['cr'] = ''
-    if len(sxng_locale.split('-')) > 1:
-        ret_val['params']['cr'] = 'country' + country
+    ret_val["params"]["cr"] = ""
+    if len(sxng_locale.split("-")) > 1:
+        ret_val["params"]["cr"] = "country" + country

    # gl parameter: (mandatory by Google News)
    #   The gl parameter value is a two-letter country code. For WebSearch
@@ -241,14 +246,14 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
    #   to interpret the query string. The default ie value is latin1.
    #   https://developers.google.com/custom-search/docs/xml_results#iesp

-    ret_val['params']['ie'] = 'utf8'
+    ret_val["params"]["ie"] = "utf8"

    # oe parameter:
    #   The oe parameter sets the character encoding scheme that should be used
    #   to decode the XML result. The default oe value is latin1.
    #   https://developers.google.com/custom-search/docs/xml_results#oesp

-    ret_val['params']['oe'] = 'utf8'
+    ret_val["params"]["oe"] = "utf8"

    # num parameter:
    #   The num parameter identifies the number of search results to return.
@@ -261,43 +266,43 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st

    # HTTP headers

-    ret_val['headers']['Accept'] = '*/*'
-    ret_val['headers']['User-Agent'] = gen_gsa_useragent()
+    ret_val["headers"]["Accept"] = "*/*"
+    ret_val["headers"]["User-Agent"] = gen_gsa_useragent()

    # Cookies

    # - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
    # - https://github.com/searxng/searxng/issues/1555
-    ret_val['cookies']['CONSENT'] = "YES+"
+    ret_val["cookies"]["CONSENT"] = "YES+"

    return ret_val


 def detect_google_sorry(resp):
-    if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
+    if resp.url.host == "sorry.google.com" or resp.url.path.startswith("/sorry"):
        raise SearxEngineCaptchaException()


 def request(query: str, params: "OnlineParams") -> None:
    """Google search request"""
    # pylint: disable=line-too-long
-    start = (params['pageno'] - 1) * 10
+    start = (params["pageno"] - 1) * 10
    str_async = ui_async(start)
    google_info = get_google_info(params, traits)
    logger.debug("ARC_ID: %s", str_async)

    # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
    query_url = (
-        'https://'
-        + google_info['subdomain']
-        + '/search'
+        "https://"
+        + google_info["subdomain"]
+        + "/search"
        + "?"
        + urlencode(
            {
-                'q': query,
-                **google_info['params'],
-                'filter': '0',
-                'start': start,
+                "q": query,
+                **google_info["params"],
+                "filter": "0",
+                "start": start,
                # 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
                # 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
                # 'cs' : 1,
@@ -308,20 +313,20 @@ def request(query: str, params: "OnlineParams") -> None:
                # 'sa': 'N',
                # 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
                # formally known as use_mobile_ui
-                'asearch': 'arc',
-                'async': str_async,
+                "asearch": "arc",
+                "async": str_async,
            }
        )
    )

-    if params['time_range'] in time_range_dict:
-        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
-    if params['safesearch']:
-        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
-    params['url'] = query_url
+    if params["time_range"] in time_range_dict:
+        query_url += "&" + urlencode({"tbs": "qdr:" + time_range_dict[params["time_range"]]})
+    if params["safesearch"]:
+        query_url += "&" + urlencode({"safe": filter_mapping[params["safesearch"]]})
+    params["url"] = query_url

-    params['cookies'] = google_info['cookies']
-    params['headers'].update(google_info['headers'])
+    params["cookies"] = google_info["cookies"]
+    params["headers"].update(google_info["headers"])


 # =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
@@ -334,14 +339,14 @@ def parse_data_images(text: str):
    data_image_map = {}

    for img_id, data_image in RE_DATA_IMAGE.findall(text):
-        end_pos = data_image.rfind('=')
+        end_pos = data_image.rfind("=")
        if end_pos > 0:
            data_image = data_image[: end_pos + 1]
        data_image_map[img_id] = data_image
    last = RE_DATA_IMAGE_end.search(text)
    if last:
        data_image_map[last.group(1)] = last.group(2)
-    logger.debug('data:image objects --> %s', list(data_image_map.keys()))
+    logger.debug("data:image objects --> %s", list(data_image_map.keys()))
    return data_image_map


@@ -365,15 +370,18 @@ def response(resp: "SXNG_Response"):
            title_tag = eval_xpath_getindex(result, './/div[contains(@role, "link")]', 0, default=None)
            if title_tag is None:
                # this not one of the common google results *section*
-                logger.debug('ignoring item from the result_xpath list: missing title')
+                logger.debug("ignoring item from the result_xpath list: missing title")
                continue
            title = extract_text(title_tag)

-            raw_url = eval_xpath_getindex(result, './/a/@href', 0, None)
+            raw_url = eval_xpath_getindex(result, ".//a/@href", 0, None)
            if raw_url is None:
-                logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
+                logger.debug(
+                    'ignoring item from the result_xpath list: missing url of title "%s"',
+                    title,
+                )
                continue
-            url = unquote(raw_url[7:].split('&sa=U')[0])  # remove the google redirector
+            url = unquote(raw_url[7:].split("&sa=U")[0])  # remove the google redirector

            content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
            for item in content_nodes:
@@ -383,20 +391,23 @@ def response(resp: "SXNG_Response"):
            content = extract_text(content_nodes)

            if not content:
-                logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
+                logger.debug(
+                    'ignoring item from the result_xpath list: missing content of title "%s"',
+                    title,
+                )
                continue

-            thumbnail = content_nodes[0].xpath('.//img/@src')
+            thumbnail = content_nodes[0].xpath(".//img/@src")
            if thumbnail:
                thumbnail = thumbnail[0]
-                if thumbnail.startswith('data:image'):
-                    img_id = content_nodes[0].xpath('.//img/@id')
+                if thumbnail.startswith("data:image"):
+                    img_id = content_nodes[0].xpath(".//img/@id")
                    if img_id:
                        thumbnail = data_image_map.get(img_id[0])
            else:
                thumbnail = None

-            results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail})
+            results.append({"url": url, "title": title, "content": content, "thumbnail": thumbnail})

        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
@@ -405,7 +416,7 @@ def response(resp: "SXNG_Response"):
    # parse suggestion
    for suggestion in eval_xpath_list(dom, suggestion_xpath):
        # append suggestion
-        results.append({'suggestion': extract_text(suggestion)})
+        results.append({"suggestion": extract_text(suggestion)})

    # return results
    return results
@@ -416,27 +427,27 @@ def response(resp: "SXNG_Response"):

 skip_countries = [
    # official language of google-country not in google-languages
-    'AL',  # Albanien (sq)
-    'AZ',  # Aserbaidschan  (az)
-    'BD',  # Bangladesch (bn)
-    'BN',  # Brunei Darussalam (ms)
-    'BT',  # Bhutan (dz)
-    'ET',  # Äthiopien (am)
-    'GE',  # Georgien (ka, os)
-    'GL',  # Grönland (kl)
-    'KH',  # Kambodscha (km)
-    'LA',  # Laos (lo)
-    'LK',  # Sri Lanka (si, ta)
-    'ME',  # Montenegro (sr)
-    'MK',  # Nordmazedonien (mk, sq)
-    'MM',  # Myanmar (my)
-    'MN',  # Mongolei (mn)
-    'MV',  # Malediven (dv) // dv_MV is unknown by babel
-    'MY',  # Malaysia (ms)
-    'NP',  # Nepal (ne)
-    'TJ',  # Tadschikistan (tg)
-    'TM',  # Turkmenistan (tk)
-    'UZ',  # Usbekistan (uz)
+    "AL",  # Albanien (sq)
+    "AZ",  # Aserbaidschan  (az)
+    "BD",  # Bangladesch (bn)
+    "BN",  # Brunei Darussalam (ms)
+    "BT",  # Bhutan (dz)
+    "ET",  # Äthiopien (am)
+    "GE",  # Georgien (ka, os)
+    "GL",  # Grönland (kl)
+    "KH",  # Kambodscha (km)
+    "LA",  # Laos (lo)
+    "LK",  # Sri Lanka (si, ta)
+    "ME",  # Montenegro (sr)
+    "MK",  # Nordmazedonien (mk, sq)
+    "MM",  # Myanmar (my)
+    "MN",  # Mongolei (mn)
+    "MV",  # Malediven (dv) // dv_MV is unknown by babel
+    "MY",  # Malaysia (ms)
+    "NP",  # Nepal (ne)
+    "TJ",  # Tadschikistan (tg)
+    "TM",  # Turkmenistan (tk)
+    "UZ",  # Usbekistan (uz)
 ]


@@ -444,21 +455,23 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
    """Fetch languages from Google."""
    # pylint: disable=import-outside-toplevel, too-many-branches

-    engine_traits.custom['supported_domains'] = {}
+    from searx.network import get  # see https://github.com/searxng/searxng/issues/762

-    resp = get('https://www.google.com/preferences')
-    if not resp.ok:  # type: ignore
-        raise RuntimeError("Response from Google's preferences is not OK.")
+    engine_traits.custom["supported_domains"] = {}

-    dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
+    resp = get("https://www.google.com/preferences", timeout=5)
+    if not resp.ok:
+        raise RuntimeError("Response from Google preferences is not OK.")
+
+    dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ""))

    # supported language codes

-    lang_map = {'no': 'nb'}
+    lang_map = {"no": "nb"}
    for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
        eng_lang = x.get("value")
        try:
-            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
+            locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep="-")
        except babel.UnknownLocaleError:
            print("INFO:  google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
            continue
@@ -469,10 +482,10 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
            continue
-        engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
+        engine_traits.languages[sxng_lang] = "lang_" + eng_lang

    # alias languages
-    engine_traits.languages['zh'] = 'lang_zh-CN'
+    engine_traits.languages["zh"] = "lang_zh-CN"

    # supported region codes

@@ -481,37 +494,37 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):

        if eng_country in skip_countries:
            continue
-        if eng_country == 'ZZ':
-            engine_traits.all_locale = 'ZZ'
+        if eng_country == "ZZ":
+            engine_traits.all_locale = "ZZ"
            continue

        sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)

        if not sxng_locales:
-            print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
+            print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get("data-name"), eng_country))
            continue

        for sxng_locale in sxng_locales:
            engine_traits.regions[region_tag(sxng_locale)] = eng_country

    # alias regions
-    engine_traits.regions['zh-CN'] = 'HK'
+    engine_traits.regions["zh-CN"] = "HK"

    # supported domains

    if add_domains:
-        resp = get('https://www.google.com/supported_domains')
-        if not resp.ok:  # type: ignore
-            raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
+        resp = get("https://www.google.com/supported_domains", timeout=5)
+        if not resp.ok:
+            raise RuntimeError("Response from Google supported domains is not OK.")

-        for domain in resp.text.split():  # type: ignore
+        for domain in resp.text.split():
            domain = domain.strip()
            if not domain or domain in [
-                '.google.com',
+                ".google.com",
            ]:
                continue
-            region = domain.split('.')[-1].upper()
-            engine_traits.custom['supported_domains'][region] = 'www' + domain  # type: ignore
-            if region == 'HK':
+            region = domain.split(".")[-1].upper()
+            engine_traits.custom["supported_domains"][region] = "www" + domain
+            if region == "HK":
                # There is no google.cn, we use .com.hk for zh-CN
-                engine_traits.custom['supported_domains']['CN'] = 'www' + domain  # type: ignore
+                engine_traits.custom["supported_domains"]["CN"] = "www" + domain