[enh] data: traits population

Job failing since October 2025. enh: always raise and reuse data; fix: brave unknown locale; fix: startpage add "brazilian"
2026-06-05 17:37:18 +02:00 · 2026-03-01 11:33:06 +01:00
parent a9f3baefe6
commit 2b03a61832
16 changed files with 946 additions and 864 deletions
@@ -45,19 +45,19 @@ from datetime import (
 )
 from json import loads
 from urllib.parse import urlencode
-from flask_babel import gettext
+
 import babel
 import lxml
+from flask_babel import gettext

+from searx.enginelib.traits import EngineTraits
 from searx.exceptions import (
-    SearxEngineAPIException,
-    SearxEngineTooManyRequestsException,
-    SearxEngineCaptchaException,
    SearxEngineAccessDeniedException,
+    SearxEngineAPIException,
+    SearxEngineCaptchaException,
+    SearxEngineTooManyRequestsException,
 )
 from searx.network import raise_for_httperror
-from searx.enginelib.traits import EngineTraits
-
 from searx.utils import (
    eval_xpath,
    eval_xpath_list,
@@ -67,12 +67,12 @@ from searx.utils import (

 # about
 about = {
-    "website": 'https://www.qwant.com/',
-    "wikidata_id": 'Q14657870',
+    "website": "https://www.qwant.com/",
+    "wikidata_id": "Q14657870",
    "official_api_documentation": None,
    "use_official_api": True,
    "require_api_key": False,
-    "results": 'JSON',
+    "results": "JSON",
 }

 # engine dependent config
@@ -100,10 +100,10 @@ qwant_news_locales = [

 # search-url

-api_url = 'https://api.qwant.com/v3/search/'
+api_url = "https://api.qwant.com/v3/search/"
 """URL of Qwant's API (JSON)"""

-web_lite_url = 'https://lite.qwant.com/'
+web_lite_url = "https://lite.qwant.com/"
 """URL of Qwant-Lite (HTML)"""


@@ -113,47 +113,44 @@ def request(query, params):
    if not query:
        return None

-    q_locale = traits.get_region(params["searxng_locale"], default='en_US')
+    q_locale = traits.get_region(params["searxng_locale"], default="en_US")

-    url = api_url + f'{qwant_categ}?'
-    args = {'q': query}
-    params['raise_for_httperror'] = False
+    url = api_url + f"{qwant_categ}?"
+    args = {"q": query}
+    params["raise_for_httperror"] = False

-    if qwant_categ == 'web-lite':
+    if qwant_categ == "web-lite":
+        url = web_lite_url + "?"
+        args["locale"] = q_locale.lower()
+        args["l"] = q_locale.split("_")[0]
+        args["s"] = params["safesearch"]
+        args["p"] = params["pageno"]

-        url = web_lite_url + '?'
-        args['locale'] = q_locale.lower()
-        args['l'] = q_locale.split('_')[0]
-        args['s'] = params['safesearch']
-        args['p'] = params['pageno']
+        params["raise_for_httperror"] = True

-        params['raise_for_httperror'] = True
-
-    elif qwant_categ == 'images':
-
-        args['count'] = 50
-        args['locale'] = q_locale
-        args['safesearch'] = params['safesearch']
-        args['tgp'] = 3
-        args['offset'] = (params['pageno'] - 1) * args['count']
+    elif qwant_categ == "images":
+        args["count"] = 50
+        args["locale"] = q_locale
+        args["safesearch"] = params["safesearch"]
+        args["tgp"] = 3
+        args["offset"] = (params["pageno"] - 1) * args["count"]

    else:  # web, news, videos
+        args["count"] = 10
+        args["locale"] = q_locale
+        args["safesearch"] = params["safesearch"]
+        args["llm"] = "false"
+        args["tgp"] = 3
+        args["offset"] = (params["pageno"] - 1) * args["count"]

-        args['count'] = 10
-        args['locale'] = q_locale
-        args['safesearch'] = params['safesearch']
-        args['llm'] = 'false'
-        args['tgp'] = 3
-        args['offset'] = (params['pageno'] - 1) * args['count']
-
-    params['url'] = url + urlencode(args)
+    params["url"] = url + urlencode(args)

    return params


 def response(resp):

-    if qwant_categ == 'web-lite':
+    if qwant_categ == "web-lite":
        return parse_web_lite(resp)
    return parse_web_api(resp)

@@ -164,15 +161,15 @@ def parse_web_lite(resp):
    results = []
    dom = lxml.html.fromstring(resp.text)

-    for item in eval_xpath_list(dom, '//section/article'):
+    for item in eval_xpath_list(dom, "//section/article"):
        if eval_xpath(item, "./span[contains(@class, 'tooltip')]"):
            # ignore randomly interspersed advertisements
            continue
        results.append(
            {
-                'url': extract_text(eval_xpath(item, "./span[contains(@class, 'url partner')]")),
-                'title': extract_text(eval_xpath(item, './h2/a')),
-                'content': extract_text(eval_xpath(item, './p')),
+                "url": extract_text(eval_xpath(item, "./span[contains(@class, 'url partner')]")),
+                "title": extract_text(eval_xpath(item, "./h2/a")),
+                "content": extract_text(eval_xpath(item, "./p")),
            }
        )

@@ -191,35 +188,35 @@ def parse_web_api(resp):
    except ValueError:
        search_results = {}

-    data = search_results.get('data', {})
+    data = search_results.get("data", {})

    # check for an API error
-    if search_results.get('status') != 'success':
-        error_code = data.get('error_code')
+    if search_results.get("status") != "success":
+        error_code = data.get("error_code")
        if error_code == 24:
            raise SearxEngineTooManyRequestsException()
        if search_results.get("data", {}).get("error_data", {}).get("captchaUrl") is not None:
            raise SearxEngineCaptchaException()
        if resp.status_code == 403:
            raise SearxEngineAccessDeniedException()
-        msg = ",".join(data.get('message', ['unknown']))
+        msg = ",".join(data.get("message", ["unknown"]))
        raise SearxEngineAPIException(f"{msg} ({error_code})")

    # raise for other errors
    raise_for_httperror(resp)

-    if qwant_categ == 'web':
+    if qwant_categ == "web":
        # The WEB query contains a list named 'mainline'.  This list can contain
        # different result types (e.g. mainline[0]['type'] returns the type of
        # the result items in mainline[0]['items']).
-        mainline = data.get('result', {}).get('items', {}).get('mainline', {})
+        mainline = data.get("result", {}).get("items", {}).get("mainline", {})
    else:
        # Queries on News, Images and Videos do not have a list named 'mainline'
        # in the response.  The result items are directly in the list
        # result['items'].
-        mainline = data.get('result', {}).get('items', [])
+        mainline = data.get("result", {}).get("items", [])
        mainline = [
-            {'type': qwant_categ, 'items': mainline},
+            {"type": qwant_categ, "items": mainline},
        ]

    # return empty array if there are no results
@@ -227,68 +224,66 @@ def parse_web_api(resp):
        return []

    for row in mainline:
-        mainline_type = row.get('type', 'web')
+        mainline_type = row.get("type", "web")
        if mainline_type != qwant_categ:
            continue

-        if mainline_type == 'ads':
+        if mainline_type == "ads":
            # ignore ads
            continue

-        mainline_items = row.get('items', [])
+        mainline_items = row.get("items", [])
        for item in mainline_items:
+            title = item.get("title", None)
+            res_url = item.get("url", None)

-            title = item.get('title', None)
-            res_url = item.get('url', None)
-
-            if mainline_type == 'web':
-                content = item['desc']
+            if mainline_type == "web":
+                content = item["desc"]
                results.append(
                    {
-                        'title': title,
-                        'url': res_url,
-                        'content': content,
+                        "title": title,
+                        "url": res_url,
+                        "content": content,
                    }
                )

-            elif mainline_type == 'news':
-
-                pub_date = item['date']
+            elif mainline_type == "news":
+                pub_date = item["date"]
                if pub_date is not None:
                    pub_date = datetime.fromtimestamp(pub_date)
-                news_media = item.get('media', [])
+                news_media = item.get("media", [])
                thumbnail = None
                if news_media:
-                    thumbnail = news_media[0].get('pict', {}).get('url', None)
+                    thumbnail = news_media[0].get("pict", {}).get("url", None)
                results.append(
                    {
-                        'title': title,
-                        'url': res_url,
-                        'publishedDate': pub_date,
-                        'thumbnail': thumbnail,
+                        "title": title,
+                        "url": res_url,
+                        "publishedDate": pub_date,
+                        "thumbnail": thumbnail,
                    }
                )

-            elif mainline_type == 'images':
-                thumbnail = item['thumbnail']
-                img_src = item['media']
+            elif mainline_type == "images":
+                thumbnail = item["thumbnail"]
+                img_src = item["media"]
                results.append(
                    {
-                        'title': title,
-                        'url': res_url,
-                        'template': 'images.html',
-                        'thumbnail_src': thumbnail,
-                        'img_src': img_src,
-                        'resolution': f"{item['width']} x {item['height']}",
-                        'img_format': item.get('thumb_type'),
+                        "title": title,
+                        "url": res_url,
+                        "template": "images.html",
+                        "thumbnail_src": thumbnail,
+                        "img_src": img_src,
+                        "resolution": f"{item['width']} x {item['height']}",
+                        "img_format": item.get("thumb_type"),
                    }
                )

-            elif mainline_type == 'videos':
+            elif mainline_type == "videos":
                # some videos do not have a description: while qwant-video
                # returns an empty string, such videos from a qwant-web query
                # are missing the 'desc' key.
-                d, s, c = item.get('desc'), item.get('source'), item.get('channel')
+                d, s, c = item.get("desc"), item.get("source"), item.get("channel")
                content_parts = []
                if d:
                    content_parts.append(d)
@@ -296,27 +291,27 @@ def parse_web_api(resp):
                    content_parts.append("%s: %s " % (gettext("Source"), s))
                if c:
                    content_parts.append("%s: %s " % (gettext("Channel"), c))
-                content = ' // '.join(content_parts)
-                length = item['duration']
+                content = " // ".join(content_parts)
+                length = item["duration"]
                if length is not None:
                    length = timedelta(milliseconds=length)
-                pub_date = item['date']
+                pub_date = item["date"]
                if pub_date is not None:
                    pub_date = datetime.fromtimestamp(pub_date)
-                thumbnail = item['thumbnail']
+                thumbnail = item["thumbnail"]
                # from some locations (DE and others?) the s2 link responds
                # with a 'Please wait ..' but does not deliver the thumbnail
-                thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1)
+                thumbnail = thumbnail.replace("https://s2.qwant.com", "https://s1.qwant.com", 1)
                results.append(
                    {
-                        'title': title,
-                        'url': res_url,
-                        'content': content,
-                        'iframe_src': get_embeded_stream_url(res_url),
-                        'publishedDate': pub_date,
-                        'thumbnail': thumbnail,
-                        'template': 'videos.html',
-                        'length': length,
+                        "title": title,
+                        "url": res_url,
+                        "content": content,
+                        "iframe_src": get_embeded_stream_url(res_url),
+                        "publishedDate": pub_date,
+                        "thumbnail": thumbnail,
+                        "template": "videos.html",
+                        "length": length,
                    }
                )

@@ -326,22 +321,28 @@ def parse_web_api(resp):
 def fetch_traits(engine_traits: EngineTraits):

    # pylint: disable=import-outside-toplevel
-    from searx import network
    from searx.locales import region_tag
+    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.utils import extr

-    resp = network.get(about['website'])
-    json_string = extr(resp.text, 'INITIAL_PROPS = ', '</script>')
+    resp = get(
+        about["website"],
+        timeout=5,
+    )
+    if not resp.ok:
+        raise RuntimeError("Response from Qwant is not OK.")
+
+    json_string = extr(resp.text, "INITIAL_PROPS = ", "</script>")

    q_initial_props = loads(json_string)
-    q_locales = q_initial_props.get('locales')
+    q_locales = q_initial_props.get("locales")
    eng_tag_list = set()

    for country, v in q_locales.items():
-        for lang in v['langs']:
+        for lang in v["langs"]:
            _locale = "{lang}_{country}".format(lang=lang, country=country)

-            if qwant_categ == 'news' and _locale.lower() not in qwant_news_locales:
+            if qwant_categ == "news" and _locale.lower() not in qwant_news_locales:
                # qwant-news does not support all locales from qwant-web:
                continue

@@ -349,7 +350,7 @@ def fetch_traits(engine_traits: EngineTraits):

    for eng_tag in eng_tag_list:
        try:
-            sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_'))
+            sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep="_"))
        except babel.UnknownLocaleError:
            print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag)
            continue