[enh] data: traits population

Job failing since October 2025.

enh: always raise and reuse data
fix: brave unknown locale
fix: startpage add "brazilian"
This commit is contained in:
Ivan Gabaldon
2026-03-01 11:33:06 +01:00
committed by Markus Heiser
parent a9f3baefe6
commit 2b03a61832
16 changed files with 946 additions and 864 deletions
+104 -103
View File
@@ -45,19 +45,19 @@ from datetime import (
)
from json import loads
from urllib.parse import urlencode
from flask_babel import gettext
import babel
import lxml
from flask_babel import gettext
from searx.enginelib.traits import EngineTraits
from searx.exceptions import (
SearxEngineAPIException,
SearxEngineTooManyRequestsException,
SearxEngineCaptchaException,
SearxEngineAccessDeniedException,
SearxEngineAPIException,
SearxEngineCaptchaException,
SearxEngineTooManyRequestsException,
)
from searx.network import raise_for_httperror
from searx.enginelib.traits import EngineTraits
from searx.utils import (
eval_xpath,
eval_xpath_list,
@@ -67,12 +67,12 @@ from searx.utils import (
# about
about = {
"website": 'https://www.qwant.com/',
"wikidata_id": 'Q14657870',
"website": "https://www.qwant.com/",
"wikidata_id": "Q14657870",
"official_api_documentation": None,
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
"results": "JSON",
}
# engine dependent config
@@ -100,10 +100,10 @@ qwant_news_locales = [
# search-url
api_url = 'https://api.qwant.com/v3/search/'
api_url = "https://api.qwant.com/v3/search/"
"""URL of Qwant's API (JSON)"""
web_lite_url = 'https://lite.qwant.com/'
web_lite_url = "https://lite.qwant.com/"
"""URL of Qwant-Lite (HTML)"""
@@ -113,47 +113,44 @@ def request(query, params):
if not query:
return None
q_locale = traits.get_region(params["searxng_locale"], default='en_US')
q_locale = traits.get_region(params["searxng_locale"], default="en_US")
url = api_url + f'{qwant_categ}?'
args = {'q': query}
params['raise_for_httperror'] = False
url = api_url + f"{qwant_categ}?"
args = {"q": query}
params["raise_for_httperror"] = False
if qwant_categ == 'web-lite':
if qwant_categ == "web-lite":
url = web_lite_url + "?"
args["locale"] = q_locale.lower()
args["l"] = q_locale.split("_")[0]
args["s"] = params["safesearch"]
args["p"] = params["pageno"]
url = web_lite_url + '?'
args['locale'] = q_locale.lower()
args['l'] = q_locale.split('_')[0]
args['s'] = params['safesearch']
args['p'] = params['pageno']
params["raise_for_httperror"] = True
params['raise_for_httperror'] = True
elif qwant_categ == 'images':
args['count'] = 50
args['locale'] = q_locale
args['safesearch'] = params['safesearch']
args['tgp'] = 3
args['offset'] = (params['pageno'] - 1) * args['count']
elif qwant_categ == "images":
args["count"] = 50
args["locale"] = q_locale
args["safesearch"] = params["safesearch"]
args["tgp"] = 3
args["offset"] = (params["pageno"] - 1) * args["count"]
else: # web, news, videos
args["count"] = 10
args["locale"] = q_locale
args["safesearch"] = params["safesearch"]
args["llm"] = "false"
args["tgp"] = 3
args["offset"] = (params["pageno"] - 1) * args["count"]
args['count'] = 10
args['locale'] = q_locale
args['safesearch'] = params['safesearch']
args['llm'] = 'false'
args['tgp'] = 3
args['offset'] = (params['pageno'] - 1) * args['count']
params['url'] = url + urlencode(args)
params["url"] = url + urlencode(args)
return params
def response(resp):
if qwant_categ == 'web-lite':
if qwant_categ == "web-lite":
return parse_web_lite(resp)
return parse_web_api(resp)
@@ -164,15 +161,15 @@ def parse_web_lite(resp):
results = []
dom = lxml.html.fromstring(resp.text)
for item in eval_xpath_list(dom, '//section/article'):
for item in eval_xpath_list(dom, "//section/article"):
if eval_xpath(item, "./span[contains(@class, 'tooltip')]"):
# ignore randomly interspersed advertisements
continue
results.append(
{
'url': extract_text(eval_xpath(item, "./span[contains(@class, 'url partner')]")),
'title': extract_text(eval_xpath(item, './h2/a')),
'content': extract_text(eval_xpath(item, './p')),
"url": extract_text(eval_xpath(item, "./span[contains(@class, 'url partner')]")),
"title": extract_text(eval_xpath(item, "./h2/a")),
"content": extract_text(eval_xpath(item, "./p")),
}
)
@@ -191,35 +188,35 @@ def parse_web_api(resp):
except ValueError:
search_results = {}
data = search_results.get('data', {})
data = search_results.get("data", {})
# check for an API error
if search_results.get('status') != 'success':
error_code = data.get('error_code')
if search_results.get("status") != "success":
error_code = data.get("error_code")
if error_code == 24:
raise SearxEngineTooManyRequestsException()
if search_results.get("data", {}).get("error_data", {}).get("captchaUrl") is not None:
raise SearxEngineCaptchaException()
if resp.status_code == 403:
raise SearxEngineAccessDeniedException()
msg = ",".join(data.get('message', ['unknown']))
msg = ",".join(data.get("message", ["unknown"]))
raise SearxEngineAPIException(f"{msg} ({error_code})")
# raise for other errors
raise_for_httperror(resp)
if qwant_categ == 'web':
if qwant_categ == "web":
# The WEB query contains a list named 'mainline'. This list can contain
# different result types (e.g. mainline[0]['type'] returns the type of the
# result items in mainline[0]['items']).
mainline = data.get('result', {}).get('items', {}).get('mainline', {})
mainline = data.get("result", {}).get("items", {}).get("mainline", {})
else:
# Queries on News, Images and Videos do not have a list named 'mainline'
# in the response. The result items are directly in the list
# result['items'].
mainline = data.get('result', {}).get('items', [])
mainline = data.get("result", {}).get("items", [])
mainline = [
{'type': qwant_categ, 'items': mainline},
{"type": qwant_categ, "items": mainline},
]
# return empty array if there are no results
@@ -227,68 +224,66 @@ def parse_web_api(resp):
return []
for row in mainline:
mainline_type = row.get('type', 'web')
mainline_type = row.get("type", "web")
if mainline_type != qwant_categ:
continue
if mainline_type == 'ads':
if mainline_type == "ads":
# ignore ads
continue
mainline_items = row.get('items', [])
mainline_items = row.get("items", [])
for item in mainline_items:
title = item.get("title", None)
res_url = item.get("url", None)
title = item.get('title', None)
res_url = item.get('url', None)
if mainline_type == 'web':
content = item['desc']
if mainline_type == "web":
content = item["desc"]
results.append(
{
'title': title,
'url': res_url,
'content': content,
"title": title,
"url": res_url,
"content": content,
}
)
elif mainline_type == 'news':
pub_date = item['date']
elif mainline_type == "news":
pub_date = item["date"]
if pub_date is not None:
pub_date = datetime.fromtimestamp(pub_date)
news_media = item.get('media', [])
news_media = item.get("media", [])
thumbnail = None
if news_media:
thumbnail = news_media[0].get('pict', {}).get('url', None)
thumbnail = news_media[0].get("pict", {}).get("url", None)
results.append(
{
'title': title,
'url': res_url,
'publishedDate': pub_date,
'thumbnail': thumbnail,
"title": title,
"url": res_url,
"publishedDate": pub_date,
"thumbnail": thumbnail,
}
)
elif mainline_type == 'images':
thumbnail = item['thumbnail']
img_src = item['media']
elif mainline_type == "images":
thumbnail = item["thumbnail"]
img_src = item["media"]
results.append(
{
'title': title,
'url': res_url,
'template': 'images.html',
'thumbnail_src': thumbnail,
'img_src': img_src,
'resolution': f"{item['width']} x {item['height']}",
'img_format': item.get('thumb_type'),
"title": title,
"url": res_url,
"template": "images.html",
"thumbnail_src": thumbnail,
"img_src": img_src,
"resolution": f"{item['width']} x {item['height']}",
"img_format": item.get("thumb_type"),
}
)
elif mainline_type == 'videos':
elif mainline_type == "videos":
# some videos do not have a description: while qwant-video
# returns an empty string, such videos from a qwant-web query
# lack the 'desc' key.
d, s, c = item.get('desc'), item.get('source'), item.get('channel')
d, s, c = item.get("desc"), item.get("source"), item.get("channel")
content_parts = []
if d:
content_parts.append(d)
@@ -296,27 +291,27 @@ def parse_web_api(resp):
content_parts.append("%s: %s " % (gettext("Source"), s))
if c:
content_parts.append("%s: %s " % (gettext("Channel"), c))
content = ' // '.join(content_parts)
length = item['duration']
content = " // ".join(content_parts)
length = item["duration"]
if length is not None:
length = timedelta(milliseconds=length)
pub_date = item['date']
pub_date = item["date"]
if pub_date is not None:
pub_date = datetime.fromtimestamp(pub_date)
thumbnail = item['thumbnail']
thumbnail = item["thumbnail"]
# from some locations (DE and others?) the s2 link responds with
# a 'Please wait ..' page but does not deliver the thumbnail
thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1)
thumbnail = thumbnail.replace("https://s2.qwant.com", "https://s1.qwant.com", 1)
results.append(
{
'title': title,
'url': res_url,
'content': content,
'iframe_src': get_embeded_stream_url(res_url),
'publishedDate': pub_date,
'thumbnail': thumbnail,
'template': 'videos.html',
'length': length,
"title": title,
"url": res_url,
"content": content,
"iframe_src": get_embeded_stream_url(res_url),
"publishedDate": pub_date,
"thumbnail": thumbnail,
"template": "videos.html",
"length": length,
}
)
@@ -326,22 +321,28 @@ def parse_web_api(resp):
def fetch_traits(engine_traits: EngineTraits):
# pylint: disable=import-outside-toplevel
from searx import network
from searx.locales import region_tag
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.utils import extr
resp = network.get(about['website'])
json_string = extr(resp.text, 'INITIAL_PROPS = ', '</script>')
resp = get(
about["website"],
timeout=5,
)
if not resp.ok:
raise RuntimeError("Response from Qwant is not OK.")
json_string = extr(resp.text, "INITIAL_PROPS = ", "</script>")
q_initial_props = loads(json_string)
q_locales = q_initial_props.get('locales')
q_locales = q_initial_props.get("locales")
eng_tag_list = set()
for country, v in q_locales.items():
for lang in v['langs']:
for lang in v["langs"]:
_locale = "{lang}_{country}".format(lang=lang, country=country)
if qwant_categ == 'news' and _locale.lower() not in qwant_news_locales:
if qwant_categ == "news" and _locale.lower() not in qwant_news_locales:
# qwant-news does not support all locales from qwant-web:
continue
@@ -349,7 +350,7 @@ def fetch_traits(engine_traits: EngineTraits):
for eng_tag in eng_tag_list:
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_'))
sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep="_"))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag)
continue