[enh] data: traits population

Job failing since October 2025.

enh: always raise and reuse data
fix: brave unknown locale
fix: startpage add "brazilian"
This commit is contained in:
Ivan Gabaldon
2026-03-01 11:33:06 +01:00
committed by Markus Heiser
parent a9f3baefe6
commit 2b03a61832
16 changed files with 946 additions and 864 deletions
+130 -117
View File
@@ -11,40 +11,45 @@ engines:
"""
import typing as t
import re
import random
import re
import string
import time
from urllib.parse import urlencode, unquote
from lxml import html
import typing as t
from urllib.parse import unquote, urlencode
import babel
import babel.core
import babel.languages
from lxml import html
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex, gen_gsa_useragent
from searx.locales import language_tag, region_tag, get_official_locales
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import get_official_locales, language_tag, region_tag
from searx.result_types import EngineResults
from searx.utils import (
eval_xpath,
eval_xpath_getindex,
eval_xpath_list,
extract_text,
gen_gsa_useragent,
)
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
about = {
"website": 'https://www.google.com',
"wikidata_id": 'Q9366',
"official_api_documentation": 'https://developers.google.com/custom-search/',
"website": "https://www.google.com",
"wikidata_id": "Q9366",
"official_api_documentation": "https://developers.google.com/custom-search/",
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
"results": "HTML",
}
# engine dependent config
categories = ['general', 'web']
categories = ["general", "web"]
paging = True
max_page = 50
"""`Google max 50 pages`_
@@ -54,10 +59,10 @@ max_page = 50
time_range_support = True
safesearch = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
time_range_dict = {"day": "d", "week": "w", "month": "m", "year": "y"}
# Filter results. 0: None, 1: Moderate, 2: Strict
filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
filter_mapping = {0: "off", 1: "medium", 2: "high"}
# specific xpath variables
# ------------------------
@@ -87,7 +92,7 @@ def ui_async(start: int) -> str:
# create a new random arc_id every hour
if not _arcid_random or (int(time.time()) - _arcid_random[1]) > 3600:
_arcid_random = (''.join(random.choices(_arcid_range, k=23)), int(time.time()))
_arcid_random = ("".join(random.choices(_arcid_range, k=23)), int(time.time()))
arc_id = f"arc_id:srp_{_arcid_random[0]}_1{start:02}"
return ",".join([arc_id, use_ac, _fmt])
@@ -149,23 +154,23 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
"""
ret_val: dict[str, t.Any] = {
'language': None,
'country': None,
'subdomain': None,
'params': {},
'headers': {},
'cookies': {},
'locale': None,
"language": None,
"country": None,
"subdomain": None,
"params": {},
"headers": {},
"cookies": {},
"locale": None,
}
sxng_locale = params.get('searxng_locale', 'all')
sxng_locale = params.get("searxng_locale", "all")
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
locale = babel.Locale.parse(sxng_locale, sep="-")
except babel.core.UnknownLocaleError:
locale = None
eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
eng_lang = eng_traits.get_language(sxng_locale, "lang_en")
lang_code = eng_lang.split("_")[-1] # lang_zh-TW --> zh-TW / lang_en --> en
country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
# Test zh_hans & zh_hant --> in the topmost links in the result list of list
@@ -176,10 +181,10 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
# '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
# '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
ret_val['language'] = eng_lang
ret_val['country'] = country
ret_val['locale'] = locale
ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
ret_val["language"] = eng_lang
ret_val["country"] = country
ret_val["locale"] = locale
ret_val["subdomain"] = eng_traits.custom["supported_domains"].get(country.upper(), "www.google.com")
# hl parameter:
# The hl parameter specifies the interface language (host language) of
@@ -191,7 +196,7 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
# https://github.com/searxng/searxng/issues/2515#issuecomment-1607150817
ret_val['params']['hl'] = f'{lang_code}-{country}'
ret_val["params"]["hl"] = f"{lang_code}-{country}"
# lr parameter:
# The lr (language restrict) parameter restricts search results to
@@ -207,9 +212,9 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
# For example: &lr=lang_zh-TW%7Clang_de selects articles written in
# Traditional Chinese OR German.
ret_val['params']['lr'] = eng_lang
if sxng_locale == 'all':
ret_val['params']['lr'] = ''
ret_val["params"]["lr"] = eng_lang
if sxng_locale == "all":
ret_val["params"]["lr"] = ""
# cr parameter:
# The cr parameter restricts search results to documents originating in a
@@ -218,9 +223,9 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
# specify a region (country) only if a region is given in the selected
# locale --> https://github.com/searxng/searxng/issues/2672
ret_val['params']['cr'] = ''
if len(sxng_locale.split('-')) > 1:
ret_val['params']['cr'] = 'country' + country
ret_val["params"]["cr"] = ""
if len(sxng_locale.split("-")) > 1:
ret_val["params"]["cr"] = "country" + country
# gl parameter: (mandatory by Google News)
# The gl parameter value is a two-letter country code. For WebSearch
@@ -241,14 +246,14 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
# to interpret the query string. The default ie value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#iesp
ret_val['params']['ie'] = 'utf8'
ret_val["params"]["ie"] = "utf8"
# oe parameter:
# The oe parameter sets the character encoding scheme that should be used
# to decode the XML result. The default oe value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#oesp
ret_val['params']['oe'] = 'utf8'
ret_val["params"]["oe"] = "utf8"
# num parameter:
# The num parameter identifies the number of search results to return.
@@ -261,43 +266,43 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
# HTTP headers
ret_val['headers']['Accept'] = '*/*'
ret_val['headers']['User-Agent'] = gen_gsa_useragent()
ret_val["headers"]["Accept"] = "*/*"
ret_val["headers"]["User-Agent"] = gen_gsa_useragent()
# Cookies
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
# - https://github.com/searxng/searxng/issues/1555
ret_val['cookies']['CONSENT'] = "YES+"
ret_val["cookies"]["CONSENT"] = "YES+"
return ret_val
def detect_google_sorry(resp):
if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
if resp.url.host == "sorry.google.com" or resp.url.path.startswith("/sorry"):
raise SearxEngineCaptchaException()
def request(query: str, params: "OnlineParams") -> None:
"""Google search request"""
# pylint: disable=line-too-long
start = (params['pageno'] - 1) * 10
start = (params["pageno"] - 1) * 10
str_async = ui_async(start)
google_info = get_google_info(params, traits)
logger.debug("ARC_ID: %s", str_async)
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
query_url = (
'https://'
+ google_info['subdomain']
+ '/search'
"https://"
+ google_info["subdomain"]
+ "/search"
+ "?"
+ urlencode(
{
'q': query,
**google_info['params'],
'filter': '0',
'start': start,
"q": query,
**google_info["params"],
"filter": "0",
"start": start,
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
# 'cs' : 1,
@@ -308,20 +313,20 @@ def request(query: str, params: "OnlineParams") -> None:
# 'sa': 'N',
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
# formerly known as use_mobile_ui
'asearch': 'arc',
'async': str_async,
"asearch": "arc",
"async": str_async,
}
)
)
if params['time_range'] in time_range_dict:
query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
if params['safesearch']:
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
if params["time_range"] in time_range_dict:
query_url += "&" + urlencode({"tbs": "qdr:" + time_range_dict[params["time_range"]]})
if params["safesearch"]:
query_url += "&" + urlencode({"safe": filter_mapping[params["safesearch"]]})
params["url"] = query_url
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
params["cookies"] = google_info["cookies"]
params["headers"].update(google_info["headers"])
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
@@ -334,14 +339,14 @@ def parse_data_images(text: str):
data_image_map = {}
for img_id, data_image in RE_DATA_IMAGE.findall(text):
end_pos = data_image.rfind('=')
end_pos = data_image.rfind("=")
if end_pos > 0:
data_image = data_image[: end_pos + 1]
data_image_map[img_id] = data_image
last = RE_DATA_IMAGE_end.search(text)
if last:
data_image_map[last.group(1)] = last.group(2)
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
logger.debug("data:image objects --> %s", list(data_image_map.keys()))
return data_image_map
@@ -365,15 +370,18 @@ def response(resp: "SXNG_Response"):
title_tag = eval_xpath_getindex(result, './/div[contains(@role, "link")]', 0, default=None)
if title_tag is None:
# this is not one of the common google results *section*
logger.debug('ignoring item from the result_xpath list: missing title')
logger.debug("ignoring item from the result_xpath list: missing title")
continue
title = extract_text(title_tag)
raw_url = eval_xpath_getindex(result, './/a/@href', 0, None)
raw_url = eval_xpath_getindex(result, ".//a/@href", 0, None)
if raw_url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
logger.debug(
'ignoring item from the result_xpath list: missing url of title "%s"',
title,
)
continue
url = unquote(raw_url[7:].split('&sa=U')[0]) # remove the google redirector
url = unquote(raw_url[7:].split("&sa=U")[0]) # remove the google redirector
content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
for item in content_nodes:
@@ -383,20 +391,23 @@ def response(resp: "SXNG_Response"):
content = extract_text(content_nodes)
if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
logger.debug(
'ignoring item from the result_xpath list: missing content of title "%s"',
title,
)
continue
thumbnail = content_nodes[0].xpath('.//img/@src')
thumbnail = content_nodes[0].xpath(".//img/@src")
if thumbnail:
thumbnail = thumbnail[0]
if thumbnail.startswith('data:image'):
img_id = content_nodes[0].xpath('.//img/@id')
if thumbnail.startswith("data:image"):
img_id = content_nodes[0].xpath(".//img/@id")
if img_id:
thumbnail = data_image_map.get(img_id[0])
else:
thumbnail = None
results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail})
results.append({"url": url, "title": title, "content": content, "thumbnail": thumbnail})
except Exception as e: # pylint: disable=broad-except
logger.error(e, exc_info=True)
@@ -405,7 +416,7 @@ def response(resp: "SXNG_Response"):
# parse suggestion
for suggestion in eval_xpath_list(dom, suggestion_xpath):
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
results.append({"suggestion": extract_text(suggestion)})
# return results
return results
@@ -416,27 +427,27 @@ def response(resp: "SXNG_Response"):
skip_countries = [
# official language of google-country not in google-languages
'AL', # Albanien (sq)
'AZ', # Aserbaidschan (az)
'BD', # Bangladesch (bn)
'BN', # Brunei Darussalam (ms)
'BT', # Bhutan (dz)
'ET', # Äthiopien (am)
'GE', # Georgien (ka, os)
'GL', # Grönland (kl)
'KH', # Kambodscha (km)
'LA', # Laos (lo)
'LK', # Sri Lanka (si, ta)
'ME', # Montenegro (sr)
'MK', # Nordmazedonien (mk, sq)
'MM', # Myanmar (my)
'MN', # Mongolei (mn)
'MV', # Malediven (dv) // dv_MV is unknown by babel
'MY', # Malaysia (ms)
'NP', # Nepal (ne)
'TJ', # Tadschikistan (tg)
'TM', # Turkmenistan (tk)
'UZ', # Usbekistan (uz)
"AL", # Albanien (sq)
"AZ", # Aserbaidschan (az)
"BD", # Bangladesch (bn)
"BN", # Brunei Darussalam (ms)
"BT", # Bhutan (dz)
"ET", # Äthiopien (am)
"GE", # Georgien (ka, os)
"GL", # Grönland (kl)
"KH", # Kambodscha (km)
"LA", # Laos (lo)
"LK", # Sri Lanka (si, ta)
"ME", # Montenegro (sr)
"MK", # Nordmazedonien (mk, sq)
"MM", # Myanmar (my)
"MN", # Mongolei (mn)
"MV", # Malediven (dv) // dv_MV is unknown by babel
"MY", # Malaysia (ms)
"NP", # Nepal (ne)
"TJ", # Tadschikistan (tg)
"TM", # Turkmenistan (tk)
"UZ", # Usbekistan (uz)
]
@@ -444,21 +455,23 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
"""Fetch languages from Google."""
# pylint: disable=import-outside-toplevel, too-many-branches
engine_traits.custom['supported_domains'] = {}
from searx.network import get # see https://github.com/searxng/searxng/issues/762
resp = get('https://www.google.com/preferences')
if not resp.ok: # type: ignore
raise RuntimeError("Response from Google's preferences is not OK.")
engine_traits.custom["supported_domains"] = {}
dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ''))
resp = get("https://www.google.com/preferences", timeout=5)
if not resp.ok:
raise RuntimeError("Response from Google preferences is not OK.")
dom = html.fromstring(resp.text.replace('<?xml version="1.0" encoding="UTF-8"?>', ""))
# supported language codes
lang_map = {'no': 'nb'}
lang_map = {"no": "nb"}
for x in eval_xpath_list(dom, "//select[@name='hl']/option"):
eng_lang = x.get("value")
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep="-")
except babel.UnknownLocaleError:
print("INFO: google UI language %s (%s) is unknown by babel" % (eng_lang, x.text.split("(")[0].strip()))
continue
@@ -469,10 +482,10 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
engine_traits.languages[sxng_lang] = "lang_" + eng_lang
# alias languages
engine_traits.languages['zh'] = 'lang_zh-CN'
engine_traits.languages["zh"] = "lang_zh-CN"
# supported region codes
@@ -481,37 +494,37 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
if eng_country in skip_countries:
continue
if eng_country == 'ZZ':
engine_traits.all_locale = 'ZZ'
if eng_country == "ZZ":
engine_traits.all_locale = "ZZ"
continue
sxng_locales = get_official_locales(eng_country, engine_traits.languages.keys(), regional=True)
if not sxng_locales:
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get("data-name"), eng_country))
continue
for sxng_locale in sxng_locales:
engine_traits.regions[region_tag(sxng_locale)] = eng_country
# alias regions
engine_traits.regions['zh-CN'] = 'HK'
engine_traits.regions["zh-CN"] = "HK"
# supported domains
if add_domains:
resp = get('https://www.google.com/supported_domains')
if not resp.ok: # type: ignore
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
resp = get("https://www.google.com/supported_domains", timeout=5)
if not resp.ok:
raise RuntimeError("Response from Google supported domains is not OK.")
for domain in resp.text.split(): # type: ignore
for domain in resp.text.split():
domain = domain.strip()
if not domain or domain in [
'.google.com',
".google.com",
]:
continue
region = domain.split('.')[-1].upper()
engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore
if region == 'HK':
region = domain.split(".")[-1].upper()
engine_traits.custom["supported_domains"][region] = "www" + domain
if region == "HK":
# There is no google.cn, we use .com.hk for zh-CN
engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore
engine_traits.custom["supported_domains"]["CN"] = "www" + domain