From 490f28f0a6649f6cfdd1b2d00f488b381c3a9d1e Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Fri, 20 Feb 2026 15:04:59 +0100 Subject: [PATCH] [mod] online engines - set common HTTP headers The online engines emulate a request as it would come from a web browser, which is why the HTTP headers in the default settings should also be set the way a standard web browser would set them. Signed-off-by: Markus Heiser --- searx/enginelib/__init__.py | 6 +-- searx/engines/__init__.py | 2 +- searx/engines/adobe_stock.py | 1 - searx/engines/brave.py | 1 - searx/engines/demo_online.py | 2 +- searx/engines/duckduckgo_definitions.py | 1 - searx/engines/duckduckgo_weather.py | 2 - searx/engines/google_images.py | 1 - searx/engines/google_news.py | 1 - searx/engines/google_play.py | 2 - searx/engines/google_scholar.py | 1 - searx/engines/openstreetmap.py | 1 - searx/engines/presearch.py | 3 +- searx/engines/qwant.py | 3 -- searx/engines/startpage.py | 57 +++++++++---------------- searx/engines/wikipedia.py | 6 +-- searx/engines/yahoo.py | 1 - searx/search/processors/online.py | 25 ++++++----- 18 files changed, 41 insertions(+), 75 deletions(-) diff --git a/searx/enginelib/__init__.py b/searx/enginelib/__init__.py index 9d864e622..ad112df24 100644 --- a/searx/enginelib/__init__.py +++ b/searx/enginelib/__init__.py @@ -298,9 +298,9 @@ class Engine(abc.ABC): # pylint: disable=too-few-public-methods """Using tor proxy (``true``) or not (``false``) for this engine.""" send_accept_language_header: bool - """When this option is activated, the language (locale) that is selected by - the user is used to build and send a ``Accept-Language`` header in the - request to the origin search engine.""" + """When this option is activated (default), the language (locale) that is + selected by the user is used to build and send a ``Accept-Language`` header + in the request to the origin search engine.""" tokens: list[str] """A list of secret tokens to make this engine *private*, more details see 
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 8b2bcad48..1b1574b8b 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -43,7 +43,7 @@ ENGINE_DEFAULT_ARGS: dict[str, int | str | list[t.Any] | dict[str, t.Any] | bool "inactive": False, "about": {}, "using_tor_proxy": False, - "send_accept_language_header": False, + "send_accept_language_header": True, "tokens": [], "max_page": 0, } diff --git a/searx/engines/adobe_stock.py b/searx/engines/adobe_stock.py index c6870a7e5..335a505ee 100644 --- a/searx/engines/adobe_stock.py +++ b/searx/engines/adobe_stock.py @@ -52,7 +52,6 @@ about = { categories = [] paging = True -send_accept_language_header = True results_per_page = 10 base_url = "https://stock.adobe.com" diff --git a/searx/engines/brave.py b/searx/engines/brave.py index 7a7ef3866..9b716c843 100644 --- a/searx/engines/brave.py +++ b/searx/engines/brave.py @@ -172,7 +172,6 @@ the UI of Brave the user gets warned about this, since we can not warn the user in SearXNG, the spellchecking is disabled by default. 
""" -send_accept_language_header = True paging = False """Brave only supports paging in :py:obj:`brave_category` ``search`` (UI category All) and in the goggles category.""" diff --git a/searx/engines/demo_online.py b/searx/engines/demo_online.py index 3a5b6d817..3dcf3935b 100644 --- a/searx/engines/demo_online.py +++ b/searx/engines/demo_online.py @@ -32,7 +32,7 @@ if t.TYPE_CHECKING: engine_type = "online" -send_accept_language_header = True +# send_accept_language_header = False categories = ["general"] disabled = True timeout = 2.0 diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 1ca590505..6d330cebe 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -31,7 +31,6 @@ about = { "results": 'JSON', } -send_accept_language_header = True URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1' diff --git a/searx/engines/duckduckgo_weather.py b/searx/engines/duckduckgo_weather.py index 4d52effcd..f6d97a15b 100644 --- a/searx/engines/duckduckgo_weather.py +++ b/searx/engines/duckduckgo_weather.py @@ -27,8 +27,6 @@ about = { "results": "JSON", } -send_accept_language_header = True - # engine dependent config categories = ["weather"] base_url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 3baf29373..8fc36f853 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -44,7 +44,6 @@ max_page = 50 time_range_support = True safesearch = True -send_accept_language_header = True filter_mapping = {0: 'images', 1: 'active', 2: 'active'} diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index cb714597a..231840d87 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -64,7 +64,6 @@ time_range_support = False # # safesearch : results are identical for safesearch=0 and safesearch=2 
safesearch = True -# send_accept_language_header = True def request(query, params): diff --git a/searx/engines/google_play.py b/searx/engines/google_play.py index 8135161a1..2636ed659 100644 --- a/searx/engines/google_play.py +++ b/searx/engines/google_play.py @@ -19,8 +19,6 @@ about = { "results": "HTML", } -send_accept_language_header = True - play_categ = None # apps|movies base_url = 'https://play.google.com' search_url = base_url + "/store/search?{query}&c={play_categ}" diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index b60b257bd..e032e25a1 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -71,7 +71,6 @@ max_page = 50 language_support = True time_range_support = True safesearch = False -send_accept_language_header = True def request(query: str, params: "OnlineParams") -> None: diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 27afa4866..e62bd205a 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -27,7 +27,6 @@ about = { categories = ['map'] paging = False language_support = True -send_accept_language_header = True # search-url base_url = 'https://nominatim.openstreetmap.org/' diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py index 55bf16adb..00a9d3ed8 100644 --- a/searx/engines/presearch.py +++ b/searx/engines/presearch.py @@ -56,7 +56,7 @@ Since the region is already "auto" by default, we only need to set the ``use_local_search_results`` cookie and send the ``Accept-Language`` header. We have to set these values in both requests we send to Presearch; in the first request to get the request-ID from Presearch and in the final request to get the -result list (see ``send_accept_language_header``). +result list. The time format returned by Presearch varies depending on the language set. 
Multiple different formats can be supported by using ``dateutil`` parser, but @@ -86,7 +86,6 @@ about = { paging = True safesearch = True time_range_support = True -send_accept_language_header = True categories = ["general", "web"] # general, images, videos, news # HTTP2 requests immediately get blocked by a CAPTCHA diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index b0c08c6eb..b2940832d 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -82,9 +82,6 @@ max_page = 5 """5 pages maximum (``&p=5``): Trying to do more just results in an improper redirect""" -# Otherwise Qwant will return 403 if not set -send_accept_language_header = True - qwant_categ = None """One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``""" diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index cfa5de331..3267daee5 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -49,10 +49,15 @@ W3C recommends subtag over macrolanguage [2]_. Startpage languages =================== -:py:obj:`send_accept_language_header`: +HTTP ``Accept-Language`` header (``send_accept_language_header``): The displayed name in Startpage's settings page depend on the location of the - IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits` - we use:: + IP when ``Accept-Language`` HTTP header is unset. + + Startpage tries to guess user's language and territory from the HTTP + ``Accept-Language``. Optional the user can select a search-language (can be + different to the UI language) and a region filter. + + In :py:obj:`fetch_traits` we use:: 'Accept-Language': "en-US,en;q=0.5", .. @@ -112,12 +117,6 @@ startpage_categ = 'web' """Startpage's category, visit :ref:`startpage categories`. """ -send_accept_language_header = True -"""Startpage tries to guess user's language and territory from the HTTP -``Accept-Language``. 
Optional the user can select a search-language (can be -different to the UI language) and a region filter. -""" - # engine dependent config categories = ['general', 'web'] paging = True @@ -163,14 +162,14 @@ def init(_): # hint: all three startpage engines (WEB, Images & News) can/should use the # same sc_code .. - CACHE = EngineCache("startpage") # type:ignore + CACHE = EngineCache("startpage") sc_code_cache_sec = 3600 """Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" -def get_sc_code(searxng_locale, params): +def get_sc_code(params): """Get an actual ``sc`` argument from Startpage's search form (HTML page). Startpage puts a ``sc`` argument on every HTML :py:obj:`search form @@ -183,30 +182,14 @@ def get_sc_code(searxng_locale, params): :py:obj:`sc_code_cache_sec` seconds.""" sc_code = CACHE.get("SC_CODE") - if sc_code: logger.debug("get_sc_code: using cached value: %s", sc_code) return sc_code - headers = {**params['headers']} - - # add Accept-Language header - if searxng_locale == 'all': - searxng_locale = 'en-US' - locale = babel.Locale.parse(searxng_locale, sep='-') - - if send_accept_language_header: - ac_lang = locale.language - if locale.territory: - ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( - locale.language, - locale.territory, - locale.language, - ) - headers['Accept-Language'] = ac_lang - - get_sc_url = base_url + '/' + get_sc_url = base_url + "/" logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url) + + headers = {**params['headers']} logger.debug("get_sc_code: request headers: %s", headers) resp = get(get_sc_url, headers=headers) @@ -214,19 +197,19 @@ def get_sc_code(searxng_locale, params): # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg # ?? 
ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21 - if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore + if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): raise SearxEngineCaptchaException( message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha", ) - dom = lxml.html.fromstring(resp.text) # type: ignore + dom = lxml.html.fromstring(resp.text) try: sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0] except IndexError as exc: logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695") raise SearxEngineCaptchaException( - message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url, # type: ignore + message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url, ) from exc sc_code = str(sc_code) @@ -259,7 +242,7 @@ def request(query, params): 'query': query, 'cat': startpage_categ, 't': 'device', - 'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers + 'sc': get_sc_code(params), 'with_date': time_range_dict.get(params['time_range'], ''), 'abp': '1', 'abd': '1', @@ -437,10 +420,10 @@ def fetch_traits(engine_traits: EngineTraits): } resp = get('https://www.startpage.com/do/settings', headers=headers) - if not resp.ok: # type: ignore + if not resp.ok: print("ERROR: response from Startpage is not OK.") - dom = lxml.html.fromstring(resp.text) # type: ignore + dom = lxml.html.fromstring(resp.text) # regions @@ -453,7 +436,7 @@ def fetch_traits(engine_traits: EngineTraits): continue babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway - if '-' in babel_region_tag: + if '-' in babel_region_tag: # pyright: ignore[reportOperatorIssue] l, r = babel_region_tag.split('-') r = r.split('_')[-1] sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_')) diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 00537f162..e5403d194 
100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -79,10 +79,6 @@ display_type = ["infobox"] one will add a hit to the result list. The first one will show a hit in the info box. Both values can be set, or one of the two can be set.""" -send_accept_language_header = True -"""The HTTP ``Accept-Language`` header is needed for wikis where -LanguageConverter_ is enabled.""" - list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' """`List of all wikipedias `_ """ @@ -102,7 +98,7 @@ rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}' previews (fka. Hovercards, aka. Popups) on the web and link previews in the apps. -HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`): +HTTP ``Accept-Language`` header (``send_accept_language_header``): The desired language variant code for wikis where LanguageConverter_ is enabled. diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index d20598982..ca455c603 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -33,7 +33,6 @@ about = { categories = ['general', 'web'] paging = True time_range_support = True -# send_accept_language_header = True time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'} safesearch_dict = {0: 'p', 1: 'i', 2: 'r'} diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 1195b2fad..53389ad04 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -141,28 +141,31 @@ class OnlineProcessor(EngineProcessor): params: OnlineParams = {**default_request_params(), **base_params} headers = params["headers"] + headers["Accept-Encoding"] = "gzip, deflate" + headers["Cache-Control"] = "no-cache" + headers["DNT"] = "1" + headers["Connection"] = "keep-alive" # add an user agent headers["User-Agent"] = gen_useragent() # add Accept-Language header + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Accept-Language + + 
headers["Accept-Language"] = "en,en-US;q=0.7,en;q=0.3" if self.engine.send_accept_language_header and search_query.locale: - ac_lang = search_query.locale.language - if search_query.locale.territory: - ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( - search_query.locale.language, - search_query.locale.territory, - search_query.locale.language, - ) - headers["Accept-Language"] = ac_lang + _l = search_query.locale.language + _t = search_query.locale.territory or _l + headers["Accept-Language"] = f"{_l},{_l}-{_t};q=0.7,en;q=0.3" self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", "")) # https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header - headers["Sec-Fetch-Dest"] = "empty" - headers["Sec-Fetch-Mode"] = "cors" + headers["Sec-Fetch-Dest"] = "document" + headers["Sec-Fetch-Mode"] = "navigate" headers["Sec-Fetch-Site"] = "same-origin" headers["Sec-Fetch-User"] = "?1" - headers["Sec-GPC"] = "1" + # Sec-GPC is in an experimental state (Firefox only) + # headers["Sec-GPC"] = "1" return params