[mod] online engines - set common HTTP headers

The online engines emulate a request as it would come from a web browser, which
is why the HTTP headers in the default settings should also be set the way a
standard web browser would set them.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser
2026-02-20 15:04:59 +01:00
committed by Markus Heiser
parent cc39cf7df3
commit 490f28f0a6
18 changed files with 41 additions and 75 deletions
+3 -3
View File
@@ -298,9 +298,9 @@ class Engine(abc.ABC): # pylint: disable=too-few-public-methods
"""Using tor proxy (``true``) or not (``false``) for this engine.""" """Using tor proxy (``true``) or not (``false``) for this engine."""
send_accept_language_header: bool send_accept_language_header: bool
"""When this option is activated, the language (locale) that is selected by """When this option is activated (default), the language (locale) that is
the user is used to build and send an ``Accept-Language`` header in the selected by the user is used to build and send an ``Accept-Language`` header
request to the origin search engine.""" in the request to the origin search engine."""
tokens: list[str] tokens: list[str]
"""A list of secret tokens to make this engine *private*, more details see """A list of secret tokens to make this engine *private*, more details see
+1 -1
View File
@@ -43,7 +43,7 @@ ENGINE_DEFAULT_ARGS: dict[str, int | str | list[t.Any] | dict[str, t.Any] | bool
"inactive": False, "inactive": False,
"about": {}, "about": {},
"using_tor_proxy": False, "using_tor_proxy": False,
"send_accept_language_header": False, "send_accept_language_header": True,
"tokens": [], "tokens": [],
"max_page": 0, "max_page": 0,
} }
-1
View File
@@ -52,7 +52,6 @@ about = {
categories = [] categories = []
paging = True paging = True
send_accept_language_header = True
results_per_page = 10 results_per_page = 10
base_url = "https://stock.adobe.com" base_url = "https://stock.adobe.com"
-1
View File
@@ -172,7 +172,6 @@ the UI of Brave the user gets warned about this, since we can not warn the user
in SearXNG, the spellchecking is disabled by default. in SearXNG, the spellchecking is disabled by default.
""" """
send_accept_language_header = True
paging = False paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI """Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category.""" category All) and in the goggles category."""
+1 -1
View File
@@ -32,7 +32,7 @@ if t.TYPE_CHECKING:
engine_type = "online" engine_type = "online"
send_accept_language_header = True # send_accept_language_header = False
categories = ["general"] categories = ["general"]
disabled = True disabled = True
timeout = 2.0 timeout = 2.0
-1
View File
@@ -31,7 +31,6 @@ about = {
"results": 'JSON', "results": 'JSON',
} }
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1' URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
-2
View File
@@ -27,8 +27,6 @@ about = {
"results": "JSON", "results": "JSON",
} }
send_accept_language_header = True
# engine dependent config # engine dependent config
categories = ["weather"] categories = ["weather"]
base_url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}" base_url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
-1
View File
@@ -44,7 +44,6 @@ max_page = 50
time_range_support = True time_range_support = True
safesearch = True safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'} filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
-1
View File
@@ -64,7 +64,6 @@ time_range_support = False
# #
# safesearch : results are identical for safesearch=0 and safesearch=2 # safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = True safesearch = True
# send_accept_language_header = True
def request(query, params): def request(query, params):
-2
View File
@@ -19,8 +19,6 @@ about = {
"results": "HTML", "results": "HTML",
} }
send_accept_language_header = True
play_categ = None # apps|movies play_categ = None # apps|movies
base_url = 'https://play.google.com' base_url = 'https://play.google.com'
search_url = base_url + "/store/search?{query}&c={play_categ}" search_url = base_url + "/store/search?{query}&c={play_categ}"
-1
View File
@@ -71,7 +71,6 @@ max_page = 50
language_support = True language_support = True
time_range_support = True time_range_support = True
safesearch = False safesearch = False
send_accept_language_header = True
def request(query: str, params: "OnlineParams") -> None: def request(query: str, params: "OnlineParams") -> None:
-1
View File
@@ -27,7 +27,6 @@ about = {
categories = ['map'] categories = ['map']
paging = False paging = False
language_support = True language_support = True
send_accept_language_header = True
# search-url # search-url
base_url = 'https://nominatim.openstreetmap.org/' base_url = 'https://nominatim.openstreetmap.org/'
+1 -2
View File
@@ -56,7 +56,7 @@ Since the region is already "auto" by default, we only need to set the
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We ``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
have to set these values in both requests we send to Presearch; in the first have to set these values in both requests we send to Presearch; in the first
request to get the request-ID from Presearch and in the final request to get the request to get the request-ID from Presearch and in the final request to get the
result list (see ``send_accept_language_header``). result list.
The time format returned by Presearch varies depending on the language set. The time format returned by Presearch varies depending on the language set.
Multiple different formats can be supported by using ``dateutil`` parser, but Multiple different formats can be supported by using ``dateutil`` parser, but
@@ -86,7 +86,6 @@ about = {
paging = True paging = True
safesearch = True safesearch = True
time_range_support = True time_range_support = True
send_accept_language_header = True
categories = ["general", "web"] # general, images, videos, news categories = ["general", "web"] # general, images, videos, news
# HTTP2 requests immediately get blocked by a CAPTCHA # HTTP2 requests immediately get blocked by a CAPTCHA
-3
View File
@@ -82,9 +82,6 @@ max_page = 5
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper """5 pages maximum (``&p=5``): Trying to do more just results in an improper
redirect""" redirect"""
# Otherwise Qwant will return 403 if not set
send_accept_language_header = True
qwant_categ = None qwant_categ = None
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``""" """One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
+20 -37
View File
@@ -49,10 +49,15 @@ W3C recommends subtag over macrolanguage [2]_.
Startpage languages Startpage languages
=================== ===================
:py:obj:`send_accept_language_header`: HTTP ``Accept-Language`` header (``send_accept_language_header``):
The displayed name in Startpage's settings page depends on the location of the The displayed name in Startpage's settings page depends on the location of the
IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits` IP when ``Accept-Language`` HTTP header is unset.
we use::
Startpage tries to guess the user's language and territory from the HTTP
``Accept-Language`` header. Optionally, the user can select a search language
(which can differ from the UI language) and a region filter.
In :py:obj:`fetch_traits` we use::
'Accept-Language': "en-US,en;q=0.5", 'Accept-Language': "en-US,en;q=0.5",
.. ..
@@ -112,12 +117,6 @@ startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`. """Startpage's category, visit :ref:`startpage categories`.
""" """
send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language``. Optional the user can select a search-language (can be
different to the UI language) and a region filter.
"""
# engine dependent config # engine dependent config
categories = ['general', 'web'] categories = ['general', 'web']
paging = True paging = True
@@ -163,14 +162,14 @@ def init(_):
# hint: all three startpage engines (WEB, Images & News) can/should use the # hint: all three startpage engines (WEB, Images & News) can/should use the
# same sc_code .. # same sc_code ..
CACHE = EngineCache("startpage") # type:ignore CACHE = EngineCache("startpage")
sc_code_cache_sec = 3600 sc_code_cache_sec = 3600
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`.""" """Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
def get_sc_code(searxng_locale, params): def get_sc_code(params):
"""Get an actual ``sc`` argument from Startpage's search form (HTML page). """Get an actual ``sc`` argument from Startpage's search form (HTML page).
Startpage puts a ``sc`` argument on every HTML :py:obj:`search form Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
@@ -183,30 +182,14 @@ def get_sc_code(searxng_locale, params):
:py:obj:`sc_code_cache_sec` seconds.""" :py:obj:`sc_code_cache_sec` seconds."""
sc_code = CACHE.get("SC_CODE") sc_code = CACHE.get("SC_CODE")
if sc_code: if sc_code:
logger.debug("get_sc_code: using cached value: %s", sc_code) logger.debug("get_sc_code: using cached value: %s", sc_code)
return sc_code return sc_code
headers = {**params['headers']} get_sc_url = base_url + "/"
# add Accept-Language header
if searxng_locale == 'all':
searxng_locale = 'en-US'
locale = babel.Locale.parse(searxng_locale, sep='-')
if send_accept_language_header:
ac_lang = locale.language
if locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
locale.language,
locale.territory,
locale.language,
)
headers['Accept-Language'] = ac_lang
get_sc_url = base_url + '/'
logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url) logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
headers = {**params['headers']}
logger.debug("get_sc_code: request headers: %s", headers) logger.debug("get_sc_code: request headers: %s", headers)
resp = get(get_sc_url, headers=headers) resp = get(get_sc_url, headers=headers)
@@ -214,19 +197,19 @@ def get_sc_code(searxng_locale, params):
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21 # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
raise SearxEngineCaptchaException( raise SearxEngineCaptchaException(
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha", message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
) )
dom = lxml.html.fromstring(resp.text) # type: ignore dom = lxml.html.fromstring(resp.text)
try: try:
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0] sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
except IndexError as exc: except IndexError as exc:
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695") logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
raise SearxEngineCaptchaException( raise SearxEngineCaptchaException(
message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url, # type: ignore message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,
) from exc ) from exc
sc_code = str(sc_code) sc_code = str(sc_code)
@@ -259,7 +242,7 @@ def request(query, params):
'query': query, 'query': query,
'cat': startpage_categ, 'cat': startpage_categ,
't': 'device', 't': 'device',
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers 'sc': get_sc_code(params),
'with_date': time_range_dict.get(params['time_range'], ''), 'with_date': time_range_dict.get(params['time_range'], ''),
'abp': '1', 'abp': '1',
'abd': '1', 'abd': '1',
@@ -437,10 +420,10 @@ def fetch_traits(engine_traits: EngineTraits):
} }
resp = get('https://www.startpage.com/do/settings', headers=headers) resp = get('https://www.startpage.com/do/settings', headers=headers)
if not resp.ok: # type: ignore if not resp.ok:
print("ERROR: response from Startpage is not OK.") print("ERROR: response from Startpage is not OK.")
dom = lxml.html.fromstring(resp.text) # type: ignore dom = lxml.html.fromstring(resp.text)
# regions # regions
@@ -453,7 +436,7 @@ def fetch_traits(engine_traits: EngineTraits):
continue continue
babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway
if '-' in babel_region_tag: if '-' in babel_region_tag: # pyright: ignore[reportOperatorIssue]
l, r = babel_region_tag.split('-') l, r = babel_region_tag.split('-')
r = r.split('_')[-1] r = r.split('_')[-1]
sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_')) sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
+1 -5
View File
@@ -79,10 +79,6 @@ display_type = ["infobox"]
one will add a hit to the result list. The first one will show a hit in the one will add a hit to the result list. The first one will show a hit in the
info box. Both values can be set, or one of the two can be set.""" info box. Both values can be set, or one of the two can be set."""
send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""
list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias' list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_ """`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
""" """
@@ -102,7 +98,7 @@ rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
previews (fka. Hovercards, aka. Popups) on the web and link previews in the previews (fka. Hovercards, aka. Popups) on the web and link previews in the
apps. apps.
HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`): HTTP ``Accept-Language`` header (``send_accept_language_header``):
The desired language variant code for wikis where LanguageConverter_ is The desired language variant code for wikis where LanguageConverter_ is
enabled. enabled.
-1
View File
@@ -33,7 +33,6 @@ about = {
categories = ['general', 'web'] categories = ['general', 'web']
paging = True paging = True
time_range_support = True time_range_support = True
# send_accept_language_header = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'} time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'}
safesearch_dict = {0: 'p', 1: 'i', 2: 'r'} safesearch_dict = {0: 'p', 1: 'i', 2: 'r'}
+14 -11
View File
@@ -141,28 +141,31 @@ class OnlineProcessor(EngineProcessor):
params: OnlineParams = {**default_request_params(), **base_params} params: OnlineParams = {**default_request_params(), **base_params}
headers = params["headers"] headers = params["headers"]
headers["Accept-Encoding"] = "gzip, deflate"
headers["Cache-Control"] = "no-cache"
headers["DNT"] = "1"
headers["Connection"] = "keep-alive"
# add a user agent # add a user agent
headers["User-Agent"] = gen_useragent() headers["User-Agent"] = gen_useragent()
# add Accept-Language header # add Accept-Language header
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Accept-Language
headers["Accept-Language"] = "en,en-US;q=0.7,en;q=0.3"
if self.engine.send_accept_language_header and search_query.locale: if self.engine.send_accept_language_header and search_query.locale:
ac_lang = search_query.locale.language _l = search_query.locale.language
if search_query.locale.territory: _t = search_query.locale.territory or _l
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % ( headers["Accept-Language"] = f"{_l},{_l}-{_t};q=0.7,en;q=0.3"
search_query.locale.language,
search_query.locale.territory,
search_query.locale.language,
)
headers["Accept-Language"] = ac_lang
self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", "")) self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
# https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header # https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header
headers["Sec-Fetch-Dest"] = "empty" headers["Sec-Fetch-Dest"] = "document"
headers["Sec-Fetch-Mode"] = "cors" headers["Sec-Fetch-Mode"] = "navigate"
headers["Sec-Fetch-Site"] = "same-origin" headers["Sec-Fetch-Site"] = "same-origin"
headers["Sec-Fetch-User"] = "?1" headers["Sec-Fetch-User"] = "?1"
headers["Sec-GPC"] = "1" # Sec-GPC is in an experimental state (FFox only)
# headers["Sec-GPC"] = "1"
return params return params