[mod] online engines - set common HTTP headers

The online engines emulate a request as it would come from a web browser, which
is why the HTTP headers in the default settings should also be set the way a
standard web browser would set them.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser
2026-02-20 15:04:59 +01:00
committed by Markus Heiser
parent cc39cf7df3
commit 490f28f0a6
18 changed files with 41 additions and 75 deletions
+3 -3
View File
@@ -298,9 +298,9 @@ class Engine(abc.ABC): # pylint: disable=too-few-public-methods
"""Using tor proxy (``true``) or not (``false``) for this engine."""
send_accept_language_header: bool
"""When this option is activated, the language (locale) that is selected by
the user is used to build and send an ``Accept-Language`` header in the
request to the origin search engine."""
"""When this option is activated (default), the language (locale) that is
selected by the user is used to build and send an ``Accept-Language`` header
in the request to the origin search engine."""
tokens: list[str]
"""A list of secret tokens to make this engine *private*, more details see
+1 -1
View File
@@ -43,7 +43,7 @@ ENGINE_DEFAULT_ARGS: dict[str, int | str | list[t.Any] | dict[str, t.Any] | bool
"inactive": False,
"about": {},
"using_tor_proxy": False,
"send_accept_language_header": False,
"send_accept_language_header": True,
"tokens": [],
"max_page": 0,
}
-1
View File
@@ -52,7 +52,6 @@ about = {
categories = []
paging = True
send_accept_language_header = True
results_per_page = 10
base_url = "https://stock.adobe.com"
-1
View File
@@ -172,7 +172,6 @@ the UI of Brave the user gets warned about this; since we cannot warn the user
in SearXNG, spellchecking is disabled by default.
"""
send_accept_language_header = True
paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All) and in the goggles category."""
+1 -1
View File
@@ -32,7 +32,7 @@ if t.TYPE_CHECKING:
engine_type = "online"
send_accept_language_header = True
# send_accept_language_header = False
categories = ["general"]
disabled = True
timeout = 2.0
-1
View File
@@ -31,7 +31,6 @@ about = {
"results": 'JSON',
}
send_accept_language_header = True
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
-2
View File
@@ -27,8 +27,6 @@ about = {
"results": "JSON",
}
send_accept_language_header = True
# engine dependent config
categories = ["weather"]
base_url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
-1
View File
@@ -44,7 +44,6 @@ max_page = 50
time_range_support = True
safesearch = True
send_accept_language_header = True
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
-1
View File
@@ -64,7 +64,6 @@ time_range_support = False
#
# safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = True
# send_accept_language_header = True
def request(query, params):
-2
View File
@@ -19,8 +19,6 @@ about = {
"results": "HTML",
}
send_accept_language_header = True
play_categ = None # apps|movies
base_url = 'https://play.google.com'
search_url = base_url + "/store/search?{query}&c={play_categ}"
-1
View File
@@ -71,7 +71,6 @@ max_page = 50
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True
def request(query: str, params: "OnlineParams") -> None:
-1
View File
@@ -27,7 +27,6 @@ about = {
categories = ['map']
paging = False
language_support = True
send_accept_language_header = True
# search-url
base_url = 'https://nominatim.openstreetmap.org/'
+1 -2
View File
@@ -56,7 +56,7 @@ Since the region is already "auto" by default, we only need to set the
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
have to set these values in both requests we send to Presearch; in the first
request to get the request-ID from Presearch and in the final request to get the
result list (see ``send_accept_language_header``).
result list.
The time format returned by Presearch varies depending on the language set.
Multiple different formats can be supported by using the ``dateutil`` parser, but
@@ -86,7 +86,6 @@ about = {
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
categories = ["general", "web"] # general, images, videos, news
# HTTP2 requests immediately get blocked by a CAPTCHA
-3
View File
@@ -82,9 +82,6 @@ max_page = 5
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper
redirect"""
# Otherwise Qwant will return 403 if not set
send_accept_language_header = True
qwant_categ = None
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
+20 -37
View File
@@ -49,10 +49,15 @@ W3C recommends subtag over macrolanguage [2]_.
Startpage languages
===================
:py:obj:`send_accept_language_header`:
HTTP ``Accept-Language`` header (``send_accept_language_header``):
The displayed name on Startpage's settings page depends on the location of the
IP when the ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
we use::
IP when ``Accept-Language`` HTTP header is unset.
Startpage tries to guess the user's language and territory from the HTTP
``Accept-Language`` header. Optionally, the user can select a search language
(which can differ from the UI language) and a region filter.
In :py:obj:`fetch_traits` we use::
'Accept-Language': "en-US,en;q=0.5",
..
@@ -112,12 +117,6 @@ startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""
send_accept_language_header = True
"""Startpage tries to guess the user's language and territory from the HTTP
``Accept-Language`` header. Optionally, the user can select a search language
(which can differ from the UI language) and a region filter.
"""
# engine dependent config
categories = ['general', 'web']
paging = True
@@ -163,14 +162,14 @@ def init(_):
# hint: all three startpage engines (WEB, Images & News) can/should use the
# same sc_code ..
CACHE = EngineCache("startpage") # type:ignore
CACHE = EngineCache("startpage")
sc_code_cache_sec = 3600
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
def get_sc_code(searxng_locale, params):
def get_sc_code(params):
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
@@ -183,30 +182,14 @@ def get_sc_code(searxng_locale, params):
:py:obj:`sc_code_cache_sec` seconds."""
sc_code = CACHE.get("SC_CODE")
if sc_code:
logger.debug("get_sc_code: using cached value: %s", sc_code)
return sc_code
headers = {**params['headers']}
# add Accept-Language header
if searxng_locale == 'all':
searxng_locale = 'en-US'
locale = babel.Locale.parse(searxng_locale, sep='-')
if send_accept_language_header:
ac_lang = locale.language
if locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
locale.language,
locale.territory,
locale.language,
)
headers['Accept-Language'] = ac_lang
get_sc_url = base_url + '/'
get_sc_url = base_url + "/"
logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
headers = {**params['headers']}
logger.debug("get_sc_code: request headers: %s", headers)
resp = get(get_sc_url, headers=headers)
@@ -214,19 +197,19 @@ def get_sc_code(searxng_locale, params):
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
raise SearxEngineCaptchaException(
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
)
dom = lxml.html.fromstring(resp.text) # type: ignore
dom = lxml.html.fromstring(resp.text)
try:
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
except IndexError as exc:
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
raise SearxEngineCaptchaException(
message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url, # type: ignore
message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,
) from exc
sc_code = str(sc_code)
@@ -259,7 +242,7 @@ def request(query, params):
'query': query,
'cat': startpage_categ,
't': 'device',
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers
'sc': get_sc_code(params),
'with_date': time_range_dict.get(params['time_range'], ''),
'abp': '1',
'abd': '1',
@@ -437,10 +420,10 @@ def fetch_traits(engine_traits: EngineTraits):
}
resp = get('https://www.startpage.com/do/settings', headers=headers)
if not resp.ok: # type: ignore
if not resp.ok:
print("ERROR: response from Startpage is not OK.")
dom = lxml.html.fromstring(resp.text) # type: ignore
dom = lxml.html.fromstring(resp.text)
# regions
@@ -453,7 +436,7 @@ def fetch_traits(engine_traits: EngineTraits):
continue
babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway
if '-' in babel_region_tag:
if '-' in babel_region_tag: # pyright: ignore[reportOperatorIssue]
l, r = babel_region_tag.split('-')
r = r.split('_')[-1]
sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
+1 -5
View File
@@ -79,10 +79,6 @@ display_type = ["infobox"]
one will add a hit to the result list. The first one will show a hit in the
info box. Both values can be set, or one of the two can be set."""
send_accept_language_header = True
"""The HTTP ``Accept-Language`` header is needed for wikis where
LanguageConverter_ is enabled."""
list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
"""
@@ -102,7 +98,7 @@ rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
previews (fka. Hovercards, aka. Popups) on the web and link previews in the
apps.
HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
HTTP ``Accept-Language`` header (``send_accept_language_header``):
The desired language variant code for wikis where LanguageConverter_ is
enabled.
-1
View File
@@ -33,7 +33,6 @@ about = {
categories = ['general', 'web']
paging = True
time_range_support = True
# send_accept_language_header = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'}
safesearch_dict = {0: 'p', 1: 'i', 2: 'r'}
+14 -11
View File
@@ -141,28 +141,31 @@ class OnlineProcessor(EngineProcessor):
params: OnlineParams = {**default_request_params(), **base_params}
headers = params["headers"]
headers["Accept-Encoding"] = "gzip, deflate"
headers["Cache-Control"] = "no-cache"
headers["DNT"] = "1"
headers["Connection"] = "keep-alive"
# add an user agent
headers["User-Agent"] = gen_useragent()
# add Accept-Language header
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Accept-Language
headers["Accept-Language"] = "en,en-US;q=0.7,en;q=0.3"
if self.engine.send_accept_language_header and search_query.locale:
ac_lang = search_query.locale.language
if search_query.locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
search_query.locale.language,
search_query.locale.territory,
search_query.locale.language,
)
headers["Accept-Language"] = ac_lang
_l = search_query.locale.language
_t = search_query.locale.territory or _l
headers["Accept-Language"] = f"{_l},{_l}-{_t};q=0.7,en;q=0.3"
self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
# https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header
headers["Sec-Fetch-Dest"] = "empty"
headers["Sec-Fetch-Mode"] = "cors"
headers["Sec-Fetch-Dest"] = "document"
headers["Sec-Fetch-Mode"] = "navigate"
headers["Sec-Fetch-Site"] = "same-origin"
headers["Sec-Fetch-User"] = "?1"
headers["Sec-GPC"] = "1"
# Sec-GPC is in an experimental state (FFox only)
# headers["Sec-GPC"] = "1"
return params