mirror of
https://github.com/searxng/searxng.git
synced 2026-05-24 03:44:31 +02:00
[mod] online engines - set common HTTP headers
The online engines emulate a request as it would come from a web browser, which is why the HTTP headers in the default settings should also be set the way a standard web browser would set them.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
committed by
Markus Heiser
parent
cc39cf7df3
commit
490f28f0a6
@@ -298,9 +298,9 @@ class Engine(abc.ABC): # pylint: disable=too-few-public-methods
|
|||||||
"""Using tor proxy (``true``) or not (``false``) for this engine."""
|
"""Using tor proxy (``true``) or not (``false``) for this engine."""
|
||||||
|
|
||||||
send_accept_language_header: bool
|
send_accept_language_header: bool
|
||||||
"""When this option is activated, the language (locale) that is selected by
|
"""When this option is activated (default), the language (locale) that is
|
||||||
the user is used to build and send a ``Accept-Language`` header in the
|
selected by the user is used to build and send a ``Accept-Language`` header
|
||||||
request to the origin search engine."""
|
in the request to the origin search engine."""
|
||||||
|
|
||||||
tokens: list[str]
|
tokens: list[str]
|
||||||
"""A list of secret tokens to make this engine *private*, more details see
|
"""A list of secret tokens to make this engine *private*, more details see
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ ENGINE_DEFAULT_ARGS: dict[str, int | str | list[t.Any] | dict[str, t.Any] | bool
|
|||||||
"inactive": False,
|
"inactive": False,
|
||||||
"about": {},
|
"about": {},
|
||||||
"using_tor_proxy": False,
|
"using_tor_proxy": False,
|
||||||
"send_accept_language_header": False,
|
"send_accept_language_header": True,
|
||||||
"tokens": [],
|
"tokens": [],
|
||||||
"max_page": 0,
|
"max_page": 0,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,7 +52,6 @@ about = {
|
|||||||
|
|
||||||
categories = []
|
categories = []
|
||||||
paging = True
|
paging = True
|
||||||
send_accept_language_header = True
|
|
||||||
results_per_page = 10
|
results_per_page = 10
|
||||||
|
|
||||||
base_url = "https://stock.adobe.com"
|
base_url = "https://stock.adobe.com"
|
||||||
|
|||||||
@@ -172,7 +172,6 @@ the UI of Brave the user gets warned about this, since we can not warn the user
|
|||||||
in SearXNG, the spellchecking is disabled by default.
|
in SearXNG, the spellchecking is disabled by default.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
send_accept_language_header = True
|
|
||||||
paging = False
|
paging = False
|
||||||
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
|
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
|
||||||
category All) and in the goggles category."""
|
category All) and in the goggles category."""
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ if t.TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
engine_type = "online"
|
engine_type = "online"
|
||||||
send_accept_language_header = True
|
# send_accept_language_header = False
|
||||||
categories = ["general"]
|
categories = ["general"]
|
||||||
disabled = True
|
disabled = True
|
||||||
timeout = 2.0
|
timeout = 2.0
|
||||||
|
|||||||
@@ -31,7 +31,6 @@ about = {
|
|||||||
"results": 'JSON',
|
"results": 'JSON',
|
||||||
}
|
}
|
||||||
|
|
||||||
send_accept_language_header = True
|
|
||||||
|
|
||||||
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
|
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
|
||||||
|
|
||||||
|
|||||||
@@ -27,8 +27,6 @@ about = {
|
|||||||
"results": "JSON",
|
"results": "JSON",
|
||||||
}
|
}
|
||||||
|
|
||||||
send_accept_language_header = True
|
|
||||||
|
|
||||||
# engine dependent config
|
# engine dependent config
|
||||||
categories = ["weather"]
|
categories = ["weather"]
|
||||||
base_url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
|
base_url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
|
||||||
|
|||||||
@@ -44,7 +44,6 @@ max_page = 50
|
|||||||
|
|
||||||
time_range_support = True
|
time_range_support = True
|
||||||
safesearch = True
|
safesearch = True
|
||||||
send_accept_language_header = True
|
|
||||||
|
|
||||||
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
|
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
|
||||||
|
|
||||||
|
|||||||
@@ -64,7 +64,6 @@ time_range_support = False
|
|||||||
#
|
#
|
||||||
# safesearch : results are identical for safesearch=0 and safesearch=2
|
# safesearch : results are identical for safesearch=0 and safesearch=2
|
||||||
safesearch = True
|
safesearch = True
|
||||||
# send_accept_language_header = True
|
|
||||||
|
|
||||||
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
|
|||||||
@@ -19,8 +19,6 @@ about = {
|
|||||||
"results": "HTML",
|
"results": "HTML",
|
||||||
}
|
}
|
||||||
|
|
||||||
send_accept_language_header = True
|
|
||||||
|
|
||||||
play_categ = None # apps|movies
|
play_categ = None # apps|movies
|
||||||
base_url = 'https://play.google.com'
|
base_url = 'https://play.google.com'
|
||||||
search_url = base_url + "/store/search?{query}&c={play_categ}"
|
search_url = base_url + "/store/search?{query}&c={play_categ}"
|
||||||
|
|||||||
@@ -71,7 +71,6 @@ max_page = 50
|
|||||||
language_support = True
|
language_support = True
|
||||||
time_range_support = True
|
time_range_support = True
|
||||||
safesearch = False
|
safesearch = False
|
||||||
send_accept_language_header = True
|
|
||||||
|
|
||||||
|
|
||||||
def request(query: str, params: "OnlineParams") -> None:
|
def request(query: str, params: "OnlineParams") -> None:
|
||||||
|
|||||||
@@ -27,7 +27,6 @@ about = {
|
|||||||
categories = ['map']
|
categories = ['map']
|
||||||
paging = False
|
paging = False
|
||||||
language_support = True
|
language_support = True
|
||||||
send_accept_language_header = True
|
|
||||||
|
|
||||||
# search-url
|
# search-url
|
||||||
base_url = 'https://nominatim.openstreetmap.org/'
|
base_url = 'https://nominatim.openstreetmap.org/'
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ Since the region is already "auto" by default, we only need to set the
|
|||||||
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
|
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
|
||||||
have to set these values in both requests we send to Presearch; in the first
|
have to set these values in both requests we send to Presearch; in the first
|
||||||
request to get the request-ID from Presearch and in the final request to get the
|
request to get the request-ID from Presearch and in the final request to get the
|
||||||
result list (see ``send_accept_language_header``).
|
result list.
|
||||||
|
|
||||||
The time format returned by Presearch varies depending on the language set.
|
The time format returned by Presearch varies depending on the language set.
|
||||||
Multiple different formats can be supported by using ``dateutil`` parser, but
|
Multiple different formats can be supported by using ``dateutil`` parser, but
|
||||||
@@ -86,7 +86,6 @@ about = {
|
|||||||
paging = True
|
paging = True
|
||||||
safesearch = True
|
safesearch = True
|
||||||
time_range_support = True
|
time_range_support = True
|
||||||
send_accept_language_header = True
|
|
||||||
categories = ["general", "web"] # general, images, videos, news
|
categories = ["general", "web"] # general, images, videos, news
|
||||||
|
|
||||||
# HTTP2 requests immediately get blocked by a CAPTCHA
|
# HTTP2 requests immediately get blocked by a CAPTCHA
|
||||||
|
|||||||
@@ -82,9 +82,6 @@ max_page = 5
|
|||||||
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper
|
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper
|
||||||
redirect"""
|
redirect"""
|
||||||
|
|
||||||
# Otherwise Qwant will return 403 if not set
|
|
||||||
send_accept_language_header = True
|
|
||||||
|
|
||||||
qwant_categ = None
|
qwant_categ = None
|
||||||
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
|
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
|
||||||
|
|
||||||
|
|||||||
+20
-37
@@ -49,10 +49,15 @@ W3C recommends subtag over macrolanguage [2]_.
|
|||||||
Startpage languages
|
Startpage languages
|
||||||
===================
|
===================
|
||||||
|
|
||||||
:py:obj:`send_accept_language_header`:
|
HTTP ``Accept-Language`` header (``send_accept_language_header``):
|
||||||
The displayed name in Startpage's settings page depend on the location of the
|
The displayed name in Startpage's settings page depend on the location of the
|
||||||
IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
|
IP when ``Accept-Language`` HTTP header is unset.
|
||||||
we use::
|
|
||||||
|
Startpage tries to guess user's language and territory from the HTTP
|
||||||
|
``Accept-Language``. Optional the user can select a search-language (can be
|
||||||
|
different to the UI language) and a region filter.
|
||||||
|
|
||||||
|
In :py:obj:`fetch_traits` we use::
|
||||||
|
|
||||||
'Accept-Language': "en-US,en;q=0.5",
|
'Accept-Language': "en-US,en;q=0.5",
|
||||||
..
|
..
|
||||||
@@ -112,12 +117,6 @@ startpage_categ = 'web'
|
|||||||
"""Startpage's category, visit :ref:`startpage categories`.
|
"""Startpage's category, visit :ref:`startpage categories`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
send_accept_language_header = True
|
|
||||||
"""Startpage tries to guess user's language and territory from the HTTP
|
|
||||||
``Accept-Language``. Optional the user can select a search-language (can be
|
|
||||||
different to the UI language) and a region filter.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# engine dependent config
|
# engine dependent config
|
||||||
categories = ['general', 'web']
|
categories = ['general', 'web']
|
||||||
paging = True
|
paging = True
|
||||||
@@ -163,14 +162,14 @@ def init(_):
|
|||||||
|
|
||||||
# hint: all three startpage engines (WEB, Images & News) can/should use the
|
# hint: all three startpage engines (WEB, Images & News) can/should use the
|
||||||
# same sc_code ..
|
# same sc_code ..
|
||||||
CACHE = EngineCache("startpage") # type:ignore
|
CACHE = EngineCache("startpage")
|
||||||
|
|
||||||
|
|
||||||
sc_code_cache_sec = 3600
|
sc_code_cache_sec = 3600
|
||||||
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
|
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
|
||||||
|
|
||||||
|
|
||||||
def get_sc_code(searxng_locale, params):
|
def get_sc_code(params):
|
||||||
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
|
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
|
||||||
|
|
||||||
Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
|
Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
|
||||||
@@ -183,30 +182,14 @@ def get_sc_code(searxng_locale, params):
|
|||||||
:py:obj:`sc_code_cache_sec` seconds."""
|
:py:obj:`sc_code_cache_sec` seconds."""
|
||||||
|
|
||||||
sc_code = CACHE.get("SC_CODE")
|
sc_code = CACHE.get("SC_CODE")
|
||||||
|
|
||||||
if sc_code:
|
if sc_code:
|
||||||
logger.debug("get_sc_code: using cached value: %s", sc_code)
|
logger.debug("get_sc_code: using cached value: %s", sc_code)
|
||||||
return sc_code
|
return sc_code
|
||||||
|
|
||||||
headers = {**params['headers']}
|
get_sc_url = base_url + "/"
|
||||||
|
|
||||||
# add Accept-Language header
|
|
||||||
if searxng_locale == 'all':
|
|
||||||
searxng_locale = 'en-US'
|
|
||||||
locale = babel.Locale.parse(searxng_locale, sep='-')
|
|
||||||
|
|
||||||
if send_accept_language_header:
|
|
||||||
ac_lang = locale.language
|
|
||||||
if locale.territory:
|
|
||||||
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
|
|
||||||
locale.language,
|
|
||||||
locale.territory,
|
|
||||||
locale.language,
|
|
||||||
)
|
|
||||||
headers['Accept-Language'] = ac_lang
|
|
||||||
|
|
||||||
get_sc_url = base_url + '/'
|
|
||||||
logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
|
logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
|
||||||
|
|
||||||
|
headers = {**params['headers']}
|
||||||
logger.debug("get_sc_code: request headers: %s", headers)
|
logger.debug("get_sc_code: request headers: %s", headers)
|
||||||
resp = get(get_sc_url, headers=headers)
|
resp = get(get_sc_url, headers=headers)
|
||||||
|
|
||||||
@@ -214,19 +197,19 @@ def get_sc_code(searxng_locale, params):
|
|||||||
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
|
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
|
||||||
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
|
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
|
||||||
|
|
||||||
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore
|
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
|
||||||
raise SearxEngineCaptchaException(
|
raise SearxEngineCaptchaException(
|
||||||
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
|
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
|
||||||
)
|
)
|
||||||
|
|
||||||
dom = lxml.html.fromstring(resp.text) # type: ignore
|
dom = lxml.html.fromstring(resp.text)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
|
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
|
||||||
except IndexError as exc:
|
except IndexError as exc:
|
||||||
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
|
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
|
||||||
raise SearxEngineCaptchaException(
|
raise SearxEngineCaptchaException(
|
||||||
message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url, # type: ignore
|
message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,
|
||||||
) from exc
|
) from exc
|
||||||
|
|
||||||
sc_code = str(sc_code)
|
sc_code = str(sc_code)
|
||||||
@@ -259,7 +242,7 @@ def request(query, params):
|
|||||||
'query': query,
|
'query': query,
|
||||||
'cat': startpage_categ,
|
'cat': startpage_categ,
|
||||||
't': 'device',
|
't': 'device',
|
||||||
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers
|
'sc': get_sc_code(params),
|
||||||
'with_date': time_range_dict.get(params['time_range'], ''),
|
'with_date': time_range_dict.get(params['time_range'], ''),
|
||||||
'abp': '1',
|
'abp': '1',
|
||||||
'abd': '1',
|
'abd': '1',
|
||||||
@@ -437,10 +420,10 @@ def fetch_traits(engine_traits: EngineTraits):
|
|||||||
}
|
}
|
||||||
resp = get('https://www.startpage.com/do/settings', headers=headers)
|
resp = get('https://www.startpage.com/do/settings', headers=headers)
|
||||||
|
|
||||||
if not resp.ok: # type: ignore
|
if not resp.ok:
|
||||||
print("ERROR: response from Startpage is not OK.")
|
print("ERROR: response from Startpage is not OK.")
|
||||||
|
|
||||||
dom = lxml.html.fromstring(resp.text) # type: ignore
|
dom = lxml.html.fromstring(resp.text)
|
||||||
|
|
||||||
# regions
|
# regions
|
||||||
|
|
||||||
@@ -453,7 +436,7 @@ def fetch_traits(engine_traits: EngineTraits):
|
|||||||
continue
|
continue
|
||||||
babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway
|
babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway
|
||||||
|
|
||||||
if '-' in babel_region_tag:
|
if '-' in babel_region_tag: # pyright: ignore[reportOperatorIssue]
|
||||||
l, r = babel_region_tag.split('-')
|
l, r = babel_region_tag.split('-')
|
||||||
r = r.split('_')[-1]
|
r = r.split('_')[-1]
|
||||||
sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
|
sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
|
||||||
|
|||||||
@@ -79,10 +79,6 @@ display_type = ["infobox"]
|
|||||||
one will add a hit to the result list. The first one will show a hit in the
|
one will add a hit to the result list. The first one will show a hit in the
|
||||||
info box. Both values can be set, or one of the two can be set."""
|
info box. Both values can be set, or one of the two can be set."""
|
||||||
|
|
||||||
send_accept_language_header = True
|
|
||||||
"""The HTTP ``Accept-Language`` header is needed for wikis where
|
|
||||||
LanguageConverter_ is enabled."""
|
|
||||||
|
|
||||||
list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
||||||
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
|
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
|
||||||
"""
|
"""
|
||||||
@@ -102,7 +98,7 @@ rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
|
|||||||
previews (fka. Hovercards, aka. Popups) on the web and link previews in the
|
previews (fka. Hovercards, aka. Popups) on the web and link previews in the
|
||||||
apps.
|
apps.
|
||||||
|
|
||||||
HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
|
HTTP ``Accept-Language`` header (``send_accept_language_header``):
|
||||||
The desired language variant code for wikis where LanguageConverter_ is
|
The desired language variant code for wikis where LanguageConverter_ is
|
||||||
enabled.
|
enabled.
|
||||||
|
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ about = {
|
|||||||
categories = ['general', 'web']
|
categories = ['general', 'web']
|
||||||
paging = True
|
paging = True
|
||||||
time_range_support = True
|
time_range_support = True
|
||||||
# send_accept_language_header = True
|
|
||||||
|
|
||||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'}
|
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'}
|
||||||
safesearch_dict = {0: 'p', 1: 'i', 2: 'r'}
|
safesearch_dict = {0: 'p', 1: 'i', 2: 'r'}
|
||||||
|
|||||||
@@ -141,28 +141,31 @@ class OnlineProcessor(EngineProcessor):
|
|||||||
params: OnlineParams = {**default_request_params(), **base_params}
|
params: OnlineParams = {**default_request_params(), **base_params}
|
||||||
|
|
||||||
headers = params["headers"]
|
headers = params["headers"]
|
||||||
|
headers["Accept-Encoding"] = "gzip, deflate"
|
||||||
|
headers["Cache-Control"] = "no-cache"
|
||||||
|
headers["DNT"] = "1"
|
||||||
|
headers["Connection"] = "keep-alive"
|
||||||
|
|
||||||
# add an user agent
|
# add an user agent
|
||||||
headers["User-Agent"] = gen_useragent()
|
headers["User-Agent"] = gen_useragent()
|
||||||
|
|
||||||
# add Accept-Language header
|
# add Accept-Language header
|
||||||
|
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Accept-Language
|
||||||
|
|
||||||
|
headers["Accept-Language"] = "en,en-US;q=0.7,en;q=0.3"
|
||||||
if self.engine.send_accept_language_header and search_query.locale:
|
if self.engine.send_accept_language_header and search_query.locale:
|
||||||
ac_lang = search_query.locale.language
|
_l = search_query.locale.language
|
||||||
if search_query.locale.territory:
|
_t = search_query.locale.territory or _l
|
||||||
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
|
headers["Accept-Language"] = f"{_l},{_l}-{_t};q=0.7,en;q=0.3"
|
||||||
search_query.locale.language,
|
|
||||||
search_query.locale.territory,
|
|
||||||
search_query.locale.language,
|
|
||||||
)
|
|
||||||
headers["Accept-Language"] = ac_lang
|
|
||||||
self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
|
self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
|
||||||
|
|
||||||
# https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header
|
# https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header
|
||||||
headers["Sec-Fetch-Dest"] = "empty"
|
headers["Sec-Fetch-Dest"] = "document"
|
||||||
headers["Sec-Fetch-Mode"] = "cors"
|
headers["Sec-Fetch-Mode"] = "navigate"
|
||||||
headers["Sec-Fetch-Site"] = "same-origin"
|
headers["Sec-Fetch-Site"] = "same-origin"
|
||||||
headers["Sec-Fetch-User"] = "?1"
|
headers["Sec-Fetch-User"] = "?1"
|
||||||
headers["Sec-GPC"] = "1"
|
# Sec-GPC is in an experimental state (FFox only)
|
||||||
|
# headers["Sec-GPC"] = "1"
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user