mirror of
https://github.com/searxng/searxng.git
synced 2026-05-07 18:03:51 +02:00
[mod] online engines - set common HTTP headers
The online engines emulate a request as it would come from a web browser, which is why the HTTP headers in the default settings should also be set the way a standard web browser would set them.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
committed by
Markus Heiser
parent
cc39cf7df3
commit
490f28f0a6
@@ -298,9 +298,9 @@ class Engine(abc.ABC): # pylint: disable=too-few-public-methods
|
||||
"""Using tor proxy (``true``) or not (``false``) for this engine."""
|
||||
|
||||
send_accept_language_header: bool
|
||||
"""When this option is activated, the language (locale) that is selected by
|
||||
the user is used to build and send a ``Accept-Language`` header in the
|
||||
request to the origin search engine."""
|
||||
"""When this option is activated (default), the language (locale) that is
|
||||
selected by the user is used to build and send a ``Accept-Language`` header
|
||||
in the request to the origin search engine."""
|
||||
|
||||
tokens: list[str]
|
||||
"""A list of secret tokens to make this engine *private*, more details see
|
||||
|
||||
@@ -43,7 +43,7 @@ ENGINE_DEFAULT_ARGS: dict[str, int | str | list[t.Any] | dict[str, t.Any] | bool
|
||||
"inactive": False,
|
||||
"about": {},
|
||||
"using_tor_proxy": False,
|
||||
"send_accept_language_header": False,
|
||||
"send_accept_language_header": True,
|
||||
"tokens": [],
|
||||
"max_page": 0,
|
||||
}
|
||||
|
||||
@@ -52,7 +52,6 @@ about = {
|
||||
|
||||
categories = []
|
||||
paging = True
|
||||
send_accept_language_header = True
|
||||
results_per_page = 10
|
||||
|
||||
base_url = "https://stock.adobe.com"
|
||||
|
||||
@@ -172,7 +172,6 @@ the UI of Brave the user gets warned about this, since we can not warn the user
|
||||
in SearXNG, the spellchecking is disabled by default.
|
||||
"""
|
||||
|
||||
send_accept_language_header = True
|
||||
paging = False
|
||||
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
|
||||
category All) and in the goggles category."""
|
||||
|
||||
@@ -32,7 +32,7 @@ if t.TYPE_CHECKING:
|
||||
|
||||
|
||||
engine_type = "online"
|
||||
send_accept_language_header = True
|
||||
# send_accept_language_header = False
|
||||
categories = ["general"]
|
||||
disabled = True
|
||||
timeout = 2.0
|
||||
|
||||
@@ -31,7 +31,6 @@ about = {
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
URL = 'https://api.duckduckgo.com/' + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
|
||||
|
||||
|
||||
@@ -27,8 +27,6 @@ about = {
|
||||
"results": "JSON",
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
# engine dependent config
|
||||
categories = ["weather"]
|
||||
base_url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
|
||||
|
||||
@@ -44,7 +44,6 @@ max_page = 50
|
||||
|
||||
time_range_support = True
|
||||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
|
||||
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
|
||||
|
||||
|
||||
@@ -64,7 +64,6 @@ time_range_support = False
|
||||
#
|
||||
# safesearch : results are identical for safesearch=0 and safesearch=2
|
||||
safesearch = True
|
||||
# send_accept_language_header = True
|
||||
|
||||
|
||||
def request(query, params):
|
||||
|
||||
@@ -19,8 +19,6 @@ about = {
|
||||
"results": "HTML",
|
||||
}
|
||||
|
||||
send_accept_language_header = True
|
||||
|
||||
play_categ = None # apps|movies
|
||||
base_url = 'https://play.google.com'
|
||||
search_url = base_url + "/store/search?{query}&c={play_categ}"
|
||||
|
||||
@@ -71,7 +71,6 @@ max_page = 50
|
||||
language_support = True
|
||||
time_range_support = True
|
||||
safesearch = False
|
||||
send_accept_language_header = True
|
||||
|
||||
|
||||
def request(query: str, params: "OnlineParams") -> None:
|
||||
|
||||
@@ -27,7 +27,6 @@ about = {
|
||||
categories = ['map']
|
||||
paging = False
|
||||
language_support = True
|
||||
send_accept_language_header = True
|
||||
|
||||
# search-url
|
||||
base_url = 'https://nominatim.openstreetmap.org/'
|
||||
|
||||
@@ -56,7 +56,7 @@ Since the region is already "auto" by default, we only need to set the
|
||||
``use_local_search_results`` cookie and send the ``Accept-Language`` header. We
|
||||
have to set these values in both requests we send to Presearch; in the first
|
||||
request to get the request-ID from Presearch and in the final request to get the
|
||||
result list (see ``send_accept_language_header``).
|
||||
result list.
|
||||
|
||||
The time format returned by Presearch varies depending on the language set.
|
||||
Multiple different formats can be supported by using ``dateutil`` parser, but
|
||||
@@ -86,7 +86,6 @@ about = {
|
||||
paging = True
|
||||
safesearch = True
|
||||
time_range_support = True
|
||||
send_accept_language_header = True
|
||||
categories = ["general", "web"] # general, images, videos, news
|
||||
|
||||
# HTTP2 requests immediately get blocked by a CAPTCHA
|
||||
|
||||
@@ -82,9 +82,6 @@ max_page = 5
|
||||
"""5 pages maximum (``&p=5``): Trying to do more just results in an improper
|
||||
redirect"""
|
||||
|
||||
# Otherwise Qwant will return 403 if not set
|
||||
send_accept_language_header = True
|
||||
|
||||
qwant_categ = None
|
||||
"""One of ``web-lite`` (or ``web``), ``news``, ``images`` or ``videos``"""
|
||||
|
||||
|
||||
+20
-37
@@ -49,10 +49,15 @@ W3C recommends subtag over macrolanguage [2]_.
|
||||
Startpage languages
|
||||
===================
|
||||
|
||||
:py:obj:`send_accept_language_header`:
|
||||
HTTP ``Accept-Language`` header (``send_accept_language_header``):
|
||||
The displayed name in Startpage's settings page depend on the location of the
|
||||
IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
|
||||
we use::
|
||||
IP when ``Accept-Language`` HTTP header is unset.
|
||||
|
||||
Startpage tries to guess user's language and territory from the HTTP
|
||||
``Accept-Language``. Optional the user can select a search-language (can be
|
||||
different to the UI language) and a region filter.
|
||||
|
||||
In :py:obj:`fetch_traits` we use::
|
||||
|
||||
'Accept-Language': "en-US,en;q=0.5",
|
||||
..
|
||||
@@ -112,12 +117,6 @@ startpage_categ = 'web'
|
||||
"""Startpage's category, visit :ref:`startpage categories`.
|
||||
"""
|
||||
|
||||
send_accept_language_header = True
|
||||
"""Startpage tries to guess user's language and territory from the HTTP
|
||||
``Accept-Language``. Optional the user can select a search-language (can be
|
||||
different to the UI language) and a region filter.
|
||||
"""
|
||||
|
||||
# engine dependent config
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
@@ -163,14 +162,14 @@ def init(_):
|
||||
|
||||
# hint: all three startpage engines (WEB, Images & News) can/should use the
|
||||
# same sc_code ..
|
||||
CACHE = EngineCache("startpage") # type:ignore
|
||||
CACHE = EngineCache("startpage")
|
||||
|
||||
|
||||
sc_code_cache_sec = 3600
|
||||
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
|
||||
|
||||
|
||||
def get_sc_code(searxng_locale, params):
|
||||
def get_sc_code(params):
|
||||
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
|
||||
|
||||
Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
|
||||
@@ -183,30 +182,14 @@ def get_sc_code(searxng_locale, params):
|
||||
:py:obj:`sc_code_cache_sec` seconds."""
|
||||
|
||||
sc_code = CACHE.get("SC_CODE")
|
||||
|
||||
if sc_code:
|
||||
logger.debug("get_sc_code: using cached value: %s", sc_code)
|
||||
return sc_code
|
||||
|
||||
headers = {**params['headers']}
|
||||
|
||||
# add Accept-Language header
|
||||
if searxng_locale == 'all':
|
||||
searxng_locale = 'en-US'
|
||||
locale = babel.Locale.parse(searxng_locale, sep='-')
|
||||
|
||||
if send_accept_language_header:
|
||||
ac_lang = locale.language
|
||||
if locale.territory:
|
||||
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
|
||||
locale.language,
|
||||
locale.territory,
|
||||
locale.language,
|
||||
)
|
||||
headers['Accept-Language'] = ac_lang
|
||||
|
||||
get_sc_url = base_url + '/'
|
||||
get_sc_url = base_url + "/"
|
||||
logger.debug("get_sc_code: querying new sc timestamp @ %s", get_sc_url)
|
||||
|
||||
headers = {**params['headers']}
|
||||
logger.debug("get_sc_code: request headers: %s", headers)
|
||||
resp = get(get_sc_url, headers=headers)
|
||||
|
||||
@@ -214,19 +197,19 @@ def get_sc_code(searxng_locale, params):
|
||||
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
|
||||
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
|
||||
|
||||
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore
|
||||
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
|
||||
raise SearxEngineCaptchaException(
|
||||
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
|
||||
)
|
||||
|
||||
dom = lxml.html.fromstring(resp.text) # type: ignore
|
||||
dom = lxml.html.fromstring(resp.text)
|
||||
|
||||
try:
|
||||
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
|
||||
except IndexError as exc:
|
||||
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
|
||||
raise SearxEngineCaptchaException(
|
||||
message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url, # type: ignore
|
||||
message="get_sc_code: [PR-695] querying new sc timestamp failed! (%s)" % resp.url,
|
||||
) from exc
|
||||
|
||||
sc_code = str(sc_code)
|
||||
@@ -259,7 +242,7 @@ def request(query, params):
|
||||
'query': query,
|
||||
'cat': startpage_categ,
|
||||
't': 'device',
|
||||
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers
|
||||
'sc': get_sc_code(params),
|
||||
'with_date': time_range_dict.get(params['time_range'], ''),
|
||||
'abp': '1',
|
||||
'abd': '1',
|
||||
@@ -437,10 +420,10 @@ def fetch_traits(engine_traits: EngineTraits):
|
||||
}
|
||||
resp = get('https://www.startpage.com/do/settings', headers=headers)
|
||||
|
||||
if not resp.ok: # type: ignore
|
||||
if not resp.ok:
|
||||
print("ERROR: response from Startpage is not OK.")
|
||||
|
||||
dom = lxml.html.fromstring(resp.text) # type: ignore
|
||||
dom = lxml.html.fromstring(resp.text)
|
||||
|
||||
# regions
|
||||
|
||||
@@ -453,7 +436,7 @@ def fetch_traits(engine_traits: EngineTraits):
|
||||
continue
|
||||
babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway
|
||||
|
||||
if '-' in babel_region_tag:
|
||||
if '-' in babel_region_tag: # pyright: ignore[reportOperatorIssue]
|
||||
l, r = babel_region_tag.split('-')
|
||||
r = r.split('_')[-1]
|
||||
sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
|
||||
|
||||
@@ -79,10 +79,6 @@ display_type = ["infobox"]
|
||||
one will add a hit to the result list. The first one will show a hit in the
|
||||
info box. Both values can be set, or one of the two can be set."""
|
||||
|
||||
send_accept_language_header = True
|
||||
"""The HTTP ``Accept-Language`` header is needed for wikis where
|
||||
LanguageConverter_ is enabled."""
|
||||
|
||||
list_of_wikipedias = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
||||
"""`List of all wikipedias <https://meta.wikimedia.org/wiki/List_of_Wikipedias>`_
|
||||
"""
|
||||
@@ -102,7 +98,7 @@ rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
|
||||
previews (fka. Hovercards, aka. Popups) on the web and link previews in the
|
||||
apps.
|
||||
|
||||
HTTP ``Accept-Language`` header (:py:obj:`send_accept_language_header`):
|
||||
HTTP ``Accept-Language`` header (``send_accept_language_header``):
|
||||
The desired language variant code for wikis where LanguageConverter_ is
|
||||
enabled.
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ about = {
|
||||
categories = ['general', 'web']
|
||||
paging = True
|
||||
time_range_support = True
|
||||
# send_accept_language_header = True
|
||||
|
||||
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm'}
|
||||
safesearch_dict = {0: 'p', 1: 'i', 2: 'r'}
|
||||
|
||||
@@ -141,28 +141,31 @@ class OnlineProcessor(EngineProcessor):
|
||||
params: OnlineParams = {**default_request_params(), **base_params}
|
||||
|
||||
headers = params["headers"]
|
||||
headers["Accept-Encoding"] = "gzip, deflate"
|
||||
headers["Cache-Control"] = "no-cache"
|
||||
headers["DNT"] = "1"
|
||||
headers["Connection"] = "keep-alive"
|
||||
|
||||
# add an user agent
|
||||
headers["User-Agent"] = gen_useragent()
|
||||
|
||||
# add Accept-Language header
|
||||
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/Accept-Language
|
||||
|
||||
headers["Accept-Language"] = "en,en-US;q=0.7,en;q=0.3"
|
||||
if self.engine.send_accept_language_header and search_query.locale:
|
||||
ac_lang = search_query.locale.language
|
||||
if search_query.locale.territory:
|
||||
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
|
||||
search_query.locale.language,
|
||||
search_query.locale.territory,
|
||||
search_query.locale.language,
|
||||
)
|
||||
headers["Accept-Language"] = ac_lang
|
||||
_l = search_query.locale.language
|
||||
_t = search_query.locale.territory or _l
|
||||
headers["Accept-Language"] = f"{_l},{_l}-{_t};q=0.7,en;q=0.3"
|
||||
self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
|
||||
|
||||
# https://developer.mozilla.org/en-US/docs/Glossary/Fetch_metadata_request_header
|
||||
headers["Sec-Fetch-Dest"] = "empty"
|
||||
headers["Sec-Fetch-Mode"] = "cors"
|
||||
headers["Sec-Fetch-Dest"] = "document"
|
||||
headers["Sec-Fetch-Mode"] = "navigate"
|
||||
headers["Sec-Fetch-Site"] = "same-origin"
|
||||
headers["Sec-Fetch-User"] = "?1"
|
||||
headers["Sec-GPC"] = "1"
|
||||
# Sec-GPC is in an experimental state (FFox only)
|
||||
# headers["Sec-GPC"] = "1"
|
||||
|
||||
return params
|
||||
|
||||
|
||||
Reference in New Issue
Block a user