mirror of
https://github.com/searxng/searxng.git
synced 2026-06-03 00:17:16 +02:00
[fix] complete overhaul of the DuckDuckGo engines
DDG has reimplemented its bot protection, and the DDG engines "images", "news" and "videos" no longer work in SearXNG and DDG-Web access often ends with a CAPTCHA. Related: - issue 4824 - https://github.com/ggfevans/searxng/blob/mod-sidecar-harvester/docs/ddg-bot-detection-research.md Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
committed by
Markus Heiser
parent
490f28f0a6
commit
2e6eeb1d79
@@ -4,84 +4,155 @@ DuckDuckGo Extra (images, videos, news)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
"""
|
||||
|
||||
import typing as t
|
||||
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlencode
|
||||
from searx.utils import get_embeded_stream_url, html_to_text
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
from searx.utils import get_embeded_stream_url, html_to_text, gen_useragent, extr
|
||||
from searx.network import get # see https://github.com/searxng/searxng/issues/762
|
||||
|
||||
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
|
||||
from searx.engines.duckduckgo import get_ddg_lang, get_vqd
|
||||
from searx.engines.duckduckgo import get_ddg_lang, get_vqd, set_vqd
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from searx.extended_types import SXNG_Response
|
||||
from searx.search.processors import OnlineParams
|
||||
|
||||
# about
|
||||
about = {
|
||||
"website": 'https://duckduckgo.com/',
|
||||
"wikidata_id": 'Q12805',
|
||||
"website": "https://duckduckgo.com/",
|
||||
"wikidata_id": "Q12805",
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON (site requires js to get images)',
|
||||
"results": "JSON (site requires js to get images)",
|
||||
}
|
||||
|
||||
# engine dependent config
|
||||
categories = ['images', 'web']
|
||||
ddg_category = 'images'
|
||||
categories = []
|
||||
ddg_category = ""
|
||||
"""The category must be any of ``images``, ``videos`` and ``news``
|
||||
"""
|
||||
paging = True
|
||||
safesearch = True
|
||||
send_accept_language_header = True
|
||||
|
||||
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
|
||||
safesearch_args = {0: '1', 1: None, 2: '1'}
|
||||
safesearch_cookies = {0: "-2", 1: None, 2: "1"}
|
||||
safesearch_args = {0: "1", 1: None, 2: "1"}
|
||||
|
||||
search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
|
||||
search_path_map = {"images": "i", "videos": "v", "news": "news"}
|
||||
_HTTP_User_Agent: str = gen_useragent()
|
||||
|
||||
|
||||
def request(query, params):
|
||||
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
|
||||
def init(engine_settings: dict[str, t.Any]):
|
||||
|
||||
# request needs a vqd argument
|
||||
vqd = get_vqd(query, eng_region, force_request=True)
|
||||
if engine_settings["ddg_category"] not in ["images", "videos", "news"]:
|
||||
raise ValueError(f"Unsupported DuckDuckGo category: {engine_settings['ddg_category']}")
|
||||
|
||||
if not vqd:
|
||||
# some search terms do not have results and therefore no vqd value
|
||||
params['url'] = None
|
||||
return params
|
||||
|
||||
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
|
||||
def fetch_vqd(
|
||||
query: str,
|
||||
params: "OnlineParams",
|
||||
):
|
||||
|
||||
args = {
|
||||
'q': query,
|
||||
'o': 'json',
|
||||
# 'u': 'bing',
|
||||
'l': eng_region,
|
||||
'f': ',,,,,',
|
||||
'vqd': vqd,
|
||||
logger.debug("fetch_vqd: request value from from duckduckgo.com")
|
||||
resp = get(
|
||||
url=f"https://duckduckgo.com/?q={quote_plus(query)}&iar=images&t=h_",
|
||||
headers=params["headers"],
|
||||
timeout=2,
|
||||
)
|
||||
|
||||
value = ""
|
||||
if resp.status_code == 200:
|
||||
value = extr(resp.text, 'vqd="', '"')
|
||||
if value:
|
||||
logger.debug("vqd value from duckduckgo.com request: '%s'", value)
|
||||
else:
|
||||
logger.error("vqd: can't parse value from ddg response (return empty string)")
|
||||
return ""
|
||||
else:
|
||||
logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
|
||||
|
||||
if value:
|
||||
set_vqd(query=query, value=value, params=params)
|
||||
else:
|
||||
logger.error("none vqd value from duckduckgo.com: HTTP %s", resp.status_code)
|
||||
return value
|
||||
|
||||
|
||||
def request(query: str, params: "OnlineParams") -> None:
|
||||
|
||||
if len(query) >= 500:
|
||||
# DDG does not accept queries with more than 499 chars
|
||||
params["url"] = None
|
||||
return
|
||||
|
||||
if params["pageno"] > 1 and ddg_category in ["videos", "news"]:
|
||||
# DDG has limited results for videos and news and we got already all
|
||||
# results from the first request.
|
||||
params["url"] = None
|
||||
return
|
||||
|
||||
# HTTP headers
|
||||
# ============
|
||||
|
||||
headers = params["headers"]
|
||||
# The vqd value is generated from the query and the UA header. To be able to
|
||||
# reuse the vqd value, the UA header must be static.
|
||||
headers["User-Agent"] = _HTTP_User_Agent
|
||||
vqd = get_vqd(query=query, params=params) or fetch_vqd(query=query, params=params)
|
||||
|
||||
# Sec-Fetch-* headers are already set by the ``OnlineProcessor``.
|
||||
# Overwrite the default Sec-Fetch-* headers to fit to a XHTMLRequest
|
||||
headers["Sec-Fetch-Dest"] = "empty"
|
||||
headers["Sec-Fetch-Mode"] = "cors"
|
||||
|
||||
headers["Accept"] = "*/*"
|
||||
headers["Referer"] = "https://duckduckgo.com/"
|
||||
headers["Host"] = "duckduckgo.com"
|
||||
# headers["X-Requested-With"] = "XMLHttpRequest"
|
||||
|
||||
# DDG XHTMLRequest
|
||||
# ================
|
||||
|
||||
eng_region: str = traits.get_region(
|
||||
params["searxng_locale"],
|
||||
traits.all_locale,
|
||||
) # pyright: ignore[reportAssignmentType]
|
||||
|
||||
eng_lang: str = get_ddg_lang(traits, params["searxng_locale"]) or "wt-wt"
|
||||
|
||||
args: dict[str, str | int] = {
|
||||
"o": "json",
|
||||
"q": query,
|
||||
"u": "bing",
|
||||
"l": eng_region,
|
||||
"bpia": "1",
|
||||
"vqd": vqd,
|
||||
"a": "h_",
|
||||
}
|
||||
|
||||
if params['pageno'] > 1:
|
||||
args['s'] = (params['pageno'] - 1) * 100
|
||||
params["cookies"]["ad"] = eng_lang # zh_CN
|
||||
params["cookies"]["ah"] = eng_region # "us-en,de-de"
|
||||
params["cookies"]["l"] = eng_region # "hk-tzh"
|
||||
|
||||
params['cookies']['ad'] = eng_lang # zh_CN
|
||||
params['cookies']['ah'] = eng_region # "us-en,de-de"
|
||||
params['cookies']['l'] = eng_region # "hk-tzh"
|
||||
args["ct"] = "EN"
|
||||
if params["searxng_locale"] != "all":
|
||||
args["ct"] = params["searxng_locale"].split("-")[0].upper()
|
||||
|
||||
safe_search = safesearch_cookies.get(params['safesearch'])
|
||||
if params["pageno"] > 1:
|
||||
args["s"] = (params["pageno"] - 1) * 100
|
||||
|
||||
safe_search = safesearch_cookies.get(params["safesearch"])
|
||||
if safe_search is not None:
|
||||
params['cookies']['p'] = safe_search # "-2", "1"
|
||||
safe_search = safesearch_args.get(params['safesearch'])
|
||||
if safe_search is not None:
|
||||
args['p'] = safe_search # "-1", "1"
|
||||
params["cookies"]["p"] = safe_search # "-2", "1"
|
||||
args["p"] = safe_search
|
||||
|
||||
logger.debug("cookies: %s", params['cookies'])
|
||||
params["url"] = f"https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}"
|
||||
|
||||
params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
|
||||
|
||||
# sending these two headers prevents rate limiting for the query
|
||||
params['headers'] = {
|
||||
'Referer': 'https://duckduckgo.com/',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
}
|
||||
|
||||
return params
|
||||
logger.debug("param headers: %s", params["headers"])
|
||||
logger.debug("param data: %s", params["data"])
|
||||
logger.debug("param cookies: %s", params["cookies"])
|
||||
|
||||
|
||||
def _image_result(result):
|
||||
|
||||
Reference in New Issue
Block a user