[fix] complete overhaul of the DuckDuckGo engines

DDG has reimplemented its bot protection, and the DDG engines "images", "news"
and "videos" no longer work in SearXNG and DDG-Web access often ends with a
CAPTCHA.

Related:

- issue 4824
- https://github.com/ggfevans/searxng/blob/mod-sidecar-harvester/docs/ddg-bot-detection-research.md

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser
2026-02-20 15:33:53 +01:00
committed by Markus Heiser
parent 490f28f0a6
commit 2e6eeb1d79
4 changed files with 628 additions and 405 deletions
+118 -47
View File
@@ -4,84 +4,155 @@ DuckDuckGo Extra (images, videos, news)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""
import typing as t
from datetime import datetime
from urllib.parse import urlencode
from searx.utils import get_embeded_stream_url, html_to_text
from urllib.parse import quote_plus
from searx.utils import get_embeded_stream_url, html_to_text, gen_useragent, extr
from searx.network import get # see https://github.com/searxng/searxng/issues/762
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang, get_vqd
from searx.engines.duckduckgo import get_ddg_lang, get_vqd, set_vqd
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"website": "https://duckduckgo.com/",
"wikidata_id": "Q12805",
"use_official_api": False,
"require_api_key": False,
"results": 'JSON (site requires js to get images)',
"results": "JSON (site requires js to get images)",
}
# engine dependent config
categories = ['images', 'web']
ddg_category = 'images'
categories = []
ddg_category = ""
"""The category must be any of ``images``, ``videos`` and ``news``
"""
paging = True
safesearch = True
send_accept_language_header = True
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
safesearch_args = {0: '1', 1: None, 2: '1'}
safesearch_cookies = {0: "-2", 1: None, 2: "1"}
safesearch_args = {0: "1", 1: None, 2: "1"}
search_path_map = {'images': 'i', 'videos': 'v', 'news': 'news'}
search_path_map = {"images": "i", "videos": "v", "news": "news"}
_HTTP_User_Agent: str = gen_useragent()
def request(query, params):
eng_region: str = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore
def init(engine_settings: dict[str, t.Any]):
# request needs a vqd argument
vqd = get_vqd(query, eng_region, force_request=True)
if engine_settings["ddg_category"] not in ["images", "videos", "news"]:
raise ValueError(f"Unsupported DuckDuckGo category: {engine_settings['ddg_category']}")
if not vqd:
# some search terms do not have results and therefore no vqd value
params['url'] = None
return params
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
def fetch_vqd(
query: str,
params: "OnlineParams",
):
args = {
'q': query,
'o': 'json',
# 'u': 'bing',
'l': eng_region,
'f': ',,,,,',
'vqd': vqd,
logger.debug("fetch_vqd: request value from from duckduckgo.com")
resp = get(
url=f"https://duckduckgo.com/?q={quote_plus(query)}&iar=images&t=h_",
headers=params["headers"],
timeout=2,
)
value = ""
if resp.status_code == 200:
value = extr(resp.text, 'vqd="', '"')
if value:
logger.debug("vqd value from duckduckgo.com request: '%s'", value)
else:
logger.error("vqd: can't parse value from ddg response (return empty string)")
return ""
else:
logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
if value:
set_vqd(query=query, value=value, params=params)
else:
logger.error("none vqd value from duckduckgo.com: HTTP %s", resp.status_code)
return value
def request(query: str, params: "OnlineParams") -> None:
if len(query) >= 500:
# DDG does not accept queries with more than 499 chars
params["url"] = None
return
if params["pageno"] > 1 and ddg_category in ["videos", "news"]:
# DDG has limited results for videos and news and we got already all
# results from the first request.
params["url"] = None
return
# HTTP headers
# ============
headers = params["headers"]
# The vqd value is generated from the query and the UA header. To be able to
# reuse the vqd value, the UA header must be static.
headers["User-Agent"] = _HTTP_User_Agent
vqd = get_vqd(query=query, params=params) or fetch_vqd(query=query, params=params)
# Sec-Fetch-* headers are already set by the ``OnlineProcessor``.
# Overwrite the default Sec-Fetch-* headers to fit to a XHTMLRequest
headers["Sec-Fetch-Dest"] = "empty"
headers["Sec-Fetch-Mode"] = "cors"
headers["Accept"] = "*/*"
headers["Referer"] = "https://duckduckgo.com/"
headers["Host"] = "duckduckgo.com"
# headers["X-Requested-With"] = "XMLHttpRequest"
# DDG XHTMLRequest
# ================
eng_region: str = traits.get_region(
params["searxng_locale"],
traits.all_locale,
) # pyright: ignore[reportAssignmentType]
eng_lang: str = get_ddg_lang(traits, params["searxng_locale"]) or "wt-wt"
args: dict[str, str | int] = {
"o": "json",
"q": query,
"u": "bing",
"l": eng_region,
"bpia": "1",
"vqd": vqd,
"a": "h_",
}
if params['pageno'] > 1:
args['s'] = (params['pageno'] - 1) * 100
params["cookies"]["ad"] = eng_lang # zh_CN
params["cookies"]["ah"] = eng_region # "us-en,de-de"
params["cookies"]["l"] = eng_region # "hk-tzh"
params['cookies']['ad'] = eng_lang # zh_CN
params['cookies']['ah'] = eng_region # "us-en,de-de"
params['cookies']['l'] = eng_region # "hk-tzh"
args["ct"] = "EN"
if params["searxng_locale"] != "all":
args["ct"] = params["searxng_locale"].split("-")[0].upper()
safe_search = safesearch_cookies.get(params['safesearch'])
if params["pageno"] > 1:
args["s"] = (params["pageno"] - 1) * 100
safe_search = safesearch_cookies.get(params["safesearch"])
if safe_search is not None:
params['cookies']['p'] = safe_search # "-2", "1"
safe_search = safesearch_args.get(params['safesearch'])
if safe_search is not None:
args['p'] = safe_search # "-1", "1"
params["cookies"]["p"] = safe_search # "-2", "1"
args["p"] = safe_search
logger.debug("cookies: %s", params['cookies'])
params["url"] = f"https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}"
params['url'] = f'https://duckduckgo.com/{search_path_map[ddg_category]}.js?{urlencode(args)}'
# sending these two headers prevents rate limiting for the query
params['headers'] = {
'Referer': 'https://duckduckgo.com/',
'X-Requested-With': 'XMLHttpRequest',
}
return params
logger.debug("param headers: %s", params["headers"])
logger.debug("param data: %s", params["data"])
logger.debug("param cookies: %s", params["cookies"])
def _image_result(result):