[feat] engines: add duckduckgo web engine as alternative to html.duckduckgo.com

html.duckduckgo.com captchas all my IPs very fast. I figured out that using duckduckgo.com works even if html.duckduckgo.com is captcha-ed, hence adding support for duckduckgo.com's general web search here. This implementation fetches the link to the first API page (i.e. ``links.duckduckgo.com/d.js?...``) from duckduckgo.com and uses the ``n`` parameter of the API to fetch all subsequent pages. This also means that it's not possible to immediately search for the third page - the first and the second page would need to be loaded first. The reason why we can't just normally use the `vqd` value is that the API URLs require an additional parameter `dp` which seems generated at server-side, so we can't build it ourselves and must scrape it from the HTML pages.
2026-06-11 04:17:50 +02:00 · 2026-06-01 11:01:23 +02:00
parent 26801e92af
commit f4c63c8eb0
2 changed files with 161 additions and 0 deletions
@@ -0,0 +1,154 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """DuckDuckGo Web (general)
 This implementation fetches the link to the first API page
 (i.e. ``links.duckduckgo.com/d.js?...``) from duckduckgo.com and uses the ``n``
 parameter of the API to fetch all subsequent pages.
 This also means that it's not possible to immediately search for the third
 page - the first and the second page would need to be loaded first.
 The reason why we can't just normally use the `vqd` value is that the API URLs
 require an additional parameter `dp` which seems generated at server-side, so we
 can't build it ourselves and must scrape it from the HTML pages.
 """
 import typing as t
 from urllib.parse import quote_plus
 from lxml import html
 from searx.utils import html_to_text, gen_useragent, extract_text, eval_xpath
 from searx.result_types import EngineResults
 from searx.enginelib import EngineCache
 from searx.network import get
 if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams
 about = {
    "website": "https://duckduckgo.com/",
    "wikidata_id": "Q12805",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
 }
 # engine dependent config
 categories = ["general"]
 paging = True
 _HTTP_User_Agent: str = gen_useragent()
 base_url = "https://duckduckgo.com"
 CACHE: EngineCache
 """Cache to store the API URLs for combinations of (query, page)."""
 def setup(engine_settings: dict[str, str]):
    global CACHE  # pylint:disable=global-statement
    CACHE = EngineCache(engine_settings["name"])
    return CACHE
 def _fetch_first_page_link(
    query: str,
    headers: dict[str, str],
 ):
    """Search for a::
        <link id="deep_preload_link" rel="preload" as="script"
              href="https://links.duckduckgo.com/d.js?q=rust&t=D&l=us-en&s=0&a=h_&ct=DE&vqd=VQD_VALUE&bing_market=en-US&p_ent=&ex=-1&dp=LONG_TOKEN
        >
    This points to the first page
    """  # pylint:disable=line-too-long
    cache_key = _cache_key(query, 1)
    cached: str | None = CACHE.get(cache_key)
    if cached:
        return cached
    resp = get(
        url=f"{base_url}/?q={quote_plus(query)}&t=h_&ia=web",
        headers=headers,
        timeout=2,
    )
    if resp.status_code != 200:
        logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
    dom = html.fromstring(resp.text)
    first_page_link = extract_text(eval_xpath(dom, "//link[@id='deep_preload_link']/@href"))
    if not first_page_link:
        logger.error("vqd: failed to load first page JS url from ddg response (return empty string)")
        return ""
    logger.debug("got link to first page from duckduckgo.com request: '%s'", first_page_link)
    CACHE.set(cache_key, first_page_link, expire=7200)
    return first_page_link
 def _cache_key(query: str, pageno: int) -> str:
    return f"nextpage_url|{query}|{pageno}"
 def request(query: str, params: "OnlineParams") -> None:
    if len(query) >= 500:
        # DDG does not accept queries with more than 499 chars
        params["url"] = None
        return
    headers = params["headers"]
    # The vqd value is generated from the query and the UA header.  To be able
    # to reuse the vqd value, the UA header must be static.
    headers["User-Agent"] = _HTTP_User_Agent
    headers["Accept"] = "*/*"
    headers["Referer"] = f"{base_url}/"
    headers["Host"] = "duckduckgo.com"
    # Sec-Fetch headers are required to not get blocked when sending a Firefox user agent
    headers["Sec-Fetch-Dest"] = "script"
    headers["Sec-Fetch-Mode"] = "no-cors"
    headers["Sec-Fetch-Site"] = "same-site"
    api_url = ""
    if params["pageno"] > 1:
        api_url = CACHE.get(_cache_key(query, params["pageno"]))
    else:
        api_url = _fetch_first_page_link(query, headers)
    if not api_url:
        params["url"] = None
        return
    params["url"] = api_url.replace("/d.js?", "/d.js?o=json&")
    # TODO: support safesearch, timerange and engine traits  # pylint:disable=fixme
 def response(resp: "SXNG_Response"):
    res = EngineResults()
    res_json = resp.json()
    for result in res_json["results"]:
        if "u" not in result:
            continue
        res.add(res.types.MainResult(url=result["u"], title=result["t"], content=html_to_text(result["a"])))
    # link to next page
    next_page_path = res_json["results"][-1].get("n")
    if next_page_path:
        CACHE.set(
            _cache_key(resp.search_params["query"], resp.search_params["pageno"] + 1),
            base_url + next_page_path,
            expire=60 * 60,
        )
    return res
@@ -803,10 +803,17 @@ engines:
    display_type: ["infobox"]
    categories: [general]
  # duckduckgo uses html.duckduckgo.com,
  # duckduckgo web uses duckduckgo.com
  - name: duckduckgo
    engine: duckduckgo
    shortcut: ddg
  - name: duckduckgo web
    engine: duckduckgo_web
    shortcut: ddgw
    disabled: true
  - name: duckduckgo images
    engine: duckduckgo_extra
    categories: [images]