From f4c63c8eb03db3b880a5afe986a46dc9737c72e5 Mon Sep 17 00:00:00 2001
From: Bnyro
Date: Mon, 1 Jun 2026 11:01:23 +0200
Subject: [PATCH] [feat] engines: add duckduckgo web engine as alternative to html.duckduckgo.com

html.duckduckgo.com captchas all my IPs very fast. I figured out that
using duckduckgo.com works even if html.duckduckgo.com is captcha-ed,
hence adding support for duckduckgo.com's general web search here.

This implementation fetches the link to the first API page
(i.e. ``links.duckduckgo.com/d.js?...``) from duckduckgo.com and uses the ``n``
parameter of the API to fetch all subsequent pages.

This also means that it's not possible to immediately search for the third
page - the first and the second page would need to be loaded first.

The reason why we can't just normally use the `vqd` value is that the API URLs
require an additional parameter `dp` which seems generated at server-side, so we
can't build it ourselves and must scrape it from the HTML pages.
---
 searx/engines/duckduckgo_web.py | 154 ++++++++++++++++++++++++++++++++
 searx/settings.yml              |   7 ++
 2 files changed, 161 insertions(+)
 create mode 100644 searx/engines/duckduckgo_web.py

diff --git a/searx/engines/duckduckgo_web.py b/searx/engines/duckduckgo_web.py
new file mode 100644
index 000000000..e58378ba1
--- /dev/null
+++ b/searx/engines/duckduckgo_web.py
@@ -0,0 +1,154 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""DuckDuckGo Web (general)
+
+This implementation fetches the link to the first API page
+(i.e. ``links.duckduckgo.com/d.js?...``) from duckduckgo.com and uses the ``n``
+parameter of the API to fetch all subsequent pages.
+
+This also means that it's not possible to immediately search for the third
+page - the first and the second page would need to be loaded first.
+
+The reason why we can't just normally use the `vqd` value is that the API URLs
+require an additional parameter `dp` which seems generated at server-side, so we
+can't build it ourselves and must scrape it from the HTML pages.
+"""
+
+import typing as t
+
+from urllib.parse import quote_plus
+from lxml import html
+
+from searx.utils import html_to_text, gen_useragent, extract_text, eval_xpath
+from searx.result_types import EngineResults
+from searx.enginelib import EngineCache
+from searx.network import get
+
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
+
+about = {
+    "website": "https://duckduckgo.com/",
+    "wikidata_id": "Q12805",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "JSON",
+}
+
+# engine dependent config
+categories = ["general"]
+paging = True
+_HTTP_User_Agent: str = gen_useragent()
+
+base_url = "https://duckduckgo.com"
+
+CACHE: EngineCache
+"""Cache to store the API URLs for combinations of (query, page)."""
+
+
+def setup(engine_settings: dict[str, str]):
+    global CACHE  # pylint:disable=global-statement
+    CACHE = EngineCache(engine_settings["name"])
+    return CACHE
+
+
+def _fetch_first_page_link(
+    query: str,
+    headers: dict[str, str],
+):
+    """Search for a::
+
+ str:
+    return f"nextpage_url|{query}|{pageno}"
+
+
+def request(query: str, params: "OnlineParams") -> None:
+
+    if len(query) >= 500:
+        # DDG does not accept queries with more than 499 chars
+        params["url"] = None
+        return
+
+    headers = params["headers"]
+
+    # The vqd value is generated from the query and the UA header. To be able
+    # to reuse the vqd value, the UA header must be static.
+    headers["User-Agent"] = _HTTP_User_Agent
+    headers["Accept"] = "*/*"
+    headers["Referer"] = f"{base_url}/"
+    headers["Host"] = "duckduckgo.com"
+
+    # Sec-Fetch headers are required to not get blocked when sending a Firefox user agent
+    headers["Sec-Fetch-Dest"] = "script"
+    headers["Sec-Fetch-Mode"] = "no-cors"
+    headers["Sec-Fetch-Site"] = "same-site"
+
+    # page 1: scrape the API link; later pages: URL cached by response() of the previous page
+    if params["pageno"] > 1:
+        api_url = CACHE.get(_cache_key(query, params["pageno"]))
+    else:
+        api_url = _fetch_first_page_link(query, headers)
+
+    if not api_url:
+        params["url"] = None
+        return
+
+    params["url"] = api_url.replace("/d.js?", "/d.js?o=json&")
+
+    # TODO: support safesearch, timerange and engine traits  # pylint:disable=fixme
+
+
+def response(resp: "SXNG_Response"):
+    res = EngineResults()
+    res_json = resp.json()
+
+    for result in res_json["results"]:
+        if "u" not in result:
+            continue
+
+        res.add(res.types.MainResult(url=result["u"], title=result["t"], content=html_to_text(result["a"])))
+
+    # link to next page: read from "n" of the last entry, an empty result list has no last entry
+    next_page_path = res_json["results"][-1].get("n") if res_json["results"] else None
+    if next_page_path:
+        CACHE.set(
+            _cache_key(resp.search_params["query"], resp.search_params["pageno"] + 1),
+            base_url + next_page_path,
+            expire=60 * 60,
+        )
+
+    return res
diff --git a/searx/settings.yml b/searx/settings.yml
index 58b23ebee..3f6c2e2f0 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -803,10 +803,17 @@ engines:
     display_type: ["infobox"]
     categories: [general]
 
+  # duckduckgo uses html.duckduckgo.com,
+  # duckduckgo web uses duckduckgo.com
   - name: duckduckgo
     engine: duckduckgo
     shortcut: ddg
 
+  - name: duckduckgo web
+    engine: duckduckgo_web
+    shortcut: ddgw
+    disabled: true
+
   - name: duckduckgo images
     engine: duckduckgo_extra
     categories: [images]