def detect_google_sorry(resp: "SXNG_Response") -> None:
    """Detect Google's bot-protection responses (CAPTCHA / sorry pages).

    Google may block requests in several ways:

    1. The response URL is on ``sorry.google.com`` or its path starts with
       ``/sorry`` (standard CAPTCHA, the redirect has been followed).
    2. An HTTP redirect whose ``Location`` header points to the sorry page
       (e.g. ``/sorry/index?...``) -- seen when the HTTP client doesn't follow
       the redirect.  Redirects to any other target are *not* treated as a
       CAPTCHA, they are left to the regular response handling.
    3. A short response (fewer than 2000 characters of decoded text) containing
       ``/sorry/`` -- a meta-refresh or JS redirect variant, or the HTML stub
       that accompanies an unfollowed redirect.

    :param resp: Response of the Google request.  Only ``url.host``,
        ``url.path``, ``status_code``, ``headers`` and ``text`` are read.
    :raises SearxEngineCaptchaException: If one of the patterns above matches.
    """

    if resp.url.host == "sorry.google.com" or resp.url.path.startswith("/sorry"):
        raise SearxEngineCaptchaException()

    # A redirect status on its own is no proof of a CAPTCHA: only the target
    # of the redirect tells whether Google is sending the client to the sorry
    # page.  "/sorry" matches the path form (https://www.google.com/sorry/...)
    # as well as the host form (https://sorry.google.com/...).
    if resp.status_code in (301, 302, 303, 307, 308):
        # NOTE(review): assumes resp.headers offers a case-insensitive .get()
        # as httpx.Headers does -- confirm for SXNG_Response.
        location = resp.headers.get("location", "")
        if "/sorry" in location:
            raise SearxEngineCaptchaException()

    text = resp.text
    if len(text) < 2000 and "/sorry/" in text:
        raise SearxEngineCaptchaException()