diff --git a/searx/engines/google.py b/searx/engines/google.py
index d6f581b99..e0048222b 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -278,10 +278,36 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
     return ret_val
 
 
-def detect_google_sorry(resp):
+def detect_google_sorry(resp: "SXNG_Response") -> None:
+    """Detect Google's bot-protection responses (CAPTCHA / sorry pages).
+
+    Google may block requests in several ways:
+
+    1. Redirect to sorry.google.com (standard CAPTCHA).
+    2. HTTP 3xx redirect (typically 302) whose ``Location`` header points at
+       the sorry page (``/sorry/index?...``) -- seen when the HTTP client
+       doesn't follow the redirect.  Redirects to any other target are not
+       treated as a CAPTCHA.
+    3. Short HTML response (<2000 characters) containing "/sorry/" -- a
+       meta-refresh or JS redirect variant, or the HTML stub that
+       accompanies an unfollowed redirect.
+
+    :raises SearxEngineCaptchaException: if one of the patterns above matches.
+    """
+
     if resp.url.host == "sorry.google.com" or resp.url.path.startswith("/sorry"):
         raise SearxEngineCaptchaException()
 
+    # Only a redirect *to* the sorry page signals a CAPTCHA; testing the
+    # status code alone would misreport unrelated redirects as a block.
+    if 300 <= resp.status_code < 400:
+        location = resp.headers.get("Location", "")
+        if "sorry.google.com" in location or "/sorry/" in location:
+            raise SearxEngineCaptchaException()
+
+    if len(resp.text) < 2000 and "/sorry/" in resp.text:
+        raise SearxEngineCaptchaException()
+
 
 def request(query: str, params: "OnlineParams") -> None:
     """Google search request"""