mirror of
https://github.com/searxng/searxng.git
synced 2026-05-15 23:45:49 +02:00
[fix] google: improve CAPTCHA detection (#5922)
- Detect HTTP 302 responses (Google redirecting to /sorry/index without the HTTP client following the redirect). - Detect short HTML responses (<2000 bytes) containing "/sorry/" links (meta-refresh or JS redirect variants). Instances with rotating IPs can set `suspended_times.SearxEngineCaptcha` to 0 in the search settings [1]; the next request will then typically use a different outgoing IP when rotating proxies are configured. [1] https://docs.searxng.org/admin/settings/settings_search.html
This commit is contained in:
+19
-1
@@ -278,10 +278,28 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
|
||||
return ret_val
|
||||
|
||||
|
||||
def detect_google_sorry(resp):
|
||||
def detect_google_sorry(resp: "SXNG_Response"):
    """Raise when *resp* looks like one of Google's bot-protection answers.

    Three signals are checked, in this order:

    1. The response URL is on host ``sorry.google.com`` or its path starts
       with ``/sorry`` (the redirect to the CAPTCHA page was followed).
    2. The status code is 302.  Every 302 is treated as a block here; the
       redirect target is not inspected.
    3. The decoded body is shorter than 2000 characters and contains the
       substring ``/sorry/`` (e.g. a stub page, meta-refresh or JS redirect
       pointing at the sorry page).

    :param resp: response of the request sent to Google.
    :raises SearxEngineCaptchaException: if any of the signals above matches.
    """
    url = resp.url
    landed_on_sorry_page = url.host == "sorry.google.com" or url.path.startswith("/sorry")
    if landed_on_sorry_page or resp.status_code == 302:
        raise SearxEngineCaptchaException()

    # Only a short body counts: the 2000-character cap limits this substring
    # match to small stub pages rather than full-size responses.
    body = resp.text
    if "/sorry/" in body and len(body) < 2000:
        raise SearxEngineCaptchaException()
|
||||
|
||||
|
||||
def request(query: str, params: "OnlineParams") -> None:
|
||||
"""Google search request"""
|
||||
|
||||
Reference in New Issue
Block a user