[feat] gmx: detect captchas

[feat] json engine: add option to not send page num on first page
[fix] xpath engine: add missing send_page_num_on_first_page docstring
2026-06-22 09:38:34 +02:00 · 2026-06-05 08:07:30 +02:00 · 2026-06-05 08:04:49 +02:00 · 2026-06-05 08:04:49 +02:00
3 changed files with 18 additions and 2 deletions
@@ -10,10 +10,12 @@ import time
 import typing as t
 from urllib.parse import urlencode
 from lxml import html
 from searx.result_types import EngineResults
 from searx.exceptions import SearxEngineCaptchaException
 from searx.extended_types import SXNG_Response
-from searx.utils import extr, gen_useragent, html_to_text
+from searx.utils import extr, gen_useragent, html_to_text, eval_xpath
 from searx.network import get
 if t.TYPE_CHECKING:
@@ -40,6 +42,11 @@ time_range_map = {"day": "d", "week": "w", "month": "m", "year": "y"}
 def _get_page_hash(query: str, page: int, headers: dict[str, str]) -> str:
    """Fetch a result page and return the hash embedded in its markup.

    Requests ``{base_url}/web/result`` for *query* and *page* using *headers*.
    Raises :py:obj:`SearxEngineCaptchaException` when the response contains an
    element with the id ``spam-messages``; otherwise returns the text found
    between ``&h=`` and ``&t=`` in the response body.
    """
    page_url = f"{base_url}/web/result?q={query}&page={page}"
    response = get(page_url, headers=headers)
    body = response.text

    # an element with the id 'spam-messages' is treated as a captcha
    captcha_nodes = eval_xpath(html.fromstring(body), "//*[@id='spam-messages']")
    if captcha_nodes:
        raise SearxEngineCaptchaException()

    # the text we search for looks like:
    # load("/desk?lang="+eV.p.param['hl']+"&q="+eV['p']['q_encode']+"&page=5&h=aa45603&t=177582576&origin=web&comp=web_serp_pag&p=gmx-com&sp=&lr="+eV.p.param['lr0']+"&mkt="+eV.p.param['mkt0']+"&family="+eV.p.param['familyFilter']+"&fcons="+eV.p.perm.fCons,"google", "eMMO", "eMH","eMP");  # pylint: disable=line-too-long
    return extr(body, "&h=", "&t=")
@@ -20,6 +20,7 @@ Paging:
 - :py:obj:`paging`
 - :py:obj:`page_size`
 - :py:obj:`first_page_num`
 - :py:obj:`send_page_num_on_first_page`
 Time Range:
@@ -169,6 +170,10 @@ number, but an offset.'''
 first_page_num = 1
 '''Number of the first page (usually 0 or 1).'''
 send_page_num_on_first_page = True
 '''Whether to include the page number in the request for the first page.
 This can help if an engine blocks requests that send a page number for the first page.'''
 results_query = ''
 '''JSON query for the list of result items.
@@ -322,10 +327,13 @@ def request(query, params):  # pylint: disable=redefined-outer-name
    if params['safesearch']:
        safe_search = safe_search_map[params['safesearch']]
    pageno = ""
    if send_page_num_on_first_page or params["pageno"] != 1:
        pageno = (params['pageno'] - 1) * page_size + first_page_num
    fp = {  # pylint: disable=invalid-name
        'query': urlencode({'q': query})[2:],
        'lang': lang,
-        'pageno': (params['pageno'] - 1) * page_size + first_page_num,
+        'pageno': pageno,
        'time_range': time_range,
        'safe_search': safe_search,
    }
@@ -22,6 +22,7 @@ Paging:
 - :py:obj:`paging`
 - :py:obj:`page_size`
 - :py:obj:`first_page_num`
 - :py:obj:`send_page_num_on_first_page`
 Time Range:
Author	SHA1	Message	Date
Bnyro	26fa181b84	[feat] gmx: detect captchas	2026-06-05 08:07:30 +02:00
Bnyro	0f35ef7cd6	[feat] json engine: add option to not send page num on first page	2026-06-05 08:04:49 +02:00
Bnyro	b1ae576b2d	[fix] xpath engine: add missing `send_page_num_on_first_page` docstring	2026-06-05 08:04:49 +02:00