From 0c284b5b098e2803fe9a39f248a95d0fb7691763 Mon Sep 17 00:00:00 2001
From: Zhijie He
Date: Tue, 24 Feb 2026 20:54:20 +0800
Subject: [PATCH] [mod] sogou: support published date, redirect URLs, ...

---
 searx/engines/sogou.py | 111 +++++++++++++++++++++++++++++++++--------
 1 file changed, 91 insertions(+), 20 deletions(-)

diff --git a/searx/engines/sogou.py b/searx/engines/sogou.py
index 6fb970243..ea011d970 100644
--- a/searx/engines/sogou.py
+++ b/searx/engines/sogou.py
@@ -1,9 +1,12 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Sogou search engine for searxng"""
 
+import re
+from datetime import datetime
 from urllib.parse import urlencode
 from lxml import html
 
+from searx.exceptions import SearxEngineCaptchaException
 from searx.utils import extract_text
 
 # Metadata
@@ -21,7 +24,12 @@ categories = ["general"]
 paging = True
 time_range_support = True
 
-time_range_dict = {'day': 'inttime_day', 'week': 'inttime_week', 'month': 'inttime_month', 'year': 'inttime_year'}
+time_range_dict = {
+    "day": "inttime_day",
+    "week": "inttime_week",
+    "month": "inttime_month",
+    "year": "inttime_year",
+}
 
 # Base URL
 base_url = "https://www.sogou.com"
@@ -33,36 +41,99 @@ def request(query, params):
         "page": params["pageno"],
     }
 
-    if time_range_dict.get(params['time_range']):
-        query_params["s_from"] = time_range_dict.get(params['time_range'])
+    if time_range_dict.get(params["time_range"]):
+        query_params["s_from"] = time_range_dict.get(params["time_range"])
         query_params["tsn"] = 1
 
+    params["allow_redirects"] = False
     params["url"] = f"{base_url}/web?{urlencode(query_params)}"
     return params
 
 
 def response(resp):
+    if (
+        resp.status_code == 302
+        and resp.next_request is not None
+        and str(resp.next_request.url).startswith("http://www.sogou.com/antispider")
+    ):
+        raise SearxEngineCaptchaException()
+
     dom = html.fromstring(resp.text)
     results = []
 
-    for item in dom.xpath('//div[contains(@class, "vrwrap")]'):
-        title = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a'))
-        url = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a/@href'))
+    # pylint: disable=line-too-long
+    for item in dom.xpath(
+        '//div[contains(@class, "rb")] | //div[contains(@class, "vrwrap") and not(.//div[contains(@class, "special-wrap")])]'
+    ):
+        item_html = html.tostring(item, encoding="unicode")
 
-        if url.startswith("/link?url="):
-            url = f"{base_url}{url}"
+        if item.xpath('.//h3[@class="pt"]/a'):
+            result = _parse_results(item, item_html)
+        elif item.xpath('.//h3[contains(@class, "vr-title")]/a'):
+            result = _parse_results_with_image(item, item_html)
+        else:
+            continue
 
-        content = extract_text(item.xpath('.//div[contains(@class, "text-layout")]//p[contains(@class, "star-wiki")]'))
-        if not content:
-            content = extract_text(item.xpath('.//div[contains(@class, "fz-mid space-txt")]'))
-
-        if title and url:
-            results.append(
-                {
-                    "title": title,
-                    "url": url,
-                    "content": content,
-                }
-            )
+        if result["title"] and result["url"]:
+            results.append(result)
 
     return results
+
+
+def _extract_url(url, item_html):
+    if url and url.startswith("/link?url="):
+        match = re.search(r'data-url="([^"]+)"', item_html)
+        if match:
+            return match.group(1)
+        return f"{base_url}{url}"
+    return url
+
+
+def _parse_date(text):
+    if text:
+        text = text.strip().lstrip("-").strip()
+        date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", text)
+        if date_match:
+            try:
+                return datetime.strptime(date_match.group(1), "%Y-%m-%d")
+            except (ValueError, TypeError):
+                pass
+    return None
+
+
+def _parse_results(item, item_html):
+    title = extract_text(item.xpath('.//h3[@class="pt"]/a'))
+    content = extract_text(item.xpath('.//div[@class="ft"]'))
+    url = _extract_url(extract_text(item.xpath('.//h3[@class="pt"]/a/@href')), item_html)
+    publishedDate = _parse_date(extract_text(item.xpath(".//cite")))
+    return {
+        "title": title,
+        "url": url,
+        "content": content,
+        "publishedDate": publishedDate,
+    }
+
+
+def _parse_results_with_image(item, item_html):
+    title = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a'))
+    content = extract_text(item.xpath('.//div[contains(@class, "attribute-centent")]'))
+    if not content:
+        content = extract_text(item.xpath('.//div[contains(@class, "fz-mid space-txt")]'))
+    url = _extract_url(extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a/@href')), item_html)
+    publishedDate = _parse_date(extract_text(item.xpath('.//span[@class="cite-date"]')))
+
+    thumbnail = None
+    try:
+        thumbnail_src = extract_text(item.xpath('.//div[contains(@class, "img-layout")]//img/@src'))
+        if thumbnail_src:
+            thumbnail = thumbnail_src.replace("http://", "https://")
+    except (ValueError, TypeError):
+        pass
+
+    return {
+        "title": title,
+        "url": url,
+        "content": content,
+        "publishedDate": publishedDate,
+        "thumbnail": thumbnail,
+    }
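
Note for reviewers (not part of the patch): a minimal sketch of how the new helpers behave, assuming a searxng checkout where the patched searx/engines/sogou.py is importable. The sample markup, data-url value, and URLs below are made up for illustration, not taken from a live Sogou response.

    from searx.engines.sogou import _extract_url, _parse_date

    # _parse_date() pulls the first YYYY-M-D date out of the cite/date text
    # and returns a datetime, or None when no date is present.
    print(_parse_date("- 2024-11-3"))      # -> datetime(2024, 11, 3, 0, 0)
    print(_parse_date("news.sogou.com"))   # -> None

    # _extract_url() resolves Sogou's /link?url= redirect by preferring the
    # data-url attribute embedded in the result's raw HTML; if that attribute
    # is missing it falls back to prefixing base_url, and direct URLs pass
    # through unchanged.
    item_html = '<h3 class="pt"><a href="/link?url=abc" data-url="https://example.com/page">t</a></h3>'
    print(_extract_url("/link?url=abc", item_html))                # -> https://example.com/page
    print(_extract_url("https://example.com/direct", item_html))   # -> https://example.com/direct

Setting params["allow_redirects"] = False in request() is what makes the anti-spider check in response() possible: the 302 to www.sogou.com/antispider is only visible (via resp.next_request) when the redirect is not followed.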