[mod] sogou: support published date, redirect URLs, ...

This commit is contained in:
Zhijie He
2026-02-24 20:54:20 +08:00
committed by Bnyro
parent 8e9ed5f9be
commit 0c284b5b09
+91 -20
View File
@@ -1,9 +1,12 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
"""Sogou search engine for searxng""" """Sogou search engine for searxng"""
import re
from datetime import datetime
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from searx.exceptions import SearxEngineCaptchaException
from searx.utils import extract_text from searx.utils import extract_text
# Metadata # Metadata
@@ -21,7 +24,12 @@ categories = ["general"]
paging = True paging = True
time_range_support = True time_range_support = True
time_range_dict = {'day': 'inttime_day', 'week': 'inttime_week', 'month': 'inttime_month', 'year': 'inttime_year'} time_range_dict = {
"day": "inttime_day",
"week": "inttime_week",
"month": "inttime_month",
"year": "inttime_year",
}
# Base URL # Base URL
base_url = "https://www.sogou.com" base_url = "https://www.sogou.com"
@@ -33,36 +41,99 @@ def request(query, params):
"page": params["pageno"], "page": params["pageno"],
} }
if time_range_dict.get(params['time_range']): if time_range_dict.get(params["time_range"]):
query_params["s_from"] = time_range_dict.get(params['time_range']) query_params["s_from"] = time_range_dict.get(params["time_range"])
query_params["tsn"] = 1 query_params["tsn"] = 1
params["allow_redirects"] = False
params["url"] = f"{base_url}/web?{urlencode(query_params)}" params["url"] = f"{base_url}/web?{urlencode(query_params)}"
return params return params
def response(resp): def response(resp):
if (
resp.status_code == 302
and resp.next_request is not None
and str(resp.next_request.url).startswith("http://www.sogou.com/antispider")
):
raise SearxEngineCaptchaException()
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
results = [] results = []
for item in dom.xpath('//div[contains(@class, "vrwrap")]'): # pylint: disable=line-too-long
title = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a')) for item in dom.xpath(
url = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a/@href')) '//div[contains(@class, "rb")] | //div[contains(@class, "vrwrap") and not(.//div[contains(@class, "special-wrap")])]'
):
item_html = html.tostring(item, encoding="unicode")
if url.startswith("/link?url="): if item.xpath('.//h3[@class="pt"]/a'):
url = f"{base_url}{url}" result = _parse_results(item, item_html)
elif item.xpath('.//h3[contains(@class, "vr-title")]/a'):
result = _parse_results_with_image(item, item_html)
else:
continue
content = extract_text(item.xpath('.//div[contains(@class, "text-layout")]//p[contains(@class, "star-wiki")]')) if result["title"] and result["url"]:
if not content: results.append(result)
content = extract_text(item.xpath('.//div[contains(@class, "fz-mid space-txt")]'))
if title and url:
results.append(
{
"title": title,
"url": url,
"content": content,
}
)
return results return results
def _extract_url(url, item_html):
if url and url.startswith("/link?url="):
match = re.search(r'data-url="([^"]+)"', item_html)
if match:
return match.group(1)
return f"{base_url}{url}"
return url
def _parse_date(text):
if text:
text = text.strip().lstrip("-").strip()
date_match = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", text)
if date_match:
try:
return datetime.strptime(date_match.group(1), "%Y-%m-%d")
except (ValueError, TypeError):
pass
return None
def _parse_results(item, item_html):
    """Build a result dict from a plain (text-only, ``rb``-style) Sogou node.

    *item* is the lxml element for one result; *item_html* is its serialized
    markup, used by ``_extract_url`` to recover redirect targets.
    """
    return {
        "title": extract_text(item.xpath('.//h3[@class="pt"]/a')),
        "url": _extract_url(extract_text(item.xpath('.//h3[@class="pt"]/a/@href')), item_html),
        "content": extract_text(item.xpath('.//div[@class="ft"]')),
        "publishedDate": _parse_date(extract_text(item.xpath(".//cite"))),
    }
def _parse_results_with_image(item, item_html):
    """Build a result dict from a ``vrwrap``-style Sogou node that may carry
    a thumbnail.

    Tries the primary content container first, falling back to the secondary
    one; resolves the redirect URL via ``_extract_url`` and the published
    date via ``_parse_date``.  The thumbnail URL, if any, has its scheme
    upgraded to https.
    """
    title = extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a'))
    content = extract_text(item.xpath('.//div[contains(@class, "attribute-centent")]'))
    if not content:
        content = extract_text(item.xpath('.//div[contains(@class, "fz-mid space-txt")]'))
    url = _extract_url(extract_text(item.xpath('.//h3[contains(@class, "vr-title")]/a/@href')), item_html)
    publishedDate = _parse_date(extract_text(item.xpath('.//span[@class="cite-date"]')))

    thumbnail = None
    try:
        thumbnail_src = extract_text(item.xpath('.//div[contains(@class, "img-layout")]//img/@src'))
        if thumbnail_src:
            # Rewrite only the scheme prefix: a blanket str.replace would also
            # mangle any "http://" embedded later in the URL (e.g. inside a
            # redirect/query parameter).
            if thumbnail_src.startswith("http://"):
                thumbnail = "https://" + thumbnail_src[len("http://"):]
            else:
                thumbnail = thumbnail_src
    except (ValueError, TypeError):
        # defensive: keep thumbnail as None if extraction misbehaves
        pass

    return {
        "title": title,
        "url": url,
        "content": content,
        "publishedDate": publishedDate,
        "thumbnail": thumbnail,
    }