mirror of
https://github.com/searxng/searxng.git
synced 2026-06-08 19:07:50 +02:00
72a827ae93
Avoids yep's botblocking by sending Sec-Fetch-* headers (as the browser does).
140 lines
4.3 KiB
Python
140 lines
4.3 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""Yep (general, images, news)"""
|
|
|
|
import re
|
|
|
|
import typing as t
|
|
|
|
from urllib.parse import urlencode
|
|
|
|
from searx.result_types import EngineResults
|
|
from searx.utils import html_to_text, eval_xpath_getindex, extract_text
|
|
|
|
if t.TYPE_CHECKING:
|
|
from searx.enginelib.traits import EngineTraits
|
|
from searx.extended_types import SXNG_Response
|
|
from searx.search.processors import OnlineParams
|
|
|
|
about = {
|
|
"website": "https://yep.com/",
|
|
"official_api_documentation": "https://docs.developer.yelp.com",
|
|
"use_official_api": False,
|
|
"require_api_key": False,
|
|
"results": "JSON",
|
|
}
|
|
|
|
base_url = "https://api.yep.com"
|
|
web_base_url = "https://yep.com"
|
|
|
|
safesearch = True
|
|
safesearch_map = {0: "off", 1: "moderate", 2: "strict"}
|
|
|
|
enable_http2 = False
|
|
|
|
results_per_page = 20
|
|
|
|
_IMPORT_RE = re.compile(r"import\"(.*?)\";")
|
|
_LANGUAGE_RE = re.compile(r"\{english:\".*?\",code_string:\"(.*?)\",code:\".*?\"\}")
|
|
|
|
|
|
def request(query: str, params: "OnlineParams") -> None:
|
|
args = {"query": query, "safeSearch": safesearch_map[params["safesearch"]], "limit": results_per_page}
|
|
|
|
engine_language: str | None = traits.get_language(params["searxng_locale"])
|
|
if engine_language:
|
|
args["hl"] = engine_language
|
|
|
|
params["url"] = f"{base_url}/search?{urlencode(args)}"
|
|
params["headers"].update(
|
|
{
|
|
"Referer": f"{web_base_url}/",
|
|
"Origin": web_base_url,
|
|
"Sec-Fetch-Dest": "empty",
|
|
"Sec-Fetch-Mode": "cors",
|
|
"Sec-Fetch-Site": "same-site",
|
|
}
|
|
)
|
|
|
|
|
|
def response(resp: "SXNG_Response") -> EngineResults:
|
|
res = EngineResults()
|
|
|
|
result: dict[str, str]
|
|
for result in resp.json()[1]["results"]:
|
|
res.add(
|
|
res.types.MainResult(
|
|
url=result["url"],
|
|
title=result["title"],
|
|
content=html_to_text(result["snippet"]),
|
|
)
|
|
)
|
|
|
|
return res
|
|
|
|
|
|
def fetch_traits(engine_traits: "EngineTraits"):
|
|
"""Fetch :ref:`languages <yep languages>` and :ref:`regions <yep
|
|
regions>` from Yep.
|
|
|
|
The language options are very well hidden on Yep. To get it, we have to do the following:
|
|
- Load the yep.com mainpage and extract the URL of the JavaScript app
|
|
- Load the JavaScript source code and extract the URL of all imported modules from it
|
|
- Load the imported modules to search for the right one that contains the languages
|
|
"""
|
|
|
|
# pylint: disable=import-outside-toplevel, too-many-branches
|
|
|
|
from lxml import html
|
|
import babel
|
|
|
|
from searx.locales import language_tag
|
|
from searx.network import get # see https://github.com/searxng/searxng/issues/762
|
|
|
|
from searx.utils import gen_useragent
|
|
|
|
headers = {
|
|
"User-Agent": gen_useragent(),
|
|
"Referer": f"{web_base_url}/",
|
|
"Sec-Fetch-Dest": "document",
|
|
"Sec-Fetch-Mode": "navigate",
|
|
"Sec-Fetch-Site": "same-origin",
|
|
}
|
|
|
|
resp = get(web_base_url, headers=headers, timeout=5)
|
|
if not resp.ok:
|
|
raise RuntimeError("Response from Yep languages is not OK.")
|
|
|
|
doc = html.fromstring(resp.text)
|
|
url = eval_xpath_getindex(doc, "//script[contains(@src, 'PageApp')]/@src", index=0)
|
|
|
|
resp = get("https:" + extract_text(url), headers=headers, timeout=5)
|
|
if not resp.ok:
|
|
raise RuntimeError("Response from Yep languages is not OK.")
|
|
|
|
language_codes = []
|
|
for script_path in _IMPORT_RE.findall(resp.text):
|
|
resp = get(f"{web_base_url}{script_path}", headers=headers, timeout=5)
|
|
if not resp.ok:
|
|
raise RuntimeError("Response from Yep languages is not OK.")
|
|
|
|
for match in _LANGUAGE_RE.findall(resp.text):
|
|
language_codes.append(match)
|
|
|
|
if language_codes:
|
|
break
|
|
|
|
for language_code in language_codes:
|
|
try:
|
|
sxng_tag = language_tag(babel.Locale.parse(language_code, sep="-"))
|
|
except babel.UnknownLocaleError:
|
|
# silently ignore unknown languages
|
|
continue
|
|
# print("%-20s: %s <-- %s" % (extract_text(option), country_tag, sxng_tag))
|
|
|
|
conflict = engine_traits.languages.get(sxng_tag)
|
|
if conflict:
|
|
if conflict != sxng_tag:
|
|
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, language_code))
|
|
continue
|
|
engine_traits.languages[sxng_tag] = language_code
|