mirror of
https://github.com/searxng/searxng.git
synced 2026-05-24 03:44:31 +02:00
[fix] google: switch to using "Google App" for Android useragent (#5892)
I found a bypass using the Android Google App this time. However, unlike the iPhone GSA method, this one does have rate limits, although it took a couple of hundred consecutive requests to trigger them.
This commit is contained in:
+1525
-46
File diff suppressed because it is too large
Load Diff
+19
-11
@@ -69,7 +69,7 @@ filter_mapping = {0: "off", 1: "medium", 2: "high"}
|
|||||||
|
|
||||||
# Suggestions are links placed in a *card-section*, we extract only the text
|
# Suggestions are links placed in a *card-section*, we extract only the text
|
||||||
# from the links not the links itself.
|
# from the links not the links itself.
|
||||||
suggestion_xpath = '//div[contains(@class, "ouy7Mc")]//a'
|
suggestion_xpath = '//div[contains(@class, "gGQDvd iIWm4b")]//a'
|
||||||
|
|
||||||
|
|
||||||
_arcid_range = string.ascii_letters + string.digits + "_-"
|
_arcid_range = string.ascii_letters + string.digits + "_-"
|
||||||
@@ -269,6 +269,15 @@ def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[st
|
|||||||
ret_val["headers"]["Accept"] = "*/*"
|
ret_val["headers"]["Accept"] = "*/*"
|
||||||
ret_val["headers"]["User-Agent"] = gen_gsa_useragent()
|
ret_val["headers"]["User-Agent"] = gen_gsa_useragent()
|
||||||
|
|
||||||
|
# Hardcoded default ENID Header required alongside the Android Google App
|
||||||
|
# User Agent
|
||||||
|
ret_val["headers"]["__Secure-ENID"] = (
|
||||||
|
"28.SE=II9FMkz92GewodDwKRBFsMISph7GsQs8JYLdXmAlprl6UcC02O2p7kfQlAWuwT"
|
||||||
|
"oygcrqHpmwQSH57b0c2kXfRfo35J8aV5FYSeUzYB67hqZQ2tZB7-o0hlTKwb5qMjn8Cf"
|
||||||
|
"w_AZ2s_6KIFMAl2goXGcXHSfgu4jwZOqShlHCcag0ppy_NnxJYWxpLkaeuGCICwWoIFJ"
|
||||||
|
"HP6Gy4BOkIEsl1N_k6F6jMF_OklE9qIubiyKkNaA"
|
||||||
|
)
|
||||||
|
|
||||||
# Cookies
|
# Cookies
|
||||||
|
|
||||||
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
|
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
|
||||||
@@ -328,14 +337,14 @@ def request(query: str, params: "OnlineParams") -> None:
|
|||||||
|
|
||||||
|
|
||||||
# regex match to get image map that is found inside the returned javascript:
|
# regex match to get image map that is found inside the returned javascript:
|
||||||
# (function(){google.ldi={ ... };google.pim={ ... };google.sib=false;google ...
|
# (function(){var s='...';var i=['...'] ...}
|
||||||
RE_DATA_IMAGE = re.compile(r'"((?:dimg|pimg|tsuid)_[^"]*)":"((?:https?:)?//[^"]*)')
|
RE_DATA_IMAGE = re.compile(r"(data:image[^']*?)'[^']*?'((?:dimg|pimg|tsuid)[^']*)")
|
||||||
|
|
||||||
|
|
||||||
def parse_url_images(text: str):
|
def parse_url_images(text: str):
|
||||||
data_image_map = {}
|
data_image_map = {}
|
||||||
|
|
||||||
for img_id, image_url in RE_DATA_IMAGE.findall(text):
|
for image_url, img_id in RE_DATA_IMAGE.findall(text):
|
||||||
data_image_map[img_id] = image_url.encode('utf-8').decode("unicode-escape")
|
data_image_map[img_id] = image_url.encode('utf-8').decode("unicode-escape")
|
||||||
logger.debug("data:image objects --> %s", list(data_image_map.keys()))
|
logger.debug("data:image objects --> %s", list(data_image_map.keys()))
|
||||||
return data_image_map
|
return data_image_map
|
||||||
@@ -353,19 +362,18 @@ def response(resp: "SXNG_Response"):
|
|||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
|
|
||||||
# parse results
|
# parse results
|
||||||
|
for result in eval_xpath_list(dom, '//a[@data-ved and not(@class)]'):
|
||||||
for result in eval_xpath_list(dom, './/div[contains(@class, "MjjYud")]'):
|
|
||||||
# pylint: disable=too-many-nested-blocks
|
# pylint: disable=too-many-nested-blocks
|
||||||
|
|
||||||
try:
|
try:
|
||||||
title_tag = eval_xpath_getindex(result, './/div[contains(@role, "link")]', 0, default=None)
|
title_tag = eval_xpath_getindex(result, './/div[@style]', 0, default=None)
|
||||||
if title_tag is None:
|
if title_tag is None:
|
||||||
# this not one of the common google results *section*
|
# this not one of the common google results *section*
|
||||||
logger.debug("ignoring item from the result_xpath list: missing title")
|
logger.debug("ignoring item from the result_xpath list: missing title")
|
||||||
continue
|
continue
|
||||||
title = extract_text(title_tag)
|
title = extract_text(title_tag)
|
||||||
|
|
||||||
raw_url = eval_xpath_getindex(result, ".//a/@href", 0, None)
|
raw_url = result.get("href")
|
||||||
if raw_url is None:
|
if raw_url is None:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
'ignoring item from the result_xpath list: missing url of title "%s"',
|
'ignoring item from the result_xpath list: missing url of title "%s"',
|
||||||
@@ -378,15 +386,15 @@ def response(resp: "SXNG_Response"):
|
|||||||
else:
|
else:
|
||||||
url = raw_url
|
url = raw_url
|
||||||
|
|
||||||
content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]')
|
content_nodes = eval_xpath(result, '../..//div[contains(@class, "ilUpNd H66NU aSRlid")]')
|
||||||
for item in content_nodes:
|
for item in content_nodes:
|
||||||
for script in item.xpath(".//script"):
|
for script in item.xpath(".//script"):
|
||||||
script.getparent().remove(script)
|
script.getparent().remove(script)
|
||||||
|
|
||||||
content = extract_text(content_nodes)
|
content = extract_text(content_nodes[0])
|
||||||
|
|
||||||
# Images that are NOT the favicon
|
# Images that are NOT the favicon
|
||||||
xpath_image = eval_xpath_getindex(result, './/img[not(@class="XNo5Ab")]', index=0, default=None)
|
xpath_image = eval_xpath_getindex(result, './/img', index=0, default=None)
|
||||||
|
|
||||||
thumbnail = None
|
thumbnail = None
|
||||||
if xpath_image is not None:
|
if xpath_image is not None:
|
||||||
|
|||||||
+3
-3
@@ -13,7 +13,7 @@ from collections.abc import MutableMapping, Callable
|
|||||||
|
|
||||||
from numbers import Number
|
from numbers import Number
|
||||||
from os.path import splitext, join
|
from os.path import splitext, join
|
||||||
from random import choice
|
from random import choice, randint
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from html import escape
|
from html import escape
|
||||||
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
|
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
|
||||||
@@ -82,11 +82,11 @@ def gen_useragent(os_string: str | None = None) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def gen_gsa_useragent() -> str:
|
def gen_gsa_useragent() -> str:
|
||||||
"""Return a random GSA User Agent suitable for Google
|
"""Return a random "Android Google App" User Agent suitable for Google
|
||||||
|
|
||||||
See searx/data/gsa_useragents.txt
|
See searx/data/gsa_useragents.txt
|
||||||
"""
|
"""
|
||||||
return choice(gsa_useragents_loader())
|
return choice(gsa_useragents_loader()) + " GoogleApp/" + str(randint(0, 9))
|
||||||
|
|
||||||
|
|
||||||
class HTMLTextExtractor(HTMLParser):
|
class HTMLTextExtractor(HTMLParser):
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ def fetch_gsa_useragents() -> list[str]:
|
|||||||
|
|
||||||
suas: set[str] = set()
|
suas: set[str] = set()
|
||||||
for ua in loads(decompress(response.content)):
|
for ua in loads(decompress(response.content)):
|
||||||
if ua["platform"] == "iPhone" and "GSA" in ua["userAgent"]:
|
if "Android" in ua["userAgent"] and "Chrome" in ua["userAgent"] and "Samsung" not in ua["userAgent"]:
|
||||||
suas.add(ua["userAgent"])
|
suas.add(ua["userAgent"])
|
||||||
|
|
||||||
luas = list(suas)
|
luas = list(suas)
|
||||||
|
|||||||
Reference in New Issue
Block a user