diff --git a/searx/engines/google.py b/searx/engines/google.py index 93a45c1eb..22e79453f 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -327,23 +327,16 @@ def request(query: str, params: "OnlineParams") -> None: params["headers"].update(google_info["headers"]) -# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA -# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26; -RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);') -RE_DATA_IMAGE_end = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*)$') +# regex match to get image map that is found inside the returned javascript: +# (function(){google.ldi={ ... };google.pim={ ... };google.sib=false;google ... +RE_DATA_IMAGE = re.compile(r'"((?:dimg|pimg|tsuid)_[^"]*)":"((?:https?:)?//[^"]*)') -def parse_data_images(text: str): +def parse_url_images(text: str): data_image_map = {} - for img_id, data_image in RE_DATA_IMAGE.findall(text): - end_pos = data_image.rfind("=") - if end_pos > 0: - data_image = data_image[: end_pos + 1] - data_image_map[img_id] = data_image - last = RE_DATA_IMAGE_end.search(text) - if last: - data_image_map[last.group(1)] = last.group(2) + for img_id, image_url in RE_DATA_IMAGE.findall(text): + data_image_map[img_id] = image_url.encode('utf-8').decode("unicode-escape") logger.debug("data:image objects --> %s", list(data_image_map.keys())) return data_image_map @@ -352,7 +345,7 @@ def response(resp: "SXNG_Response"): """Get response from google's search request""" # pylint: disable=too-many-branches, too-many-statements detect_google_sorry(resp) - data_image_map = parse_data_images(resp.text) + data_image_map = parse_url_images(resp.text) results = EngineResults() @@ -392,15 +385,16 @@ def response(resp: "SXNG_Response"): content = extract_text(content_nodes) - thumbnail = result.xpath(".//img/@src") - if thumbnail: - thumbnail = thumbnail[0] + # Images that are NOT the favicon + xpath_image = eval_xpath_getindex(result, './/img[not(@class="XNo5Ab")]', index=0, default=None) + + thumbnail = None + if xpath_image is not None: + thumbnail = xpath_image.get("src") if thumbnail.startswith("data:image"): - img_id = result.xpath(".//img/@id") + img_id = xpath_image.get("id") if img_id: - thumbnail = data_image_map.get(img_id[0]) - else: - thumbnail = None + thumbnail = data_image_map.get(img_id) results.append({"url": url, "title": title, "content": content or '', "thumbnail": thumbnail}) diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 08057fc4f..0860368fd 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -11,7 +11,7 @@ .. _data URLs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs """ - +import re from urllib.parse import urlencode, urlparse, parse_qs, unquote from lxml import html @@ -29,7 +29,6 @@ from searx.engines.google import ( suggestion_xpath, detect_google_sorry, ui_async, - parse_data_images, ) from searx.utils import get_embeded_stream_url @@ -52,6 +51,23 @@ time_range_support = True safesearch = True +# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA +# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26; +RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);?') + + +def parse_data_images(text: str): + data_image_map = {} + + for img_id, data_image in RE_DATA_IMAGE.findall(text): + end_pos = data_image.rfind("=") + if end_pos > 0: + data_image = data_image[: end_pos + 1] + data_image_map[img_id] = data_image + logger.debug("data:image objects --> %s", list(data_image_map.keys())) + return data_image_map + + def request(query, params): """Google-Video search request""" google_info = get_google_info(params, traits)