diff --git a/searx/engines/google.py b/searx/engines/google.py index f22bdb618..93a45c1eb 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -379,7 +379,11 @@ def response(resp: "SXNG_Response"): title, ) continue - url = unquote(raw_url[7:].split("&sa=U")[0]) # remove the google redirector + + if raw_url.startswith('/url?q='): + url = unquote(raw_url[7:].split("&sa=U")[0]) # remove the google redirector + else: + url = raw_url content_nodes = eval_xpath(result, './/div[contains(@data-sncf, "1")]') for item in content_nodes: @@ -388,24 +392,17 @@ def response(resp: "SXNG_Response"): content = extract_text(content_nodes) - if not content: - logger.debug( - 'ignoring item from the result_xpath list: missing content of title "%s"', - title, - ) - continue - - thumbnail = content_nodes[0].xpath(".//img/@src") + thumbnail = result.xpath(".//img/@src") if thumbnail: thumbnail = thumbnail[0] if thumbnail.startswith("data:image"): - img_id = content_nodes[0].xpath(".//img/@id") + img_id = result.xpath(".//img/@id") if img_id: thumbnail = data_image_map.get(img_id[0]) else: thumbnail = None - results.append({"url": url, "title": title, "content": content, "thumbnail": thumbnail}) + results.append({"url": url, "title": title, "content": content or '', "thumbnail": thumbnail}) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index 750f0dd69..08057fc4f 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -12,7 +12,7 @@ https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs """ -from urllib.parse import urlencode, urlparse, parse_qs +from urllib.parse import urlencode, urlparse, parse_qs, unquote from lxml import html from searx.utils import ( @@ -100,14 +100,23 @@ def response(resp): # parse results for result in result_divs: title = extract_text( - eval_xpath_getindex(result, './/h3[contains(@class, "LC20lb")]', 0, default=None), allow_none=True + eval_xpath_getindex(result, './/h3[contains(@class, "LC20lb")] | .//div[@role="heading"]', 0, default=None), + allow_none=True, ) - url = eval_xpath_getindex(result, './/a[@jsname="UWckNb"]/@href', 0, default=None) + url = eval_xpath_getindex( + result, './/a[@jsname="UWckNb"]/@href | .//a[contains(@href, "/url?q=")]/@href', 0, default=None + ) + if url and url.startswith('/url?q='): + url = unquote(url[7:].split('&sa=U')[0]) + content = extract_text( eval_xpath_getindex(result, './/div[contains(@class, "ITZIwc")]', 0, default=None), allow_none=True ) pub_info = extract_text( - eval_xpath_getindex(result, './/div[contains(@class, "gqF9jc")]', 0, default=None), allow_none=True + eval_xpath_getindex( + result, './/div[contains(@class, "gqF9jc")] | .//div[contains(@class, "WRu9Cd")]', 0, default=None + ), + allow_none=True, ) # Broader XPath to find any element thumbnail = eval_xpath_getindex(result, './/img/@src', 0, default=None)