[fix] engine: google-news - Google pushed a frontend update (#5984)

Around March 9-10, 2026, Google pushed a frontend update to Google News that completely changed the HTML structure of the search results. This commit is a complete overhaul of the Google News engine: - The real URL is encoded in the "jslog" attribute (@SeriousConcept1134: the attribute is base64-encoded JSON). - The CEID list is updated. - The typing (type annotations) was pushed forward. Related: - https://github.com/searxng/searxng/issues/5852#issuecomment-4254438184 - https://github.com/searxng/searxng/issues/5852#issuecomment-4265598833 Closes: https://github.com/searxng/searxng/issues/5852 Suggested-by: SeriousConcept1134 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-05-21 02:14:31 +02:00 · 2026-05-17 15:27:00 +02:00
parent dce3bb69bb
commit f26e450778
2 changed files with 195 additions and 166 deletions
@@ -4079,6 +4079,7 @@
        "bg-BG": "BG:bg",
        "bn-BD": "BD:bn",
        "bn-IN": "IN:bn",
        "ca-ES": "ES:ca",
        "cs-CZ": "CZ:cs",
        "de-AT": "AT:de",
        "de-CH": "CH:de",
@@ -4110,16 +4111,15 @@
        "es-CO": "CO:es-419",
        "es-CU": "CU:es-419",
        "es-ES": "ES:es",
-        "es-MX": "MX:es-419",
+        "et-EE": "EE:et",
-        "es-PE": "PE:es-419",
+        "fi-FI": "FI:fi",
        "es-US": "US:es-419",
        "es-VE": "VE:es-419",
        "fr-BE": "BE:fr",
        "fr-CA": "CA:fr",
        "fr-CH": "CH:fr",
        "fr-FR": "FR:fr",
        "fr-MA": "MA:fr",
        "fr-SN": "SN:fr",
        "gu-IN": "IN:gu",
        "he-IL": "IL:he",
        "hi-IN": "IN:hi",
        "hu-HU": "HU:hu",
@@ -4131,12 +4131,13 @@
        "lv-LV": "LV:lv",
        "ml-IN": "IN:ml",
        "mr-IN": "IN:mr",
        "ms-MY": "MY:ms",
        "nb-NO": "NO:no",
        "nl-BE": "BE:nl",
        "nl-NL": "NL:nl",
        "pa-IN": "IN:pa",
        "pl-PL": "PL:pl",
        "pt-BR": "BR:pt-419",
        "pt-PT": "PT:pt-150",
        "ro-RO": "RO:ro",
        "ru-RU": "RU:ru",
        "ru-UA": "UA:ru",
@@ -4151,8 +4152,7 @@
        "uk-UA": "UA:uk",
        "vi-VN": "VN:vi",
        "zh-CN": "CN:zh-Hans",
-        "zh-HK": "HK:zh-Hant",
+        "zh-HK": "HK:zh-Hant"
        "zh-TW": "TW:zh-Hant"
      },
      "supported_domains": {}
    },
@@ -23,9 +23,11 @@ The google news API ignores some parameters from the common :ref:`google API`:
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
 .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
 """
 import typing as t
-from urllib.parse import urlencode
+import json
 import base64
 from urllib.parse import urlencode
 from lxml import html
 import babel
@@ -44,18 +46,24 @@ from searx.engines.google import (
 )
 from searx.enginelib.traits import EngineTraits
 from searx.result_types import EngineResults
 if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams
 # about
 about = {
-    "website": 'https://news.google.com',
+    "website": "https://news.google.com",
-    "wikidata_id": 'Q12020',
+    "wikidata_id": "Q12020",
-    "official_api_documentation": 'https://developers.google.com/custom-search',
+    "official_api_documentation": "https://developers.google.com/custom-search",
    "use_official_api": False,
    "require_api_key": False,
-    "results": 'HTML',
+    "results": "HTML",
 }
 # engine dependent config
-categories = ['news']
+categories = ["news"]
 paging = False
 time_range_support = False
@@ -64,231 +72,252 @@ time_range_support = False
 #
 #  safesearch : results are identical for safesearch=0 and safesearch=2
 safesearch = True
 base_url: str = "https://news.google.com"
def request(query: str, params: "OnlineParams") -> "OnlineParams":
    """Build the Google-News search request.

    The ``ceid`` value (``<region>:<language>[-<suffix>]``, e.g. ``US:en``,
    ``BR:pt-419`` or ``CN:zh-Hans``) matching the SearXNG locale is looked up in
    ``traits.custom["ceid"]`` (default ``US:en``).  The URL arguments ``hl``,
    ``lr`` and ``gl`` are derived from it, and the ``ceid`` itself is appended
    to the query string without URL encoding.

    :param query: The search term.
    :param params: Request parameters; ``url``, ``cookies`` and ``headers`` are
        set / updated in place.
    :return: The modified ``params`` (same object that was passed in).
    """

    sxng_locale = params.get("searxng_locale", "en-US")
    ceid: str = locales.get_engine_locale(
        sxng_locale, traits.custom["ceid"], default="US:en"
    )  # pyright: ignore[reportAssignmentType]

    google_info = get_google_info(params, traits)
    google_info["subdomain"] = "news.google.com"  # google news has only one domain

    ceid_region, ceid_lang = ceid.split(":")

    # Separate the language from its optional suffix.  The separator inside the
    # language tag is "-" (the ":" has already been consumed above):
    #   "pt-419" -> ("pt", "419"), "zh-Hans" -> ("zh", "Hans"), "de" -> ("de", "")
    ceid_lang, ceid_suffix = (ceid_lang.split("-") + [""])[:2]

    google_info["params"]["hl"] = ceid_lang

    if ceid_suffix and ceid_suffix not in ["Hans", "Hant"]:
        # suffix is not a script tag (e.g. "419" or "150")
        if ceid_region.lower() == ceid_lang:
            google_info["params"]["hl"] = ceid_lang + "-" + ceid_region
        else:
            google_info["params"]["hl"] = ceid_lang + "-" + ceid_suffix

    elif ceid_region.lower() != ceid_lang:
        if ceid_region in ["AT", "BE", "CH", "IL", "SA", "IN", "BD", "PT"]:
            google_info["params"]["hl"] = ceid_lang
        else:
            google_info["params"]["hl"] = ceid_lang + "-" + ceid_region

    google_info["params"]["lr"] = "lang_" + ceid_lang.split("-")[0]
    google_info["params"]["gl"] = ceid_region

    query_url = (
        "https://"
        + google_info["subdomain"]
        + "/search?"
        + urlencode({"q": query, **google_info["params"]})
        # ceid includes a ':' character which must not be urlencoded
        + ("&ceid=%s" % ceid)
    )

    params["url"] = query_url
    params["cookies"] = google_info["cookies"]
    params["headers"].update(google_info["headers"])
    return params
def _real_url_from_jslog(jslog: str) -> "str | None":
    """Return the article URL embedded in the value of a ``jslog`` attribute.

    The value is expected to look like ``"95014; 5:<base64>; track:click,vis"``
    where ``<base64>`` decodes to a JSON array whose last item is the URL.
    ``None`` is returned whenever the value does not match this layout.
    """
    parts = jslog.split(";")
    if len(parts) < 2:
        return None

    # second field, payload after the last ":"
    b64_data = parts[1].split(":")[-1].strip()
    # Pad base64 if necessary
    b64_data += "=" * (-len(b64_data) % 4)

    try:
        decoded_data = json.loads(base64.b64decode(b64_data).decode("utf-8"))
    except ValueError:
        # binascii.Error, UnicodeDecodeError and json.JSONDecodeError are all
        # subclasses of ValueError: a broken payload must only drop this item.
        return None

    # The URL is typically the last element in the decoded array
    if isinstance(decoded_data, list) and decoded_data:
        candidate = decoded_data[-1]
        if isinstance(candidate, str) and candidate.startswith("http"):
            return candidate
    return None


def response(resp: "SXNG_Response") -> EngineResults:
    """Parse the HTML of a Google-News result page.

    For each result container the article URL is taken from the ``jslog``
    attribute of the result link; containers without a decodable URL are
    logged and skipped.

    :param resp: The HTTP response of the request built by :py:obj:`request`.
    :return: The collected results (``MainResult`` items).
    """

    res = EngineResults()
    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, "//div[@jslog and @data-n-tid and @jsdata]"):

        url: str = eval_xpath_getindex(result, "./a[@target='_blank']/@href", 0, default="")
        if not url:
            continue
        if url.startswith("./"):
            url = base_url + url[1:]

        # The real URL is often encoded in the "jslog" attribute
        jslog: str | None = eval_xpath_getindex(result, "./a[@target='_blank']/@jslog", 0, default=None)
        real_url: str | None = _real_url_from_jslog(jslog) if jslog else None

        if real_url:
            url = real_url
        else:
            logger.error(f"no real-url found: {url}")
            continue

        title = extract_text(eval_xpath(result, "./h4")) or ""

        # The pub_date is mostly a string like 'yesterday', not a real timezone
        # date or time.  Therefore we can't use publishedDate and place the
        # *pub* string into the content.
        pub_date = extract_text(eval_xpath(result, ".//time"))
        pub_origin = extract_text(eval_xpath(result, ".//div[contains(@class, 'vr1PYe')]"))
        content = " / ".join([x for x in [pub_origin, pub_date] if x])

        thumbnail: str = eval_xpath_getindex(result, ".//figure/img/@src", 0, default="")
        if thumbnail and thumbnail.startswith("/"):
            thumbnail = base_url + thumbnail

        res.add(
            res.types.MainResult(
                url=url,
                title=title,
                content=content,
                thumbnail=thumbnail,
            )
        )

    return res
 ceid_list = [
-    'AE:ar',
+    "AE:ar",
-    'AR:es-419',
+    "AR:es-419",
-    'AT:de',
+    "AT:de",
-    'AU:en',
+    "AU:en",
-    'BD:bn',
+    "BD:bn",
-    'BE:fr',
+    "BE:fr",
-    'BE:nl',
+    "BE:nl",
-    'BG:bg',
+    "BG:bg",
-    'BR:pt-419',
+    "BR:pt-419",
-    'BW:en',
+    "BW:en",
-    'CA:en',
+    "CA:en",
-    'CA:fr',
+    "CA:fr",
-    'CH:de',
+    "CH:de",
-    'CH:fr',
+    "CH:fr",
-    'CL:es-419',
+    "CL:es-419",
-    'CN:zh-Hans',
+    "CN:zh-Hans",
-    'CO:es-419',
+    "CO:es-419",
-    'CU:es-419',
+    "CU:es-419",
-    'CZ:cs',
+    "CZ:cs",
-    'DE:de',
+    "DE:de",
-    'EG:ar',
+    "EE:et",
-    'ES:es',
+    "EG:ar",
-    'ET:en',
+    "ES:ca",
-    'FR:fr',
+    "ES:es",
-    'GB:en',
+    "ET:en",
-    'GH:en',
+    "FI:fi",
-    'GR:el',
+    "FR:fr",
-    'HK:zh-Hant',
+    "GB:en",
-    'HU:hu',
+    "GH:en",
-    'ID:en',
+    "GR:el",
-    'ID:id',
+    "HK:zh-Hant",
-    'IE:en',
+    "HU:hu",
-    'IL:en',
+    "ID:en",
-    'IL:he',
+    "ID:id",
-    'IN:bn',
+    "IE:en",
-    'IN:en',
+    "IL:en",
-    'IN:hi',
+    "IL:he",
-    'IN:ml',
+    "IN:bn",
-    'IN:mr',
+    "IN:en",
-    'IN:ta',
+    "IN:gu",
-    'IN:te',
+    "IN:hi",
-    'IT:it',
+    "IN:ml",
-    'JP:ja',
+    "IN:mr",
-    'KE:en',
+    "IN:pa",
-    'KR:ko',
+    "IN:ta",
-    'LB:ar',
+    "IN:te",
-    'LT:lt',
+    "IT:it",
-    'LV:en',
+    "JP:ja",
-    'LV:lv',
+    "KE:en",
-    'MA:fr',
+    "KR:ko",
-    'MX:es-419',
+    "LB:ar",
-    'MY:en',
+    "LT:lt",
-    'NA:en',
+    "LV:en",
-    'NG:en',
+    "LV:lv",
-    'NL:nl',
+    "MA:fr",
-    'NO:no',
+    "MY:en",
-    'NZ:en',
+    "MY:ms",
-    'PE:es-419',
+    "NA:en",
-    'PH:en',
+    "NG:en",
-    'PK:en',
+    "NL:nl",
-    'PL:pl',
+    "NO:no",
-    'PT:pt-150',
+    "NZ:en",
-    'RO:ro',
+    "PH:en",
-    'RS:sr',
+    "PK:en",
-    'RU:ru',
+    "PL:pl",
-    'SA:ar',
+    "RO:ro",
-    'SE:sv',
+    "RS:sr",
-    'SG:en',
+    "RU:ru",
-    'SI:sl',
+    "SA:ar",
-    'SK:sk',
+    "SE:sv",
-    'SN:fr',
+    "SG:en",
-    'TH:th',
+    "SI:sl",
-    'TR:tr',
+    "SK:sk",
-    'TW:zh-Hant',
+    "SN:fr",
-    'TZ:en',
+    "TH:th",
-    'UA:ru',
+    "TR:tr",
-    'UA:uk',
+    "TZ:en",
-    'UG:en',
+    "UA:ru",
-    'US:en',
+    "UA:uk",
-    'US:es-419',
+    "UG:en",
-    'VE:es-419',
+    "US:en",
-    'VN:vi',
+    "VN:vi",
-    'ZA:en',
+    "ZA:en",
-    'ZW:en',
+    "ZW:en",
 ]
 """List of region/language combinations supported by Google News.  Values of the
 ``ceid`` argument of the Google News REST API."""
 _skip_values = [
-    'ET:en',  # english (ethiopia)
+    "ET:en",  # english (ethiopia)
-    'ID:en',  # english (indonesia)
+    "ID:en",  # english (indonesia)
-    'LV:en',  # english (latvia)
+    "LV:en",  # english (latvia)
 ]
_ceid_locale_map = {"NO:no": "nb-NO"}


def fetch_traits(engine_traits: EngineTraits):
    """Fill ``engine_traits.custom["ceid"]`` with a region-tag -> ``ceid`` map.

    After the common Google traits are fetched (without domains), every entry
    of ``ceid_list`` that is not listed in ``_skip_values`` is converted into a
    SearXNG locale and registered under the region tag of that locale.  Entries
    whose locale is unknown to babel are reported on stdout and skipped.
    """
    _fetch_traits(engine_traits, add_domains=False)

    ceid_map = {}
    engine_traits.custom["ceid"] = ceid_map

    for ceid in ceid_list:
        if ceid in _skip_values:
            continue

        region, lang = ceid.split(":")
        lang_parts = lang.split("-")
        # Script suffixes (Hant / Hans) stay part of the language tag, any
        # other suffix (e.g. "419") is dropped.
        if len(lang_parts) > 1 and lang_parts[1] not in ["Hant", "Hans"]:
            lang = lang_parts[0]

        sxng_locale = _ceid_locale_map.get(ceid, lang + "-" + region)
        try:
            locale = babel.Locale.parse(sxng_locale, sep="-")
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
            continue

        ceid_map[locales.region_tag(locale)] = ceid