[fix] engine: google-news - Google pushed a frontend update (#5984)

Around March 9 - 10, 2026, Google pushed a frontend update to Google News that completely changed the HTML structure of search results. This is a complete overhaul of the Google News engine. - The real URL is encoded in the "jslog" attribute. @SeriousConcept1134: the attribute is a base64 encoded JSON - CEID list is updated - The typification was pushed forward Related: - https://github.com/searxng/searxng/issues/5852#issuecomment-4254438184 - https://github.com/searxng/searxng/issues/5852#issuecomment-4265598833 Closes: https://github.com/searxng/searxng/issues/5852 Suggested-by: SeriousConcept1134 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-05-18 00:44:31 +02:00 · 2026-05-17 15:27:00 +02:00
parent dce3bb69bb
commit f26e450778
2 changed files with 195 additions and 166 deletions
@@ -4079,6 +4079,7 @@
        "bg-BG": "BG:bg",
        "bn-BD": "BD:bn",
        "bn-IN": "IN:bn",
+        "ca-ES": "ES:ca",
        "cs-CZ": "CZ:cs",
        "de-AT": "AT:de",
        "de-CH": "CH:de",
@@ -4110,16 +4111,15 @@
        "es-CO": "CO:es-419",
        "es-CU": "CU:es-419",
        "es-ES": "ES:es",
-        "es-MX": "MX:es-419",
-        "es-PE": "PE:es-419",
-        "es-US": "US:es-419",
-        "es-VE": "VE:es-419",
+        "et-EE": "EE:et",
+        "fi-FI": "FI:fi",
        "fr-BE": "BE:fr",
        "fr-CA": "CA:fr",
        "fr-CH": "CH:fr",
        "fr-FR": "FR:fr",
        "fr-MA": "MA:fr",
        "fr-SN": "SN:fr",
+        "gu-IN": "IN:gu",
        "he-IL": "IL:he",
        "hi-IN": "IN:hi",
        "hu-HU": "HU:hu",
@@ -4131,12 +4131,13 @@
        "lv-LV": "LV:lv",
        "ml-IN": "IN:ml",
        "mr-IN": "IN:mr",
+        "ms-MY": "MY:ms",
        "nb-NO": "NO:no",
        "nl-BE": "BE:nl",
        "nl-NL": "NL:nl",
+        "pa-IN": "IN:pa",
        "pl-PL": "PL:pl",
        "pt-BR": "BR:pt-419",
-        "pt-PT": "PT:pt-150",
        "ro-RO": "RO:ro",
        "ru-RU": "RU:ru",
        "ru-UA": "UA:ru",
@@ -4151,8 +4152,7 @@
        "uk-UA": "UA:uk",
        "vi-VN": "VN:vi",
        "zh-CN": "CN:zh-Hans",
-        "zh-HK": "HK:zh-Hant",
-        "zh-TW": "TW:zh-Hant"
+        "zh-HK": "HK:zh-Hant"
      },
      "supported_domains": {}
    },
@@ -23,9 +23,11 @@ The google news API ignores some parameters from the common :ref:`google API`:
 .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
 .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
 """
+import typing as t

-from urllib.parse import urlencode
+import json
 import base64
+from urllib.parse import urlencode
 from lxml import html
 import babel

@@ -44,18 +46,24 @@ from searx.engines.google import (
 )
 from searx.enginelib.traits import EngineTraits

+from searx.result_types import EngineResults
+
+if t.TYPE_CHECKING:
+    from searx.extended_types import SXNG_Response
+    from searx.search.processors import OnlineParams
+
 # about
 about = {
-    "website": 'https://news.google.com',
-    "wikidata_id": 'Q12020',
-    "official_api_documentation": 'https://developers.google.com/custom-search',
+    "website": "https://news.google.com",
+    "wikidata_id": "Q12020",
+    "official_api_documentation": "https://developers.google.com/custom-search",
    "use_official_api": False,
    "require_api_key": False,
-    "results": 'HTML',
+    "results": "HTML",
 }

 # engine dependent config
-categories = ['news']
+categories = ["news"]
 paging = False
 time_range_support = False

@@ -64,231 +72,252 @@ time_range_support = False
 #
 #  safesearch : results are identical for safesearch=0 and safesearch=2
 safesearch = True
+base_url: str = "https://news.google.com"


-def request(query, params):
+def request(query: str, params: "OnlineParams") -> None:
    """Google-News search request"""

-    sxng_locale = params.get('searxng_locale', 'en-US')
-    ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
+    sxng_locale = params.get("searxng_locale", "en-US")
+    ceid: str = locales.get_engine_locale(
+        sxng_locale, traits.custom["ceid"], default="US:en"
+    )  # pyright: ignore[reportAssignmentType]
    google_info = get_google_info(params, traits)
-    google_info['subdomain'] = 'news.google.com'  # google news has only one domain
+    google_info["subdomain"] = "news.google.com"  # google news has only one domain

-    ceid_region, ceid_lang = ceid.split(':')
+    ceid_region, ceid_lang = ceid.split(":")
    ceid_lang, ceid_suffix = (
-        ceid_lang.split('-')
+        ceid_lang.split(":")
        + [
-            None,
+            "",
        ]
    )[:2]

-    google_info['params']['hl'] = ceid_lang
+    google_info["params"]["hl"] = ceid_lang

-    if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
+    if ceid_suffix and ceid_suffix not in ["Hans", "Hant"]:

        if ceid_region.lower() == ceid_lang:
-            google_info['params']['hl'] = ceid_lang + '-' + ceid_region
+            google_info["params"]["hl"] = ceid_lang + "-" + ceid_region
        else:
-            google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
+            google_info["params"]["hl"] = ceid_lang + "-" + ceid_suffix

    elif ceid_region.lower() != ceid_lang:

-        if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
-            google_info['params']['hl'] = ceid_lang
+        if ceid_region in ["AT", "BE", "CH", "IL", "SA", "IN", "BD", "PT"]:
+            google_info["params"]["hl"] = ceid_lang
        else:
-            google_info['params']['hl'] = ceid_lang + '-' + ceid_region
+            google_info["params"]["hl"] = ceid_lang + "-" + ceid_region

-    google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
-    google_info['params']['gl'] = ceid_region
+    google_info["params"]["lr"] = "lang_" + ceid_lang.split("-")[0]
+    google_info["params"]["gl"] = ceid_region

    query_url = (
-        'https://'
-        + google_info['subdomain']
+        "https://"
+        + google_info["subdomain"]
        + "/search?"
        + urlencode(
-            {
-                'q': query,
-                **google_info['params'],
-            }
+            {"q": query, **google_info["params"]},
        )
        # ceid includes a ':' character which must not be urlencoded
-        + ('&ceid=%s' % ceid)
+        + ("&ceid=%s" % ceid)
    )

-    params['url'] = query_url
-    params['cookies'] = google_info['cookies']
-    params['headers'].update(google_info['headers'])
-    return params
+    params["url"] = query_url
+    params["cookies"] = google_info["cookies"]
+    params["headers"].update(google_info["headers"])


-def response(resp):
+def response(resp: "SXNG_Response") -> EngineResults:
    """Get response from google's search request"""
-    results = []
+
+    res = EngineResults()
+
    detect_google_sorry(resp)

    # convert the text to dom
    dom = html.fromstring(resp.text)

-    for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'):
+    for result in eval_xpath_list(dom, "//div[@jslog and @data-n-tid and @jsdata]"):

-        # The first <a> tag in the <article> contains the link to the article
-        # The href attribute of the <a> tag is a google internal link, we have
-        # to decode
+        url: str = eval_xpath_getindex(result, "./a[@target='_blank']/@href", 0, default=0)
+        if not url:
+            continue
+        if url.startswith("./"):
+            url = base_url + url[1:]

-        href = eval_xpath_getindex(result, './article/a/@href', 0)
-        href = href.split('?')[0]
-        href = href.split('/')[-1]
-        href = base64.urlsafe_b64decode(href + '====')
-        href = href[href.index(b'http') :].split(b'\xd2')[0]
-        href = href.decode()
+        # The real URL is often encoded in the "jslog" attribute
+        jslog: str | None = eval_xpath_getindex(result, "./a[@target='_blank']/@jslog", 0, default=None)

-        title = extract_text(eval_xpath(result, './article/h3[1]'))
+        # Try to extract the real URL from jslog
+        real_url: str | None = None
+        if jslog:
+            # jslog format is usually: "95014; 5:<base64>; track:click,vis".  We
+            # want the second part (index 1) after splitting by ";"
+            parts: list[str] = jslog.split(";")
+            if len(parts) > 1:
+                b64_data: str = parts[1].split(":")[-1].strip()
+                # Pad base64 if necessary
+                b64_data += "=" * (-len(b64_data) % 4)
+                decoded_data: list[str | None] = json.loads(base64.b64decode(b64_data).decode("utf-8"))
+                # The URL is typically the last element in the decoded array
+                if (
+                    isinstance(decoded_data, list)
+                    and isinstance(decoded_data[-1], str)
+                    and decoded_data[-1].startswith("http")
+                ):
+                    real_url = decoded_data[-1]
+        if real_url:
+            url = real_url
+        else:
+            logger.error(f"no real-url found: {url}")
+            continue

-        # The pub_date is mostly a string like 'yesterday', not a real
-        # timezone date or time.  Therefore we can't use publishedDate.
-        pub_date = extract_text(eval_xpath(result, './article//time'))
-        pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
+        title = extract_text(eval_xpath(result, "./h4")) or ""

-        content = ' / '.join([x for x in [pub_origin, pub_date] if x])
+        # The pub_date is mostly a string like 'yesterday', not a real timezone
+        # date or time.  Therefore we can't use publishedDate and place the
+        # *pub* sting into the content.

-        # The image URL is located in a preceding sibling <img> tag, e.g.:
-        # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100"
-        # These URL are long but not personalized (double checked via tor).
+        pub_date = extract_text(eval_xpath(result, ".//time"))
+        pub_origin = extract_text(eval_xpath(result, ".//div[contains(@class, 'vr1PYe')]"))
+        content = " / ".join([x for x in [pub_origin, pub_date] if x])

-        thumbnail = extract_text(result.xpath('preceding-sibling::a/figure/img/@src'))
+        thumbnail: str = eval_xpath_getindex(result, ".//figure/img/@src", 0, default="")
+        if thumbnail and thumbnail.startswith("/"):
+            thumbnail = base_url + thumbnail

-        results.append(
-            {
-                'url': href,
-                'title': title,
-                'content': content,
-                'thumbnail': thumbnail,
-            }
+        res.add(
+            res.types.MainResult(
+                url=url,
+                title=title,
+                content=content,
+                thumbnail=thumbnail,
+            )
        )

-    # return results
-    return results
+    return res


 ceid_list = [
-    'AE:ar',
-    'AR:es-419',
-    'AT:de',
-    'AU:en',
-    'BD:bn',
-    'BE:fr',
-    'BE:nl',
-    'BG:bg',
-    'BR:pt-419',
-    'BW:en',
-    'CA:en',
-    'CA:fr',
-    'CH:de',
-    'CH:fr',
-    'CL:es-419',
-    'CN:zh-Hans',
-    'CO:es-419',
-    'CU:es-419',
-    'CZ:cs',
-    'DE:de',
-    'EG:ar',
-    'ES:es',
-    'ET:en',
-    'FR:fr',
-    'GB:en',
-    'GH:en',
-    'GR:el',
-    'HK:zh-Hant',
-    'HU:hu',
-    'ID:en',
-    'ID:id',
-    'IE:en',
-    'IL:en',
-    'IL:he',
-    'IN:bn',
-    'IN:en',
-    'IN:hi',
-    'IN:ml',
-    'IN:mr',
-    'IN:ta',
-    'IN:te',
-    'IT:it',
-    'JP:ja',
-    'KE:en',
-    'KR:ko',
-    'LB:ar',
-    'LT:lt',
-    'LV:en',
-    'LV:lv',
-    'MA:fr',
-    'MX:es-419',
-    'MY:en',
-    'NA:en',
-    'NG:en',
-    'NL:nl',
-    'NO:no',
-    'NZ:en',
-    'PE:es-419',
-    'PH:en',
-    'PK:en',
-    'PL:pl',
-    'PT:pt-150',
-    'RO:ro',
-    'RS:sr',
-    'RU:ru',
-    'SA:ar',
-    'SE:sv',
-    'SG:en',
-    'SI:sl',
-    'SK:sk',
-    'SN:fr',
-    'TH:th',
-    'TR:tr',
-    'TW:zh-Hant',
-    'TZ:en',
-    'UA:ru',
-    'UA:uk',
-    'UG:en',
-    'US:en',
-    'US:es-419',
-    'VE:es-419',
-    'VN:vi',
-    'ZA:en',
-    'ZW:en',
+    "AE:ar",
+    "AR:es-419",
+    "AT:de",
+    "AU:en",
+    "BD:bn",
+    "BE:fr",
+    "BE:nl",
+    "BG:bg",
+    "BR:pt-419",
+    "BW:en",
+    "CA:en",
+    "CA:fr",
+    "CH:de",
+    "CH:fr",
+    "CL:es-419",
+    "CN:zh-Hans",
+    "CO:es-419",
+    "CU:es-419",
+    "CZ:cs",
+    "DE:de",
+    "EE:et",
+    "EG:ar",
+    "ES:ca",
+    "ES:es",
+    "ET:en",
+    "FI:fi",
+    "FR:fr",
+    "GB:en",
+    "GH:en",
+    "GR:el",
+    "HK:zh-Hant",
+    "HU:hu",
+    "ID:en",
+    "ID:id",
+    "IE:en",
+    "IL:en",
+    "IL:he",
+    "IN:bn",
+    "IN:en",
+    "IN:gu",
+    "IN:hi",
+    "IN:ml",
+    "IN:mr",
+    "IN:pa",
+    "IN:ta",
+    "IN:te",
+    "IT:it",
+    "JP:ja",
+    "KE:en",
+    "KR:ko",
+    "LB:ar",
+    "LT:lt",
+    "LV:en",
+    "LV:lv",
+    "MA:fr",
+    "MY:en",
+    "MY:ms",
+    "NA:en",
+    "NG:en",
+    "NL:nl",
+    "NO:no",
+    "NZ:en",
+    "PH:en",
+    "PK:en",
+    "PL:pl",
+    "RO:ro",
+    "RS:sr",
+    "RU:ru",
+    "SA:ar",
+    "SE:sv",
+    "SG:en",
+    "SI:sl",
+    "SK:sk",
+    "SN:fr",
+    "TH:th",
+    "TR:tr",
+    "TZ:en",
+    "UA:ru",
+    "UA:uk",
+    "UG:en",
+    "US:en",
+    "VN:vi",
+    "ZA:en",
+    "ZW:en",
 ]
 """List of region/language combinations supported by Google News.  Values of the
 ``ceid`` argument of the Google News REST API."""


 _skip_values = [
-    'ET:en',  # english (ethiopia)
-    'ID:en',  # english (indonesia)
-    'LV:en',  # english (latvia)
+    "ET:en",  # english (ethiopia)
+    "ID:en",  # english (indonesia)
+    "LV:en",  # english (latvia)
 ]

-_ceid_locale_map = {'NO:no': 'nb-NO'}
+_ceid_locale_map = {"NO:no": "nb-NO"}


 def fetch_traits(engine_traits: EngineTraits):
    _fetch_traits(engine_traits, add_domains=False)

-    engine_traits.custom['ceid'] = {}
+    engine_traits.custom["ceid"] = {}

    for ceid in ceid_list:
        if ceid in _skip_values:
            continue

-        region, lang = ceid.split(':')
-        x = lang.split('-')
+        region, lang = ceid.split(":")
+        x = lang.split("-")
        if len(x) > 1:
-            if x[1] not in ['Hant', 'Hans']:
+            if x[1] not in ["Hant", "Hans"]:
                lang = x[0]

-        sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
+        sxng_locale = _ceid_locale_map.get(ceid, lang + "-" + region)
        try:
-            locale = babel.Locale.parse(sxng_locale, sep='-')
+            locale = babel.Locale.parse(sxng_locale, sep="-")
        except babel.UnknownLocaleError:
            print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
            continue

-        engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid
+        engine_traits.custom["ceid"][locales.region_tag(locale)] = ceid