# SPDX-License-Identifier: AGPL-3.0-or-later """This is the implementation of the Google News engine. Google News has a different region handling compared to Google WEB. - the ``ceid`` argument has to be set (:py:obj:`ceid_list`) - the hl_ argument has to be set correctly (and different to Google WEB) - the gl_ argument is mandatory If one of this argument is not set correctly, the request is redirected to CONSENT dialog:: https://consent.google.com/m?continue= The google news API ignores some parameters from the common :ref:`google API`: - num_ : the number of search results is ignored / there is no paging all results for a query term are in the first response. - save_ : is ignored / Google-News results are always *SafeSearch* .. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp .. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp """ import typing as t import json import base64 from urllib.parse import urlencode from lxml import html import babel from searx import locales from searx.utils import ( eval_xpath, eval_xpath_list, eval_xpath_getindex, extract_text, ) from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import from searx.engines.google import ( get_google_info, detect_google_sorry, ) from searx.enginelib.traits import EngineTraits from searx.result_types import EngineResults if t.TYPE_CHECKING: from searx.extended_types import SXNG_Response from searx.search.processors import OnlineParams # about about = { "website": "https://news.google.com", "wikidata_id": "Q12020", "official_api_documentation": "https://developers.google.com/custom-search", "use_official_api": False, "require_api_key": False, "results": "HTML", } # engine dependent config categories = ["news"] paging = False time_range_support = False # Google-News results are always *SafeSearch*. Option 'safesearch' is set to # False here. # # safesearch : results are identical for safesearch=0 and safesearch=2 safesearch = True base_url: str = "https://news.google.com" def request(query: str, params: "OnlineParams") -> None: """Google-News search request""" sxng_locale = params.get("searxng_locale", "en-US") ceid: str = locales.get_engine_locale( sxng_locale, traits.custom["ceid"], default="US:en" ) # pyright: ignore[reportAssignmentType] google_info = get_google_info(params, traits) google_info["subdomain"] = "news.google.com" # google news has only one domain ceid_region, ceid_lang = ceid.split(":") ceid_lang, ceid_suffix = ( ceid_lang.split(":") + [ "", ] )[:2] google_info["params"]["hl"] = ceid_lang if ceid_suffix and ceid_suffix not in ["Hans", "Hant"]: if ceid_region.lower() == ceid_lang: google_info["params"]["hl"] = ceid_lang + "-" + ceid_region else: google_info["params"]["hl"] = ceid_lang + "-" + ceid_suffix elif ceid_region.lower() != ceid_lang: if ceid_region in ["AT", "BE", "CH", "IL", "SA", "IN", "BD", "PT"]: google_info["params"]["hl"] = ceid_lang else: google_info["params"]["hl"] = ceid_lang + "-" + ceid_region google_info["params"]["lr"] = "lang_" + ceid_lang.split("-")[0] google_info["params"]["gl"] = ceid_region query_url = ( "https://" + google_info["subdomain"] + "/search?" + urlencode( {"q": query, **google_info["params"]}, ) # ceid includes a ':' character which must not be urlencoded + ("&ceid=%s" % ceid) ) params["url"] = query_url params["cookies"] = google_info["cookies"] params["headers"].update(google_info["headers"]) def response(resp: "SXNG_Response") -> EngineResults: """Get response from google's search request""" res = EngineResults() detect_google_sorry(resp) # convert the text to dom dom = html.fromstring(resp.text) for result in eval_xpath_list(dom, "//div[@jslog and @data-n-tid and @jsdata]"): url: str = eval_xpath_getindex(result, "./a[@target='_blank']/@href", 0, default=0) if not url: continue if url.startswith("./"): url = base_url + url[1:] # The real URL is often encoded in the "jslog" attribute jslog: str | None = eval_xpath_getindex(result, "./a[@target='_blank']/@jslog", 0, default=None) # Try to extract the real URL from jslog real_url: str | None = None if jslog: # jslog format is usually: "95014; 5:; track:click,vis". We # want the second part (index 1) after splitting by ";" parts: list[str] = jslog.split(";") if len(parts) > 1: b64_data: str = parts[1].split(":")[-1].strip() # Pad base64 if necessary b64_data += "=" * (-len(b64_data) % 4) decoded_data: list[str | None] = json.loads(base64.b64decode(b64_data).decode("utf-8")) # The URL is typically the last element in the decoded array if ( isinstance(decoded_data, list) and isinstance(decoded_data[-1], str) and decoded_data[-1].startswith("http") ): real_url = decoded_data[-1] if real_url: url = real_url else: logger.error(f"no real-url found: {url}") continue title = extract_text(eval_xpath(result, "./h4")) or "" # The pub_date is mostly a string like 'yesterday', not a real timezone # date or time. Therefore we can't use publishedDate and place the # *pub* sting into the content. pub_date = extract_text(eval_xpath(result, ".//time")) pub_origin = extract_text(eval_xpath(result, ".//div[contains(@class, 'vr1PYe')]")) content = " / ".join([x for x in [pub_origin, pub_date] if x]) thumbnail: str = eval_xpath_getindex(result, ".//figure/img/@src", 0, default="") if thumbnail and thumbnail.startswith("/"): thumbnail = base_url + thumbnail res.add( res.types.MainResult( url=url, title=title, content=content, thumbnail=thumbnail, ) ) return res ceid_list = [ "AE:ar", "AR:es-419", "AT:de", "AU:en", "BD:bn", "BE:fr", "BE:nl", "BG:bg", "BR:pt-419", "BW:en", "CA:en", "CA:fr", "CH:de", "CH:fr", "CL:es-419", "CN:zh-Hans", "CO:es-419", "CU:es-419", "CZ:cs", "DE:de", "EE:et", "EG:ar", "ES:ca", "ES:es", "ET:en", "FI:fi", "FR:fr", "GB:en", "GH:en", "GR:el", "HK:zh-Hant", "HU:hu", "ID:en", "ID:id", "IE:en", "IL:en", "IL:he", "IN:bn", "IN:en", "IN:gu", "IN:hi", "IN:ml", "IN:mr", "IN:pa", "IN:ta", "IN:te", "IT:it", "JP:ja", "KE:en", "KR:ko", "LB:ar", "LT:lt", "LV:en", "LV:lv", "MA:fr", "MY:en", "MY:ms", "NA:en", "NG:en", "NL:nl", "NO:no", "NZ:en", "PH:en", "PK:en", "PL:pl", "RO:ro", "RS:sr", "RU:ru", "SA:ar", "SE:sv", "SG:en", "SI:sl", "SK:sk", "SN:fr", "TH:th", "TR:tr", "TZ:en", "UA:ru", "UA:uk", "UG:en", "US:en", "VN:vi", "ZA:en", "ZW:en", ] """List of region/language combinations supported by Google News. Values of the ``ceid`` argument of the Google News REST API.""" _skip_values = [ "ET:en", # english (ethiopia) "ID:en", # english (indonesia) "LV:en", # english (latvia) ] _ceid_locale_map = {"NO:no": "nb-NO"} def fetch_traits(engine_traits: EngineTraits): _fetch_traits(engine_traits, add_domains=False) engine_traits.custom["ceid"] = {} for ceid in ceid_list: if ceid in _skip_values: continue region, lang = ceid.split(":") x = lang.split("-") if len(x) > 1: if x[1] not in ["Hant", "Hans"]: lang = x[0] sxng_locale = _ceid_locale_map.get(ceid, lang + "-" + region) try: locale = babel.Locale.parse(sxng_locale, sep="-") except babel.UnknownLocaleError: print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale)) continue engine_traits.custom["ceid"][locales.region_tag(locale)] = ceid