From 6521190bb437e535304eeed812b9cab77754c1c3 Mon Sep 17 00:00:00 2001 From: Ivan Gabaldon Date: Wed, 18 Mar 2026 14:55:25 +0100 Subject: [PATCH] [enh] engines: rework bing engine (#5793) * [enh] engines: rework bing engine Only Bing-Web has been reworked. Some features now require JavaScript (paging and time-range results). Cookies no longer work, parameters such as `cc`, `ui`, ... alter the results. The engine only appears to use the locale from `Accept-Language` header properly. The rest of Bing's child engines (Bing-Image, Bing-Video, ...) seem to benefit from using `mkt` param in conjunction with the `Accept-Language` header override, although Bing-Web does not (?) * [enh] explicit mkt * [fix] engines: bing_videos.py https://github.com/searxng/searxng/pull/5793#pullrequestreview-3881883250 --- searx/engines/bing.py | 283 ++++++++++++++--------------------- searx/engines/bing_images.py | 91 +++++------ searx/engines/bing_news.py | 92 ++++++------ searx/engines/bing_videos.py | 86 ++++++----- 4 files changed, 251 insertions(+), 301 deletions(-) diff --git a/searx/engines/bing.py b/searx/engines/bing.py index dd5ce4beb..48537d679 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -1,34 +1,20 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""This is the implementation of the Bing-WEB engine. Some of this +"""This is the implementation of the Bing-Web engine. Some of this implementations are shared by other engines: - :ref:`bing images engine` - :ref:`bing news engine` - :ref:`bing videos engine` -On the `preference page`_ Bing offers a lot of languages an regions (see section -LANGUAGE and COUNTRY/REGION). The Language is the language of the UI, we need -in SearXNG to get the translations of data such as *"published last week"*. - -There is a description of the official search-APIs_, unfortunately this is not -the API we can use or that bing itself would use. 
You can look up some things -in the API to get a better picture of bing, but the value specifications like -the market codes are usually outdated or at least no longer used by bing itself. - -The market codes have been harmonized and are identical for web, video and -images. The news area has also been harmonized with the other categories. Only -political adjustments still seem to be made -- for example, there is no news -category for the Chinese market. - -.. _preference page: https://www.bing.com/account/general -.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/ +.. note:: + Some functionality (paging and time-range results) are not supported + since they depend on JavaScript. """ -# pylint: disable=too-many-branches, invalid-name import base64 import re -import time +import typing as t from urllib.parse import parse_qs, urlencode, urlparse import babel @@ -36,14 +22,17 @@ import babel.languages from lxml import html from searx.enginelib.traits import EngineTraits -from searx.exceptions import SearxEngineAPIException -from searx.locales import language_tag, region_tag +from searx.locales import region_tag from searx.utils import eval_xpath, eval_xpath_getindex, eval_xpath_list, extract_text -about = { +if t.TYPE_CHECKING: + from searx.extended_types import SXNG_Response + from searx.search.processors import OnlineParams + +about: dict[str, t.Any] = { "website": "https://www.bing.com", "wikidata_id": "Q182496", - "official_api_documentation": "https://www.microsoft.com/en-us/bing/apis/bing-web-search-api", + "official_api_documentation": "https://github.com/MicrosoftDocs/bing-docs", "use_official_api": False, "require_api_key": False, "results": "HTML", @@ -51,69 +40,78 @@ about = { # engine dependent config categories = ["general", "web"] -paging = True -max_page = 200 -"""200 pages maximum (``&first=1991``)""" - -time_range_support = True safesearch = True -"""Bing results are always SFW. 
To get NSFW links from bing some age -verification by a cookie is needed / thats not possible in SearXNG. -""" +_safesearch_map: dict[int, str] = { + 0: "off", + 1: "moderate", + 2: "strict", +} +"""Filter results. 0: None, 1: Moderate, 2: Strict""" base_url = "https://www.bing.com/search" -"""Bing (Web) search URL""" +"""Bing-Web search URL""" -def _page_offset(pageno): - return (int(pageno) - 1) * 10 + 1 +def get_locale_params(engine_region: str | None) -> dict[str, str] | None: + """API documentation states the ``mkt`` parameter is *the + recommended primary signal* for locale: + + If known, you are encouraged to always specify the market. + Specifying the market helps Bing route the request and return an + appropriate and optimal response. + + The ``mkt`` parameter takes a full ``<language>-<country>`` code. + + This function is shared with :py:mod:`searx.engines.bing_images`, + :py:mod:`searx.engines.bing_news`, and :py:mod:`searx.engines.bing_videos`. + """ + + if not engine_region or engine_region == "clear": + return None + + return {"mkt": engine_region} -def set_bing_cookies(params, engine_language, engine_region): - params["cookies"]["_EDGE_CD"] = f"m={engine_region}&u={engine_language}" - params["cookies"]["_EDGE_S"] = f"mkt={engine_region}&ui={engine_language}" - logger.debug("bing cookies: %s", params["cookies"]) +def override_accept_language(params: "OnlineParams", engine_region: str | None) -> None: + """Override the ``Accept-Language`` header. + + The default header built by :py:class:`~searx.search.processors.online.OnlineProcessor` + appends ``en;q=0.3`` as a fallback language:: + + Accept-Language: de,de-DE;q=0.7,en;q=0.3 + + Bing seems to better select the results locale based on the + ``Accept-Language`` value header. + + This function is shared with :py:mod:`searx.engines.bing_images`, + :py:mod:`searx.engines.bing_news`, and :py:mod:`searx.engines.bing_videos`.
+ """ + + if not engine_region or engine_region == "clear": + return + + lang = engine_region.split("-")[0] + params["headers"]["Accept-Language"] = f"{engine_region},{lang};q=0.9" -def request(query, params): +def request(query: str, params: "OnlineParams") -> "OnlineParams": """Assemble a Bing-Web request.""" - engine_region = traits.get_region(params["searxng_locale"], traits.all_locale) # type: ignore - engine_language = traits.get_language(params["searxng_locale"], "en") # type: ignore - set_bing_cookies(params, engine_language, engine_region) + engine_region = traits.get_region(params["searxng_locale"], traits.all_locale) - page = params.get("pageno", 1) - query_params = { + override_accept_language(params, engine_region) + + query_params: dict[str, str | int] = { "q": query, - # if arg 'pq' is missed, sometimes on page 4 we get results from page 1, - # don't ask why it is only sometimes / its M$ and they have never been - # deterministic ;) - "pq": query, + "adlt": _safesearch_map.get(params.get("safesearch", 0), "off"), } - # To get correct page, arg first and this arg FORM is needed, the value PERE - # is on page 2, on page 3 its PERE1 and on page 4 its PERE2 .. and so forth. - # The 'first' arg should never send on page 1. - - if page > 1: - query_params["first"] = _page_offset(page) # see also arg FORM - if page == 2: - query_params["FORM"] = "PERE" - elif page > 2: - query_params["FORM"] = "PERE%s" % (page - 2) + locale_params = get_locale_params(engine_region) + if locale_params: + query_params.update(locale_params) params["url"] = f"{base_url}?{urlencode(query_params)}" - if params.get("time_range"): - unix_day = int(time.time() / 86400) - time_ranges = { - "day": "1", - "week": "2", - "month": "3", - "year": f"5_{unix_day - 365}_{unix_day}", - } - params["url"] += f'&filters=ex1:"ez{time_ranges[params["time_range"]]}"' - # in some regions where geoblocking is employed (e.g. 
China), # www.bing.com redirects to the regional version of Bing params["allow_redirects"] = True @@ -121,82 +119,57 @@ def request(query, params): return params -def response(resp): - # pylint: disable=too-many-locals +def response(resp: "SXNG_Response") -> list[dict[str, t.Any]]: + """Get response from Bing-Web""" - results = [] - result_len = 0 + results: list[dict[str, t.Any]] = [] dom = html.fromstring(resp.text) - # parse results again if nothing is found yet - - for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'): - link = eval_xpath_getindex(result, ".//h2/a", 0, None) + for item in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'): + link = eval_xpath_getindex(item, ".//h2/a", 0, None) if link is None: continue - url = link.attrib.get("href") + + href = link.attrib.get("href", "") title = extract_text(link) - content = eval_xpath(result, ".//p") - for p in content: - # Make sure that the element is free of: - # Web - for e in p.xpath('.//span[@class="algoSlug_icon"]'): - e.getparent().remove(e) - content = extract_text(content) + if not href or not title: + continue - # get the real URL - if url.startswith("https://www.bing.com/ck/a?"): - # get the first value of u parameter - url_query = urlparse(url).query - parsed_url_query = parse_qs(url_query) - param_u = parsed_url_query["u"][0] - # remove "a1" in front - encoded_url = param_u[2:] - # add padding - encoded_url = encoded_url + "=" * (-len(encoded_url) % 4) - # decode base64 encoded URL - url = base64.urlsafe_b64decode(encoded_url).decode() + # what about cn.bing.com, ..? 
+ if href.startswith("https://www.bing.com/ck/a?"): + qs = parse_qs(urlparse(href).query) + u_values = qs.get("u") + if u_values: + u_val = u_values[0] + if u_val.startswith("a1"): + encoded = u_val[2:] + # base64url without padding + encoded += "=" * (-len(encoded) % 4) + href = base64.urlsafe_b64decode(encoded).decode("utf-8", errors="replace") - # append result - results.append({"url": url, "title": title, "content": content}) + # remove decorative icons that Bing injects into <p> elements + # (`<span class="algoSlug_icon">`) + content_els = eval_xpath(item, ".//p") + for p in content_els: + for icon in p.xpath('.//span[@class="algoSlug_icon"]'): + icon.getparent().remove(icon) + content = extract_text(content_els) + + results.append({"url": href, "title": title, "content": content}) - # get number_of_results if results: result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) - if "-" in result_len_container: - start_str, result_len_container = re.split(r"-\d+", result_len_container) - start = int(start_str) - else: - start = 1 + result_len_container = re.sub(r"[^0-9]", "", result_len_container) + if result_len_container: + results.append({"number_of_results": int(result_len_container)}) - result_len_container = re.sub("[^0-9]", "", result_len_container) - if len(result_len_container) > 0: - result_len = int(result_len_container) - - expected_start = _page_offset(resp.search_params.get("pageno", 1)) - - if expected_start != start: - if expected_start > result_len: - # Avoid reading more results than available. - # For example, if there is 100 results from some search and we try to get results from 120 to 130, - # Bing will send back the results from 0 to 10 and no error. - # If we compare results count with the first parameter of the request we can avoid this "invalid" - # results. - return [] - - # Sometimes Bing will send back the first result page instead of the requested page as a rate limiting - # measure.
- msg = f"Expected results to start at {expected_start}, but got results starting at {start}" - raise SearxEngineAPIException(msg) - - results.append({"number_of_results": result_len}) return results -def fetch_traits(engine_traits: EngineTraits): - """Fetch languages and regions from Bing-Web.""" +def fetch_traits(engine_traits: EngineTraits) -> None: + """Fetch regions from Bing-Web.""" # pylint: disable=import-outside-toplevel from searx.network import get # see https://github.com/searxng/searxng/issues/762 @@ -219,47 +192,10 @@ def fetch_traits(engine_traits: EngineTraits): dom = html.fromstring(resp.text) - # languages - - engine_traits.languages["zh"] = "zh-hans" - - map_lang = {"prs": "fa-AF", "en": "en-us"} - bing_ui_lang_map = { - # HINT: this list probably needs to be supplemented - "en": "us", # en --> en-us - "da": "dk", # da --> da-dk + map_market_codes: dict[str, str] = { + "zh-hk": "en-hk", # not sure why, but at Microslop this is the market code for Hongkong } - for href in eval_xpath(dom, '//div[@id="language-section-content"]//div[@class="languageItem"]/a/@href'): - eng_lang = parse_qs(urlparse(href).query)["setlang"][0] - babel_lang = map_lang.get(eng_lang, eng_lang) - try: - sxng_tag = language_tag(babel.Locale.parse(babel_lang.replace("-", "_"))) - except babel.UnknownLocaleError: - print("ERROR: language (%s) is unknown by babel" % (babel_lang)) - continue - # Language (e.g. 'en' or 'de') from https://www.bing.com/account/general - # is converted by bing to 'en-us' or 'de-de'. But only if there is not - # already a '-' delemitter in the language. 
For instance 'pt-PT' --> - # 'pt-pt' and 'pt-br' --> 'pt-br' - bing_ui_lang = eng_lang.lower() - if "-" not in bing_ui_lang: - bing_ui_lang = bing_ui_lang + "-" + bing_ui_lang_map.get(bing_ui_lang, bing_ui_lang) - - conflict = engine_traits.languages.get(sxng_tag) - if conflict: - if conflict != bing_ui_lang: - print(f"CONFLICT: babel {sxng_tag} --> {conflict}, {bing_ui_lang}") - continue - engine_traits.languages[sxng_tag] = bing_ui_lang - - # regions (aka "market codes") - - engine_traits.regions["zh-CN"] = "zh-cn" - - map_market_codes = { - "zh-hk": "en-hk", # not sure why, but at M$ this is the market code for Hongkong - } for href in eval_xpath(dom, '//div[@id="region-section-content"]//div[@class="regionItem"]/a/@href'): cc_tag = parse_qs(urlparse(href).query)["cc"][0] if cc_tag == "clear": @@ -268,17 +204,20 @@ def fetch_traits(engine_traits: EngineTraits): # add market codes from official languages of the country .. for lang_tag in babel.languages.get_official_languages(cc_tag, de_facto=True): - if lang_tag not in engine_traits.languages.keys(): - # print("ignore lang: %s <-- %s" % (cc_tag, lang_tag)) - continue lang_tag = lang_tag.split("_")[0] # zh_Hant --> zh market_code = f"{lang_tag}-{cc_tag}" # zh-tw - market_code = map_market_codes.get(market_code, market_code) - sxng_tag = region_tag(babel.Locale.parse("%s_%s" % (lang_tag, cc_tag.upper()))) + + try: + sxng_tag = region_tag(babel.Locale.parse("%s_%s" % (lang_tag, cc_tag.upper()))) + except babel.UnknownLocaleError: + # silently ignore unknown languages + continue + conflict = engine_traits.regions.get(sxng_tag) if conflict: if conflict != market_code: print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, market_code)) - continue + continue + engine_traits.regions[sxng_tag] = market_code diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index d56129707..a8439a414 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -1,96 +1,101 @@ # 
SPDX-License-Identifier: AGPL-3.0-or-later """Bing-Images: description see :py:obj:`searx.engines.bing`.""" -# pylint: disable=invalid-name + import json from urllib.parse import urlencode from lxml import html -from searx.engines.bing import set_bing_cookies -from searx.engines.bing import fetch_traits # pylint: disable=unused-import +from searx.engines.bing import ( # pylint: disable=unused-import + fetch_traits, + get_locale_params, + override_accept_language, +) # about about = { - "website": 'https://www.bing.com/images', - "wikidata_id": 'Q182496', - "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-image-search-api', + "website": "https://www.bing.com/images", + "wikidata_id": "Q182496", + "official_api_documentation": "https://github.com/MicrosoftDocs/bing-docs", "use_official_api": False, "require_api_key": False, - "results": 'HTML', + "results": "HTML", } # engine dependent config -categories = ['images', 'web'] +categories = ["images", "web"] paging = True safesearch = True time_range_support = True - -base_url = 'https://www.bing.com/images/async' -"""Bing (Images) search URL""" - time_map = { - 'day': 60 * 24, - 'week': 60 * 24 * 7, - 'month': 60 * 24 * 31, - 'year': 60 * 24 * 365, + "day": 60 * 24, + "week": 60 * 24 * 7, + "month": 60 * 24 * 31, + "year": 60 * 24 * 365, } +base_url = "https://www.bing.com/images/async" +"""Bing-Image search URL""" + def request(query, params): """Assemble a Bing-Image request.""" - engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore - engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore - set_bing_cookies(params, engine_language, engine_region) + engine_region = traits.get_region(params["searxng_locale"], traits.all_locale) + + override_accept_language(params, engine_region) # build URL query - # - example: https://www.bing.com/images/async?q=foo&async=content&first=1&count=35 + # - example: 
https://www.bing.com/images/async?q=foo&async=1&first=1&count=35 query_params = { - 'q': query, - 'async': '1', + "q": query, + "async": "1", # to simplify the page count lets use the default of 35 images per page - 'first': (int(params.get('pageno', 1)) - 1) * 35 + 1, - 'count': 35, + "first": (int(params.get("pageno", 1)) - 1) * 35 + 1, + "count": 35, } + locale_params = get_locale_params(engine_region) + if locale_params: + query_params.update(locale_params) + # time range - # - example: one year (525600 minutes) 'qft=+filterui:age-lt525600' + # - example: one year (525600 minutes) 'qft=filterui:age-lt525600' + if params["time_range"]: + query_params["qft"] = "filterui:age-lt%s" % time_map[params["time_range"]] - if params['time_range']: - query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']] - - params['url'] = base_url + '?' + urlencode(query_params) + params["url"] = base_url + "?" + urlencode(query_params) return params def response(resp): - """Get response from Bing-Images""" + """Get response from Bing-Image""" results = [] + dom = html.fromstring(resp.text) for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'): - metadata = result.xpath('.//a[@class="iusc"]/@m') if not metadata: continue metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0]) - title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip() - img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip().split(" · ") - source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip() + title = " ".join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip() + img_format = " ".join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip().split(" · ") + source = " ".join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip() results.append( { - 'template': 'images.html', - 'url': metadata['purl'], - 'thumbnail_src': metadata['turl'], - 'img_src': 
metadata['murl'], - 'content': metadata.get('desc'), - 'title': title, - 'source': source, - 'resolution': img_format[0], - 'img_format': img_format[1] if len(img_format) >= 2 else None, + "template": "images.html", + "url": metadata["purl"], + "thumbnail_src": metadata["turl"], + "img_src": metadata["murl"], + "content": metadata.get("desc"), + "title": title, + "source": source, + "resolution": img_format[0], + "img_format": img_format[1] if len(img_format) >= 2 else None, } ) return results diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 43cf575a6..3c10b53d3 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -7,92 +7,90 @@ """ -# pylint: disable=invalid-name - from urllib.parse import urlencode from lxml import html -from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex from searx.enginelib.traits import EngineTraits -from searx.engines.bing import set_bing_cookies +from searx.engines.bing import ( + get_locale_params, + override_accept_language, +) +from searx.utils import eval_xpath, eval_xpath_getindex, eval_xpath_list, extract_text # about about = { - "website": 'https://www.bing.com/news', - "wikidata_id": 'Q2878637', - "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-news-search-api', + "website": "https://www.bing.com/news", + "wikidata_id": "Q2878637", + "official_api_documentation": "https://github.com/MicrosoftDocs/bing-docs", "use_official_api": False, "require_api_key": False, - "results": 'RSS', + "results": "RSS", } # engine dependent config -categories = ['news'] +categories = ["news"] paging = True """If go through the pages and there are actually no new results for another page, then bing returns the results from the last page again.""" time_range_support = True time_map = { - 'day': 'interval="4"', - 'week': 'interval="7"', - 'month': 'interval="9"', + "day": 'interval="4"', + "week": 'interval="7"', + "month": 'interval="9"', } 
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the difference of *last day* and *last week* in the result list is just marginally. Bing does not have news range ``year`` / we use ``month`` instead.""" -base_url = 'https://www.bing.com/news/infinitescrollajax' +base_url = "https://www.bing.com/news/infinitescrollajax" """Bing (News) search URL""" def request(query, params): """Assemble a Bing-News request.""" - engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore - engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore - set_bing_cookies(params, engine_language, engine_region) + engine_region = traits.get_region(params["searxng_locale"], traits.all_locale) + + override_accept_language(params, engine_region) # build URL query - # - # example: https://www.bing.com/news/infinitescrollajax?q=london&first=1 - - page = int(params.get('pageno', 1)) - 1 + # - example: https://www.bing.com/news/infinitescrollajax?q=london&first=1 + page = int(params.get("pageno", 1)) - 1 query_params = { - 'q': query, - 'InfiniteScroll': 1, + "q": query, + "InfiniteScroll": 1, # to simplify the page count lets use the default of 10 images per page - 'first': page * 10 + 1, - 'SFX': page, - 'form': 'PTFTNR', - 'setlang': engine_region.split('-')[0], - 'cc': engine_region.split('-')[-1], + "first": page * 10 + 1, + "SFX": page, + "form": "PTFTNR", } - if params['time_range']: - query_params['qft'] = time_map.get(params['time_range'], 'interval="9"') + locale_params = get_locale_params(engine_region) + if locale_params: + query_params.update(locale_params) - params['url'] = base_url + '?' + urlencode(query_params) + if params["time_range"]: + query_params["qft"] = time_map.get(params["time_range"], 'interval="9"') + + params["url"] = base_url + "?" 
+ urlencode(query_params) return params def response(resp): - """Get response from Bing-Video""" - results = [] + """Parse the Bing-News response.""" - if not resp.ok or not resp.text: - return results + results = [] dom = html.fromstring(resp.text) for newsitem in eval_xpath_list(dom, '//div[contains(@class, "newsitem")]'): - link = eval_xpath_getindex(newsitem, './/a[@class="title"]', 0, None) if link is None: continue - url = link.attrib.get('href') + url = link.attrib.get("href") title = extract_text(link) content = extract_text(eval_xpath(newsitem, './/div[@class="snippet"]')) @@ -100,31 +98,31 @@ def response(resp): source = eval_xpath_getindex(newsitem, './/div[contains(@class, "source")]', 0, None) if source is not None: for item in ( - eval_xpath_getindex(source, './/span[@aria-label]/@aria-label', 0, None), + eval_xpath_getindex(source, ".//span[@aria-label]/@aria-label", 0, None), # eval_xpath_getindex(source, './/a', 0, None), # eval_xpath_getindex(source, './div/span', 3, None), - link.attrib.get('data-author'), + link.attrib.get("data-author"), ): if item is not None: t = extract_text(item) if t and t.strip(): metadata.append(t.strip()) - metadata = ' | '.join(metadata) + metadata = " | ".join(metadata) thumbnail = None imagelink = eval_xpath_getindex(newsitem, './/a[@class="imagelink"]//img', 0, None) if imagelink is not None: - thumbnail = imagelink.attrib.get('src') + thumbnail = imagelink.attrib.get("src") if not thumbnail.startswith("https://www.bing.com"): - thumbnail = 'https://www.bing.com/' + thumbnail + thumbnail = "https://www.bing.com/" + thumbnail results.append( { - 'url': url, - 'title': title, - 'content': content, - 'thumbnail': thumbnail, - 'metadata': metadata, + "url": url, + "title": title, + "content": content, + "thumbnail": thumbnail, + "metadata": metadata, } ) @@ -148,4 +146,4 @@ def fetch_traits(engine_traits: EngineTraits): # bot. # HINT: 'en-hk' is the region code it does not indicate the language en!! 
- engine_traits.regions['zh-CN'] = 'en-hk' + engine_traits.regions["zh-CN"] = "en-hk" diff --git a/searx/engines/bing_videos.py b/searx/engines/bing_videos.py index cfa129791..6169c5ec6 100644 --- a/searx/engines/bing_videos.py +++ b/searx/engines/bing_videos.py @@ -1,5 +1,4 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=invalid-name """Bing-Videos: description see :py:obj:`searx.engines.bing`.""" import json @@ -7,81 +6,90 @@ from urllib.parse import urlencode from lxml import html -from searx.engines.bing import set_bing_cookies -from searx.engines.bing import fetch_traits # pylint: disable=unused-import +from searx.engines.bing import ( # pylint: disable=unused-import + fetch_traits, + get_locale_params, + override_accept_language, +) from searx.engines.bing_images import time_map - +from searx.utils import eval_xpath, eval_xpath_getindex about = { - "website": 'https://www.bing.com/videos', - "wikidata_id": 'Q4914152', - "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-video-search-api', + "website": "https://www.bing.com/videos", + "wikidata_id": "Q4914152", + "official_api_documentation": "https://github.com/MicrosoftDocs/bing-docs", "use_official_api": False, "require_api_key": False, - "results": 'HTML', + "results": "HTML", } # engine dependent config -categories = ['videos', 'web'] +categories = ["videos", "web"] paging = True safesearch = True time_range_support = True -base_url = 'https://www.bing.com/videos/asyncv2' -"""Bing (Videos) async search URL.""" +base_url = "https://www.bing.com/videos/asyncv2" +"""Bing-Video search URL""" def request(query, params): """Assemble a Bing-Video request.""" - engine_region = traits.get_region(params['searxng_locale'], traits.all_locale) # type: ignore - engine_language = traits.get_language(params['searxng_locale'], 'en') # type: ignore - set_bing_cookies(params, engine_language, engine_region) + engine_region = traits.get_region(params["searxng_locale"], 
traits.all_locale) + + override_accept_language(params, engine_region) # build URL query - # - # example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35 - + # - example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35 query_params = { - 'q': query, - 'async': 'content', - # to simplify the page count lets use the default of 35 images per page - 'first': (int(params.get('pageno', 1)) - 1) * 35 + 1, - 'count': 35, + "q": query, + "async": "content", + # to simplify the page count lets use the default of 35 videos per page + "first": (int(params.get("pageno", 1)) - 1) * 35 + 1, + "count": 35, } + locale_params = get_locale_params(engine_region) + if locale_params: + query_params.update(locale_params) + # time range - # - # example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR' + # - example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR' + if params["time_range"]: + query_params["form"] = "VRFLTR" + query_params["qft"] = " filterui:videoage-lt%s" % time_map[params["time_range"]] - if params['time_range']: - query_params['form'] = 'VRFLTR' - query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']] - - params['url'] = base_url + '?' + urlencode(query_params) + params["url"] = base_url + "?" 
+ urlencode(query_params) return params def response(resp): """Get response from Bing-Video""" + results = [] dom = html.fromstring(resp.text) - for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'): - metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0]) - info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip() - content = '{0} - {1}'.format(metadata['du'], info) - thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0] + for result in dom.xpath('//div[contains(@id, "mc_vtvc_video")]'): + metadata = json.loads(eval_xpath_getindex(result, './/div[@class="vrhdata"]/@vrhm', index=0)) + info = " - ".join(eval_xpath(result, './/div[@class="mc_vtvc_meta_block"]//span/text()')).strip() + thumbnail = eval_xpath_getindex( + result, + './/img[starts-with(@class, "rms")]/@data-src-hq', + index=0, + default=None, + ) results.append( { - 'url': metadata['murl'], - 'thumbnail': thumbnail, - 'title': metadata.get('vt', ''), - 'content': content, - 'template': 'videos.html', + "url": metadata["murl"], + "thumbnail": thumbnail, + "title": metadata.get("vt", ""), + "content": info, + "length": metadata["du"], + "template": "videos.html", } )