#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/sxng_locales.py`

:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
  Persistence of engines traits, fetched from the engines.

:origin:`searx/sxng_locales.py`
  Is generated from intersecting each engine's supported traits.

The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`

"""

# pylint: disable=invalid-name
import typing as t

from collections import Counter
from unicodedata import lookup
from pathlib import Path
from pprint import pformat

import babel
import typer

from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap

# Output files.
sxng_locales_file = Path(searx_dir) / 'sxng_locales.py'

# Text written in front of the generated tuple.  The trailing "(" opens the
# tuple literal whose items are appended by write_sxng_locales_file().
sxng_locales_file_header = """\
# SPDX-License-Identifier: AGPL-3.0-or-later
'''List of SearXNG's locale codes used for the search language/region.

.. hint::

   Don't modify this file, this file is generated by::

     ./manage data.traits
'''

sxng_locales = (
"""

# Text written behind the generated tuple items: closes the tuple literal and
# adds the attribute documentation of ``sxng_locales``.  This is a regular (not
# raw) string, so the \U escapes below end up as real characters in the output.
sxng_locales_file_footer = """,
)
'''
A list of five-digit tuples:

0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
   Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
   are represented by a globe (\U0001f310)

.. code:: python

   ('en', 'English', '', 'English', '\U0001f310'),
   ('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
   ('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
   ..
   ('fr', 'Français', '', 'French', '\U0001f310'),
   ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
   ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),

:meta hide-value:
'''
"""

# Hard-wired flags.  get_unicode_flag() consults this table twice: first with
# the locale's language code, then with the lower-cased territory code.
lang2emoji = {
    'ha': '\U0001f1f3\U0001f1ea',  # Hausa / Niger
    'bs': '\U0001f1e7\U0001f1e6',  # Bosnian / Bosnia & Herzegovina
    'jp': '\U0001f1ef\U0001f1f5',  # Japanese
    'ua': '\U0001f1fa\U0001f1e6',  # Ukrainian
    'he': '\U0001f1ee\U0001f1f1',  # Hebrew
}

app = typer.Typer()


@app.command()
def cli(engines: t.Annotated[list[str] | None, typer.Argument()] = None):
    """Update ``data/engine_traits.json`` and ``sxng_locales.py``.

    Optionally, if arguments are provided via the command line, these are
    interpreted as the names of the engines that should be updated.  All other
    engines will be left untouched.
    """

    if engines:
        known_names: set[str] = {e["name"] for e in settings["engines"]}
        unknown: list[str] = [e for e in engines if e not in known_names]
        if unknown:
            print(f"ERROR: unknown engines --> {', '.join(unknown)}")
            raise typer.Exit(42)

    # Select the engines to process (all of them when no names were given) and
    # force them active before they are handed to load_engines().
    engines_cfg: list[dict[str, t.Any]] = []
    for eng_data in settings["engines"]:
        if not engines or eng_data["name"] in engines:
            eng_data["inactive"] = False
            engines_cfg.append(eng_data)

    load_engines(engines_cfg)
    traits_map: EngineTraitsMap = fetch_traits_map()

    if engines:
        # Partial update: start from the persisted traits and overlay only the
        # freshly fetched engines.
        _map = EngineTraitsMap.from_data()
        _map.update(traits_map)
        traits_map = _map

    print(f"write json file: {traits_map.ENGINE_TRAITS_FILE}")
    traits_map.save_data()
    sxng_tag_list = filter_locales(traits_map)
    write_sxng_locales_file(sxng_tag_list)


def fetch_traits_map() -> EngineTraitsMap:
    """Fetch the traits of the loaded engines and return them as a map.

    Persisting the result is left to the caller (see :py:obj:`cli`).
    """

    network.set_timeout_for_thread(10.0)

    def log(msg: str):
        print(msg)

    traits_map = EngineTraitsMap.fetch_traits(log=log)
    print(f"fetched properties from {len(traits_map)} engines")
    return traits_map


def filter_locales(
    traits_map: EngineTraitsMap,
    *,
    min_eng_per_region: int = 18,
    min_eng_per_lang: int = 22,
) -> set[str]:
    """Filter language & region tags by a threshold.

    A region tag is kept when at least ``min_eng_per_region`` engines list it,
    a language tag when at least ``min_eng_per_lang`` engines list it.  The
    language part of every kept region tag (``en`` of ``en-US``) is always part
    of the result, whatever its own count is.

    :param traits_map: traits of the engines to evaluate
    :param min_eng_per_region: minimum number of engines per region tag
    :param min_eng_per_lang: minimum number of engines per language tag
    :return: set of the selected locale tags
    """

    region_count: Counter[str] = Counter(reg for eng in traits_map.values() for reg in eng.regions)
    regions = {tag for tag, count in region_count.items() if count >= min_eng_per_region}
    lang_from_region = {tag.split('-')[0] for tag in regions}

    # ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they
    # already counted by existence of 'zh' or 'sr', 'pa')
    lang_count: Counter[str] = Counter(
        lang for eng in traits_map.values() for lang in eng.languages if '_' not in lang
    )
    languages = {tag for tag, count in lang_count.items() if count >= min_eng_per_lang}

    return regions | lang_from_region | languages


def write_sxng_locales_file(sxng_tag_list: set[str]) -> None:
    """Generate ``sxng_locales.py`` from the given locale tags.

    For each tag (in sorted order) one tuple is written, its layout is
    described in ``sxng_locales_file_footer``.

    :param sxng_tag_list: locale tags, language and territory separated by ``-``
    """

    language_codes: list[tuple[str, str, str, str, str]] = []

    for sxng_tag in sorted(sxng_tag_list):
        sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
        flag = get_unicode_flag(sxng_locale) or ''
        english_name = sxng_locale.english_name

        language_codes.append(
            (
                sxng_tag,
                sxng_locale.get_language_name().title(),  # pyright: ignore[reportOptionalMemberAccess]
                sxng_locale.get_territory_name() or '',
                english_name.split(' (')[0] if english_name else '',
                UnicodeEscape(flag),
            )
        )

    # pformat() wraps the tuple in parentheses, these are stripped by [1:-1]
    # since the header and the footer already contain them.  The single space in
    # the template completes the indentation of the first item.
    #
    # The content is built before the file is opened, so that an error while
    # formatting does not leave a truncated file behind.
    file_content = "{header} {language_codes}{footer}".format(
        header=sxng_locales_file_header,
        language_codes=pformat(tuple(language_codes), width=120, indent=4)[1:-1],
        footer=sxng_locales_file_footer,
    )

    with sxng_locales_file.open('w', encoding='utf-8') as new_file:
        new_file.write(file_content)


class UnicodeEscape(str):
    """Escape unicode string in :py:obj:`pprint.pformat`"""

    def __repr__(self):
        # The 'unicode-escape' codec emits ASCII bytes only, decoding them as
        # ASCII gives the escaped text (e.g. \U0001f310) instead of the emoji.
        return "'" + self.encode('unicode-escape').decode('ascii') + "'"


def get_unicode_flag(locale: babel.Locale) -> str | None:
    """Determine a unicode flag (emoji) that fits to the ``locale``

    Order of lookup:

    1. hard-wired flag for the language (``lang2emoji``)
    2. globe, if the locale has no territory
    3. hard-wired flag for the lower-cased territory (``lang2emoji``)
    4. regional indicator symbols of the first two territory characters

    :return: the flag, or ``None`` if no regional indicator symbol exists for
       the territory (an ``ERROR:`` line is printed in this case)
    """

    emoji = lang2emoji.get(locale.language)
    if emoji:
        return emoji

    if not locale.territory:
        return '\U0001f310'

    emoji = lang2emoji.get(locale.territory.lower())
    if emoji:
        return emoji

    try:
        c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
        c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
    except KeyError as exc:
        # unicodedata.lookup() raises KeyError for names that do not exist
        print(f"ERROR: {locale} --> {exc}")
        return None

    return c1 + c2


if __name__ == "__main__":
    app()