[mod] drop fasttext-predict (#5795)

Removes the `fasttext-predict` dependency and the language detection code. If a user now selects `auto` for the search language, the detected language now falls back directly to the `Accept-Language` header sent by the browser (which was already the fallback when fasttext returned no result). - fasttext's [language detection is unreliable](https://github.com/searxng/searxng/issues/4195) for some languages, especially short search queries, and in particular for queries containing proper names which is a common case. - `fasttext-predict` consumes [significant memory](https://github.com/searxng/searxng/pull/1969#issuecomment-1345366676) without offering users much real value. - the upstream fasttext project was archived by Meta in 2024 - users already have two better alternatives: the `Accept-Language` header and the search-syntax language prefix (e.g. `:fr` or `:de`). Related: https://github.com/searxng/searxng/issues/4195 Closes: https://github.com/searxng/searxng/issues/5790
2026-05-17 16:34:32 +02:00 · 2026-03-06 22:40:44 +08:00
parent c7ba2158f9
commit 68ff08f224
5 changed files with 5 additions and 127 deletions
@@ -25,16 +25,11 @@ from lxml.etree import XPath, XPathError, XPathSyntaxError
 from lxml.etree import ElementBase, _Element  # pyright: ignore[reportPrivateUsage]

 from searx import settings
-from searx.data import USER_AGENTS, data_dir, gsa_useragents_loader
+from searx.data import USER_AGENTS, gsa_useragents_loader
 from searx.version import VERSION_TAG
-from searx.sxng_locales import sxng_locales
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
 from searx import logger

-if t.TYPE_CHECKING:
-    import fasttext.FastText  # type: ignore
-
-
 logger = logger.getChild('utils')

 XPathSpecType: t.TypeAlias = str | XPath
@@ -61,12 +56,6 @@ _JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
 _XPATH_CACHE: dict[str, XPath] = {}
 _LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}

-_FASTTEXT_MODEL: "fasttext.FastText._FastText | None" = None  # pyright: ignore[reportPrivateUsage]
-"""fasttext model to predict language of a search term"""
-
-SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
-"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
-

 class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.
@@ -610,17 +599,6 @@ def eval_xpath_getindex(
    return default


-def _get_fasttext_model() -> "fasttext.FastText._FastText":  # pyright: ignore[reportPrivateUsage]
-    global _FASTTEXT_MODEL  # pylint: disable=global-statement
-    if _FASTTEXT_MODEL is None:
-        import fasttext  # pylint: disable=import-outside-toplevel
-
-        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
-        fasttext.FastText.eprint = lambda x: None  # type: ignore
-        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))  # type: ignore
-    return _FASTTEXT_MODEL
-
-
 def get_embeded_stream_url(url: str):
    """
    Converts a standard video URL into its embed format. Supported services include Youtube,
@@ -683,77 +661,6 @@ def get_embeded_stream_url(url: str):
    return iframe_src


-def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> str | None:
-    """Detect the language of the ``text`` parameter.
-
-    :param str text: The string whose language is to be detected.
-
-    :param float threshold: Threshold filters the returned labels by a threshold
-        on probability.  A choice of 0.3 will return labels with at least 0.3
-        probability.
-
-    :param bool only_search_languages: If ``True``, returns only supported
-        SearXNG search languages.  see :py:obj:`searx.languages`
-
-    :rtype: str, None
-    :returns:
-        The detected language code or ``None``. See below.
-
-    :raises ValueError: If ``text`` is not a string.
-
-    The language detection is done by using `a fork`_ of the fastText_ library
-    (`python fasttext`_). fastText_ distributes the `language identification
-    model`_, for reference:
-
-    - `FastText.zip: Compressing text classification models`_
-    - `Bag of Tricks for Efficient Text Classification`_
-
-    The `language identification model`_ support the language codes
-    (ISO-639-3)::
-
-        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
-        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
-        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
-        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
-        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
-        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
-        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
-        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
-
-    By using ``only_search_languages=True`` the `language identification model`_
-    is harmonized with the SearXNG's language (locale) model.  General
-    conditions of SearXNG's locale model are:
-
-    a. SearXNG's locale of a query is passed to the
-       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
-       code that is used by an engine.
-
-    b. Most of SearXNG's engines do not support all the languages from `language
-       identification model`_ and there is also a discrepancy in the ISO-639-3
-       (fasttext) and ISO-639-2 (SearXNG)handling.  Further more, in SearXNG the
-       locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
-       (``zh_Hans``) while the `language identification model`_ reduce both to
-       ``zh``.
-
-    .. _a fork: https://github.com/searxng/fasttext-predict
-    .. _fastText: https://fasttext.cc/
-    .. _python fasttext: https://pypi.org/project/fasttext/
-    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
-    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
-    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
-
-    """
-    if not isinstance(text, str):
-        raise ValueError('text must a str')  # pyright: ignore[reportUnreachable]
-    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)  # type: ignore
-    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:  # type: ignore
-        language = r[0][0].split('__label__')[1]  # type: ignore
-        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
-            return None
-        return language  # type: ignore
-    return None
-
-
 def _j2p_process_escape(match: re.Match[str]) -> str:
    # deal with ECMA escape characters
    _escape = match.group(1) or match.group(2)