[mod] drop fasttext-predict (#5795)

Removes the `fasttext-predict` dependency and the language detection code. If a user selects `auto` for the search language, the search language now falls back directly to the `Accept-Language` header sent by the browser (which was already the fallback when fasttext returned no result). - fasttext's [language detection is unreliable](https://github.com/searxng/searxng/issues/4195) for some languages, especially for short search queries, and in particular for queries containing proper names, which is a common case. - `fasttext-predict` consumes [significant memory](https://github.com/searxng/searxng/pull/1969#issuecomment-1345366676) without offering users much real value. - the upstream fasttext project was archived by Meta in 2024. - users already have two better alternatives: the `Accept-Language` header and the search-syntax language prefix (e.g. `:fr` or `:de`). Related: https://github.com/searxng/searxng/issues/4195 Closes: https://github.com/searxng/searxng/issues/5790
2026-06-22 09:38:34 +02:00 · 2026-03-06 22:40:44 +08:00
parent c7ba2158f9
commit 68ff08f224
5 changed files with 5 additions and 127 deletions
@@ -12,7 +12,6 @@ httpx-socks[asyncio]==0.10.0
 sniffio==1.3.1
 valkey==6.1.1
 markdown-it-py==3.0.0
 fasttext-predict==0.9.2.4
 tomli==2.4.0; python_version < '3.11'
 msgspec==0.20.0
 typer==0.24.1
@@ -25,16 +25,11 @@ from lxml.etree import XPath, XPathError, XPathSyntaxError
 from lxml.etree import ElementBase, _Element  # pyright: ignore[reportPrivateUsage]
 from searx import settings
-from searx.data import USER_AGENTS, data_dir, gsa_useragents_loader
+from searx.data import USER_AGENTS, gsa_useragents_loader
 from searx.version import VERSION_TAG
 from searx.sxng_locales import sxng_locales
 from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
 from searx import logger
 if t.TYPE_CHECKING:
    import fasttext.FastText  # type: ignore
 logger = logger.getChild('utils')
 XPathSpecType: t.TypeAlias = str | XPath
@@ -61,12 +56,6 @@ _JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
 _XPATH_CACHE: dict[str, XPath] = {}
 _LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}
 _FASTTEXT_MODEL: "fasttext.FastText._FastText | None" = None  # pyright: ignore[reportPrivateUsage]
 """fasttext model to predict language of a search term"""
 SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
 """Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
 class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal class for this module, do not create instance of this class.
@@ -610,17 +599,6 @@ def eval_xpath_getindex(
    return default
 def _get_fasttext_model() -> "fasttext.FastText._FastText":  # pyright: ignore[reportPrivateUsage]
    global _FASTTEXT_MODEL  # pylint: disable=global-statement
    if _FASTTEXT_MODEL is None:
        import fasttext  # pylint: disable=import-outside-toplevel
        # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
        fasttext.FastText.eprint = lambda x: None  # type: ignore
        _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz'))  # type: ignore
    return _FASTTEXT_MODEL
 def get_embeded_stream_url(url: str):
    """
    Converts a standard video URL into its embed format. Supported services include Youtube,
@@ -683,77 +661,6 @@ def get_embeded_stream_url(url: str):
    return iframe_src
 def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> str | None:
    """Detect the language of the ``text`` parameter.
    :param str text: The string whose language is to be detected.
    :param float threshold: Threshold filters the returned labels by a threshold
        on probability.  A choice of 0.3 will return labels with at least 0.3
        probability.
    :param bool only_search_languages: If ``True``, returns only supported
        SearXNG search languages.  see :py:obj:`searx.languages`
    :rtype: str, None
    :returns:
        The detected language code or ``None``. See below.
    :raises ValueError: If ``text`` is not a string.
    The language detection is done by using `a fork`_ of the fastText_ library
    (`python fasttext`_). fastText_ distributes the `language identification
    model`_, for reference:
    - `FastText.zip: Compressing text classification models`_
    - `Bag of Tricks for Efficient Text Classification`_
    The `language identification model`_ support the language codes
    (ISO-639-3)::
        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
        bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
        et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
        id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
        lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
        nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
        rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
        tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
    By using ``only_search_languages=True`` the `language identification model`_
    is harmonized with the SearXNG's language (locale) model.  General
    conditions of SearXNG's locale model are:
    a. SearXNG's locale of a query is passed to the
       :py:obj:`searx.locales.get_engine_locale` to get a language and/or region
       code that is used by an engine.
    b. Most of SearXNG's engines do not support all the languages from `language
       identification model`_ and there is also a discrepancy in the ISO-639-3
       (fasttext) and ISO-639-2 (SearXNG)handling.  Further more, in SearXNG the
       locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
       (``zh_Hans``) while the `language identification model`_ reduce both to
       ``zh``.
    .. _a fork: https://github.com/searxng/fasttext-predict
    .. _fastText: https://fasttext.cc/
    .. _python fasttext: https://pypi.org/project/fasttext/
    .. _language identification model: https://fasttext.cc/docs/en/language-identification.html
    .. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
    .. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
    """
    if not isinstance(text, str):
        raise ValueError('text must a str')  # pyright: ignore[reportUnreachable]
    r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold)  # type: ignore
    if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0:  # type: ignore
        language = r[0][0].split('__label__')[1]  # type: ignore
        if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
            return None
        return language  # type: ignore
    return None
 def _j2p_process_escape(match: re.Match[str]) -> str:
    # deal with ECMA escape characters
    _escape = match.group(1) or match.group(2)
@@ -9,7 +9,6 @@ from searx.query import RawTextQuery
 from searx.engines import categories, engines
 from searx.search.models import SearchQuery, EngineRef
 from searx.preferences import Preferences, is_locked
 from searx.utils import detect_language
 # remove duplicate queries.
@@ -233,9 +232,7 @@ def get_search_query_from_webapp(
    4. string with the *selected locale* of the query
    About language/locale: if the client selects the alias ``auto`` the
-    ``SearchQuery`` object is build up by the :py:obj:`detected language
+    language preferred by the :py:obj:`Preferences.client` is used.
    <searx.utils.detect_language>`.  If language recognition does not have a
    match the language preferred by the :py:obj:`Preferences.client` is used.
    If client does not have a preference, the default ``all`` is used.
    The *selected locale* in the tuple always represents the selected
@@ -267,8 +264,7 @@ def get_search_query_from_webapp(
    selected_locale = query_lang
    if query_lang == 'auto':
-        query_lang = detect_language(query, threshold=0.8, only_search_languages=True)
+        query_lang = preferences.client.locale_tag or 'all'
        query_lang = query_lang or preferences.client.locale_tag or 'all'
    if not is_locked('categories') and raw_text_query.specific:
        # if engines are calculated from query,
@@ -19,7 +19,7 @@ from searx.engines import wikidata, set_loggers
 from searx.utils import extract_text, searxng_useragent
 from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
 from searx import searx_dir
-from searx.utils import gen_useragent, detect_language
+from searx.utils import gen_useragent
 import searx.search
 import searx.network
 from searx.data import data_dir
@@ -169,7 +169,7 @@ def get_website_description(url, lang1, lang2=None):
    lang = extract_text(html.xpath('/html/@lang'))
    if lang is None and len(lang1) > 0:
        lang = lang1
-    lang = detect_language(description) or lang or 'en'
+    lang = lang or 'en'
    lang = lang.split('_')[0]
    lang = lang.split('-')[0]
    return (lang, description)
@@ -194,27 +194,3 @@ class TestXPathUtils(SearxTestCase):  # pylint: disable=missing-class-docstring
        with self.assertRaises(SearxEngineXPathException) as context:
            utils.eval_xpath_getindex(doc, 'count(//i)', 1)
        self.assertEqual(context.exception.message, 'the result is not a list')
    def test_detect_language(self):
        # make sure new line are not an issue
        # fasttext.predict('') does not accept new line.
        l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
        self.assertEqual(l, 'en')
        l = utils.detect_language(
            'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす'
        )
        self.assertEqual(l, 'ja')
        l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
        self.assertEqual(l, 'tr')
        l = utils.detect_language('')
        self.assertIsNone(l)
        # mix languages --> None
        l = utils.detect_language('The いろはにほへと Pijamalı')
        self.assertIsNone(l)
        with self.assertRaises(ValueError):
            utils.detect_language(None)  # type: ignore