mirror of
https://github.com/searxng/searxng.git
synced 2026-05-07 18:03:51 +02:00
[mod] drop fasttext-predict (#5795)
Removes the `fasttext-predict` dependency and the language detection code. If a user selects `auto` for the search language, the search language now falls back directly to the `Accept-Language` header sent by the browser (which was already the fallback when fasttext returned no result).

- fasttext's [language detection is unreliable](https://github.com/searxng/searxng/issues/4195) for some languages, especially for short search queries, and in particular for queries containing proper names, which is a common case.
- `fasttext-predict` consumes [significant memory](https://github.com/searxng/searxng/pull/1969#issuecomment-1345366676) without offering users much real value.
- The upstream fasttext project was archived by Meta in 2024.
- Users already have two better alternatives: the `Accept-Language` header and the search-syntax language prefix (e.g. `:fr` or `:de`).

Related: https://github.com/searxng/searxng/issues/4195
Closes: https://github.com/searxng/searxng/issues/5790
This commit is contained in:
@@ -12,7 +12,6 @@ httpx-socks[asyncio]==0.10.0
|
|||||||
sniffio==1.3.1
|
sniffio==1.3.1
|
||||||
valkey==6.1.1
|
valkey==6.1.1
|
||||||
markdown-it-py==3.0.0
|
markdown-it-py==3.0.0
|
||||||
fasttext-predict==0.9.2.4
|
|
||||||
tomli==2.4.0; python_version < '3.11'
|
tomli==2.4.0; python_version < '3.11'
|
||||||
msgspec==0.20.0
|
msgspec==0.20.0
|
||||||
typer==0.24.1
|
typer==0.24.1
|
||||||
|
|||||||
+1
-94
@@ -25,16 +25,11 @@ from lxml.etree import XPath, XPathError, XPathSyntaxError
|
|||||||
from lxml.etree import ElementBase, _Element # pyright: ignore[reportPrivateUsage]
|
from lxml.etree import ElementBase, _Element # pyright: ignore[reportPrivateUsage]
|
||||||
|
|
||||||
from searx import settings
|
from searx import settings
|
||||||
from searx.data import USER_AGENTS, data_dir, gsa_useragents_loader
|
from searx.data import USER_AGENTS, gsa_useragents_loader
|
||||||
from searx.version import VERSION_TAG
|
from searx.version import VERSION_TAG
|
||||||
from searx.sxng_locales import sxng_locales
|
|
||||||
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
|
||||||
from searx import logger
|
from searx import logger
|
||||||
|
|
||||||
if t.TYPE_CHECKING:
|
|
||||||
import fasttext.FastText # type: ignore
|
|
||||||
|
|
||||||
|
|
||||||
logger = logger.getChild('utils')
|
logger = logger.getChild('utils')
|
||||||
|
|
||||||
XPathSpecType: t.TypeAlias = str | XPath
|
XPathSpecType: t.TypeAlias = str | XPath
|
||||||
@@ -61,12 +56,6 @@ _JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'
|
|||||||
_XPATH_CACHE: dict[str, XPath] = {}
|
_XPATH_CACHE: dict[str, XPath] = {}
|
||||||
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}
|
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}
|
||||||
|
|
||||||
_FASTTEXT_MODEL: "fasttext.FastText._FastText | None" = None # pyright: ignore[reportPrivateUsage]
|
|
||||||
"""fasttext model to predict language of a search term"""
|
|
||||||
|
|
||||||
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
|
|
||||||
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
|
|
||||||
|
|
||||||
|
|
||||||
class _NotSetClass: # pylint: disable=too-few-public-methods
|
class _NotSetClass: # pylint: disable=too-few-public-methods
|
||||||
"""Internal class for this module, do not create instance of this class.
|
"""Internal class for this module, do not create instance of this class.
|
||||||
@@ -610,17 +599,6 @@ def eval_xpath_getindex(
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
def _get_fasttext_model() -> "fasttext.FastText._FastText": # pyright: ignore[reportPrivateUsage]
|
|
||||||
global _FASTTEXT_MODEL # pylint: disable=global-statement
|
|
||||||
if _FASTTEXT_MODEL is None:
|
|
||||||
import fasttext # pylint: disable=import-outside-toplevel
|
|
||||||
|
|
||||||
# Monkey patch: prevent fasttext from showing a (useless) warning when loading a model.
|
|
||||||
fasttext.FastText.eprint = lambda x: None # type: ignore
|
|
||||||
_FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz')) # type: ignore
|
|
||||||
return _FASTTEXT_MODEL
|
|
||||||
|
|
||||||
|
|
||||||
def get_embeded_stream_url(url: str):
|
def get_embeded_stream_url(url: str):
|
||||||
"""
|
"""
|
||||||
Converts a standard video URL into its embed format. Supported services include Youtube,
|
Converts a standard video URL into its embed format. Supported services include Youtube,
|
||||||
@@ -683,77 +661,6 @@ def get_embeded_stream_url(url: str):
|
|||||||
return iframe_src
|
return iframe_src
|
||||||
|
|
||||||
|
|
||||||
def detect_language(text: str, threshold: float = 0.3, only_search_languages: bool = False) -> str | None:
|
|
||||||
"""Detect the language of the ``text`` parameter.
|
|
||||||
|
|
||||||
:param str text: The string whose language is to be detected.
|
|
||||||
|
|
||||||
:param float threshold: Threshold filters the returned labels by a threshold
|
|
||||||
on probability. A choice of 0.3 will return labels with at least 0.3
|
|
||||||
probability.
|
|
||||||
|
|
||||||
:param bool only_search_languages: If ``True``, returns only supported
|
|
||||||
SearXNG search languages. see :py:obj:`searx.languages`
|
|
||||||
|
|
||||||
:rtype: str, None
|
|
||||||
:returns:
|
|
||||||
The detected language code or ``None``. See below.
|
|
||||||
|
|
||||||
:raises ValueError: If ``text`` is not a string.
|
|
||||||
|
|
||||||
The language detection is done by using `a fork`_ of the fastText_ library
|
|
||||||
(`python fasttext`_). fastText_ distributes the `language identification
|
|
||||||
model`_, for reference:
|
|
||||||
|
|
||||||
- `FastText.zip: Compressing text classification models`_
|
|
||||||
- `Bag of Tricks for Efficient Text Classification`_
|
|
||||||
|
|
||||||
The `language identification model`_ support the language codes
|
|
||||||
(ISO-639-3)::
|
|
||||||
|
|
||||||
af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs
|
|
||||||
bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es
|
|
||||||
et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia
|
|
||||||
id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li
|
|
||||||
lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah
|
|
||||||
nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru
|
|
||||||
rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl
|
|
||||||
tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
|
|
||||||
|
|
||||||
By using ``only_search_languages=True`` the `language identification model`_
|
|
||||||
is harmonized with the SearXNG's language (locale) model. General
|
|
||||||
conditions of SearXNG's locale model are:
|
|
||||||
|
|
||||||
a. SearXNG's locale of a query is passed to the
|
|
||||||
:py:obj:`searx.locales.get_engine_locale` to get a language and/or region
|
|
||||||
code that is used by an engine.
|
|
||||||
|
|
||||||
b. Most of SearXNG's engines do not support all the languages from `language
|
|
||||||
identification model`_ and there is also a discrepancy in the ISO-639-3
|
|
||||||
(fasttext) and ISO-639-2 (SearXNG)handling. Further more, in SearXNG the
|
|
||||||
locales like ``zh-TH`` (``zh-CN``) are mapped to ``zh_Hant``
|
|
||||||
(``zh_Hans``) while the `language identification model`_ reduce both to
|
|
||||||
``zh``.
|
|
||||||
|
|
||||||
.. _a fork: https://github.com/searxng/fasttext-predict
|
|
||||||
.. _fastText: https://fasttext.cc/
|
|
||||||
.. _python fasttext: https://pypi.org/project/fasttext/
|
|
||||||
.. _language identification model: https://fasttext.cc/docs/en/language-identification.html
|
|
||||||
.. _Bag of Tricks for Efficient Text Classification: https://arxiv.org/abs/1607.01759
|
|
||||||
.. _`FastText.zip: Compressing text classification models`: https://arxiv.org/abs/1612.03651
|
|
||||||
|
|
||||||
"""
|
|
||||||
if not isinstance(text, str):
|
|
||||||
raise ValueError('text must a str') # pyright: ignore[reportUnreachable]
|
|
||||||
r = _get_fasttext_model().predict(text.replace('\n', ' '), k=1, threshold=threshold) # type: ignore
|
|
||||||
if isinstance(r, tuple) and len(r) == 2 and len(r[0]) > 0 and len(r[1]) > 0: # type: ignore
|
|
||||||
language = r[0][0].split('__label__')[1] # type: ignore
|
|
||||||
if only_search_languages and language not in SEARCH_LANGUAGE_CODES:
|
|
||||||
return None
|
|
||||||
return language # type: ignore
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _j2p_process_escape(match: re.Match[str]) -> str:
|
def _j2p_process_escape(match: re.Match[str]) -> str:
|
||||||
# deal with ECMA escape characters
|
# deal with ECMA escape characters
|
||||||
_escape = match.group(1) or match.group(2)
|
_escape = match.group(1) or match.group(2)
|
||||||
|
|||||||
+2
-6
@@ -9,7 +9,6 @@ from searx.query import RawTextQuery
|
|||||||
from searx.engines import categories, engines
|
from searx.engines import categories, engines
|
||||||
from searx.search.models import SearchQuery, EngineRef
|
from searx.search.models import SearchQuery, EngineRef
|
||||||
from searx.preferences import Preferences, is_locked
|
from searx.preferences import Preferences, is_locked
|
||||||
from searx.utils import detect_language
|
|
||||||
|
|
||||||
|
|
||||||
# remove duplicate queries.
|
# remove duplicate queries.
|
||||||
@@ -233,9 +232,7 @@ def get_search_query_from_webapp(
|
|||||||
4. string with the *selected locale* of the query
|
4. string with the *selected locale* of the query
|
||||||
|
|
||||||
About language/locale: if the client selects the alias ``auto`` the
|
About language/locale: if the client selects the alias ``auto`` the
|
||||||
``SearchQuery`` object is build up by the :py:obj:`detected language
|
language preferred by the :py:obj:`Preferences.client` is used.
|
||||||
<searx.utils.detect_language>`. If language recognition does not have a
|
|
||||||
match the language preferred by the :py:obj:`Preferences.client` is used.
|
|
||||||
If client does not have a preference, the default ``all`` is used.
|
If client does not have a preference, the default ``all`` is used.
|
||||||
|
|
||||||
The *selected locale* in the tuple always represents the selected
|
The *selected locale* in the tuple always represents the selected
|
||||||
@@ -267,8 +264,7 @@ def get_search_query_from_webapp(
|
|||||||
selected_locale = query_lang
|
selected_locale = query_lang
|
||||||
|
|
||||||
if query_lang == 'auto':
|
if query_lang == 'auto':
|
||||||
query_lang = detect_language(query, threshold=0.8, only_search_languages=True)
|
query_lang = preferences.client.locale_tag or 'all'
|
||||||
query_lang = query_lang or preferences.client.locale_tag or 'all'
|
|
||||||
|
|
||||||
if not is_locked('categories') and raw_text_query.specific:
|
if not is_locked('categories') and raw_text_query.specific:
|
||||||
# if engines are calculated from query,
|
# if engines are calculated from query,
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ from searx.engines import wikidata, set_loggers
|
|||||||
from searx.utils import extract_text, searxng_useragent
|
from searx.utils import extract_text, searxng_useragent
|
||||||
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
|
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
|
||||||
from searx import searx_dir
|
from searx import searx_dir
|
||||||
from searx.utils import gen_useragent, detect_language
|
from searx.utils import gen_useragent
|
||||||
import searx.search
|
import searx.search
|
||||||
import searx.network
|
import searx.network
|
||||||
from searx.data import data_dir
|
from searx.data import data_dir
|
||||||
@@ -169,7 +169,7 @@ def get_website_description(url, lang1, lang2=None):
|
|||||||
lang = extract_text(html.xpath('/html/@lang'))
|
lang = extract_text(html.xpath('/html/@lang'))
|
||||||
if lang is None and len(lang1) > 0:
|
if lang is None and len(lang1) > 0:
|
||||||
lang = lang1
|
lang = lang1
|
||||||
lang = detect_language(description) or lang or 'en'
|
lang = lang or 'en'
|
||||||
lang = lang.split('_')[0]
|
lang = lang.split('_')[0]
|
||||||
lang = lang.split('-')[0]
|
lang = lang.split('-')[0]
|
||||||
return (lang, description)
|
return (lang, description)
|
||||||
|
|||||||
@@ -194,27 +194,3 @@ class TestXPathUtils(SearxTestCase): # pylint: disable=missing-class-docstring
|
|||||||
with self.assertRaises(SearxEngineXPathException) as context:
|
with self.assertRaises(SearxEngineXPathException) as context:
|
||||||
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
|
utils.eval_xpath_getindex(doc, 'count(//i)', 1)
|
||||||
self.assertEqual(context.exception.message, 'the result is not a list')
|
self.assertEqual(context.exception.message, 'the result is not a list')
|
||||||
|
|
||||||
def test_detect_language(self):
|
|
||||||
# make sure new line are not an issue
|
|
||||||
# fasttext.predict('') does not accept new line.
|
|
||||||
l = utils.detect_language('The quick brown fox jumps over\nthe lazy dog')
|
|
||||||
self.assertEqual(l, 'en')
|
|
||||||
|
|
||||||
l = utils.detect_language(
|
|
||||||
'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす'
|
|
||||||
)
|
|
||||||
self.assertEqual(l, 'ja')
|
|
||||||
|
|
||||||
l = utils.detect_language('Pijamalı hasta yağız şoföre çabucak güvendi.')
|
|
||||||
self.assertEqual(l, 'tr')
|
|
||||||
|
|
||||||
l = utils.detect_language('')
|
|
||||||
self.assertIsNone(l)
|
|
||||||
|
|
||||||
# mix languages --> None
|
|
||||||
l = utils.detect_language('The いろはにほへと Pijamalı')
|
|
||||||
self.assertIsNone(l)
|
|
||||||
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
utils.detect_language(None) # type: ignore
|
|
||||||
|
|||||||
Reference in New Issue
Block a user