From 191818b865b19270c8c38d905141356a6ba01d9a Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 22 Feb 2026 14:44:21 +0100 Subject: [PATCH] [mod] drop SearXNG's checker (#5767) To date, there is no analysis for the checker that could be evaluated in any meaningful way. - https://github.com/searxng/searxng/issues/3407 - https://github.com/searxng/searxng/pull/3312 The checker would need to be completely redesigned, but even then, its usefulness and the maintenance required for it would be disproportionate. TBH: In its current form, it is useless and only consumes resources and causes the engines to be blocked, because these tests (query terms) come from *hundreds* of instances and could be interpreted as bot attacks. Related issues: [search.checker](https://github.com/searxng/searxng/issues?q=label%3A%22search.checker%22) Signed-off-by: Markus Heiser --- Makefile | 7 - docs/dev/makefile.rst | 32 -- searx/engines/google_news.py | 2 +- searx/metrics/__init__.py | 11 +- searx/search/__init__.py | 4 - searx/search/checker/__init__.py | 7 - searx/search/checker/__main__.py | 118 ------- searx/search/checker/background.py | 168 --------- searx/search/checker/impl.py | 441 ------------------------ searx/search/checker/scheduler.lua | 36 -- searx/search/checker/scheduler.py | 58 ---- searx/settings.yml | 79 ----- searx/settings_defaults.py | 4 - searx/templates/simple/new_issue.html | 4 - searx/templates/simple/preferences.html | 8 +- searx/templates/simple/stats.html | 17 - searx/webapp.py | 45 +-- setup.py | 3 +- tests/__init__.py | 1 - 19 files changed, 8 insertions(+), 1037 deletions(-) delete mode 100644 searx/search/checker/__init__.py delete mode 100644 searx/search/checker/__main__.py delete mode 100644 searx/search/checker/background.py delete mode 100644 searx/search/checker/impl.py delete mode 100644 searx/search/checker/scheduler.lua delete mode 100644 searx/search/checker/scheduler.py diff --git a/Makefile b/Makefile index 4719b0b53..3f5bbef38 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,6 @@ help: @echo 'install - developer install of SearxNG into virtualenv' @echo 'uninstall - uninstall developer installation' @echo 'clean - clean up working tree' - @echo 'search.checker - check search engines' @echo 'test - run shell & CI tests' @echo 'test.shell - test shell scripts' @echo 'ci.test - run CI tests' @@ -39,12 +38,6 @@ clean: py.clean docs.clean node.clean nvm.clean go.clean test.clean $(Q)find . -name '*~' -exec rm -f {} + $(Q)find . -name '*.bak' -exec rm -f {} + -PHONY += search.checker search.checker.% -search.checker: install - $(Q)./manage pyenv.cmd searxng-checker -v - -search.checker.%: install - $(Q)./manage pyenv.cmd searxng-checker -v "$(subst _, ,$(patsubst search.checker.%,%,$@))" PHONY += test ci.test test.shell test: test.yamllint test.black test.pyright_modified test.pylint test.unit test.robot test.rst test.shell test.shfmt diff --git a/docs/dev/makefile.rst b/docs/dev/makefile.rst index fa81f3d23..7c47fba6b 100644 --- a/docs/dev/makefile.rst +++ b/docs/dev/makefile.rst @@ -291,38 +291,6 @@ Pylint_ is known as one of the best source-code, bug and quality checker for the Python programming language. The pylint profile used in the SearXNG project is found in project's root folder :origin:`.pylintrc`. -.. _make search.checker: - -``make search.checker.{engine name}`` -===================================== - -To check all engines:: - - make search.checker - -To check a engine with whitespace in the name like *google news* replace space -by underline:: - - make search.checker.google_news - -To see HTTP requests and more use SEARXNG_DEBUG:: - - make SEARXNG_DEBUG=1 search.checker.google_news - -.. _3xx: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#3xx_redirection - -To filter out HTTP redirects (3xx_):: - - make SEARXNG_DEBUG=1 search.checker.google_news | grep -A1 "HTTP/1.1\" 3[0-9][0-9]" - ... - Engine google news Checking - https://news.google.com:443 "GET /search?q=life&hl=en&lr=lang_en&ie=utf8&oe=utf8&ceid=US%3Aen&gl=US HTTP/1.1" 302 0 - https://news.google.com:443 "GET /search?q=life&hl=en-US&lr=lang_en&ie=utf8&oe=utf8&ceid=US:en&gl=US HTTP/1.1" 200 None - -- - https://news.google.com:443 "GET /search?q=computer&hl=en&lr=lang_en&ie=utf8&oe=utf8&ceid=US%3Aen&gl=US HTTP/1.1" 302 0 - https://news.google.com:443 "GET /search?q=computer&hl=en-US&lr=lang_en&ie=utf8&oe=utf8&ceid=US:en&gl=US HTTP/1.1" 200 None - -- - .. _make themes: ``make themes.*`` diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 231840d87..2fffadffb 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -60,7 +60,7 @@ paging = False time_range_support = False # Google-News results are always *SafeSearch*. Option 'safesearch' is set to -# False here, otherwise checker will report safesearch-errors:: +# False here. # # safesearch : results are identical for safesearch=0 and safesearch=2 safesearch = True diff --git a/searx/metrics/__init__.py b/searx/metrics/__init__.py index da833a8c9..14163a8cd 100644 --- a/searx/metrics/__init__.py +++ b/searx/metrics/__init__.py @@ -139,26 +139,18 @@ def get_engine_errors(engline_name_list): return result -def get_reliabilities(engline_name_list, checker_results): +def get_reliabilities(engline_name_list): reliabilities = {} engine_errors = get_engine_errors(engline_name_list) for engine_name in engline_name_list: - checker_result = checker_results.get(engine_name, {}) - checker_success = checker_result.get('success', True) errors = engine_errors.get(engine_name) or [] sent_count = counter('engine', engine_name, 'search', 'count', 'sent') if sent_count == 0: # no request reliability = None - elif checker_success and not errors: - reliability = 100 - elif 'simple' in checker_result.get('errors', {}): - # the basic (simple) test doesn't work: the engine is broken according to the checker - # even if there is no exception - reliability = 0 else: # pylint: disable=consider-using-generator reliability = 100 - sum([error['percentage'] for error in errors if not error.get('secondary')]) @@ -167,7 +159,6 @@ def get_reliabilities(engline_name_list, checker_results): 'reliability': reliability, 'sent_count': sent_count, 'errors': errors, - 'checker': checker_result.get('errors', {}), } return reliabilities diff --git a/searx/search/__init__.py b/searx/search/__init__.py index e4282512b..45ab83509 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -20,7 +20,6 @@ from searx.external_bang import get_bang_url from searx.metrics import initialize as initialize_metrics, counter_inc from searx.network import initialize as initialize_network, check_network_configuration from searx.results import ResultContainer -from searx.search.checker import initialize as initialize_checker from searx.search.processors import PROCESSORS from searx.search.processors.abstract import RequestParams @@ -33,7 +32,6 @@ logger = logger.getChild('search') def initialize( settings_engines: list[dict[str, t.Any]] = None, # pyright: ignore[reportArgumentType] - enable_checker: bool = False, check_network: bool = False, enable_metrics: bool = True, ): @@ -44,8 +42,6 @@ def initialize( check_network_configuration() initialize_metrics([engine['name'] for engine in settings_engines], enable_metrics) PROCESSORS.init(settings_engines) - if enable_checker: - initialize_checker() class Search: diff --git a/searx/search/checker/__init__.py b/searx/search/checker/__init__.py deleted file mode 100644 index f6891a5db..000000000 --- a/searx/search/checker/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring - -from .impl import Checker -from .background import initialize, get_result - -__all__ = ('Checker', 'initialize', 'get_result') diff --git a/searx/search/checker/__main__.py b/searx/search/checker/__main__.py deleted file mode 100644 index 183a51bf2..000000000 --- a/searx/search/checker/__main__.py +++ /dev/null @@ -1,118 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring - -import sys -import io -import os -import argparse -import logging - -import searx.search -import searx.search.checker -from searx.search import PROCESSORS -from searx.engines import engine_shortcuts - - -# configure logging -root = logging.getLogger() -handler = logging.StreamHandler(sys.stdout) -for h in root.handlers: - root.removeHandler(h) -root.addHandler(handler) - -# color only for a valid terminal -if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: - RESET_SEQ = "\033[0m" - COLOR_SEQ = "\033[1;%dm" - BOLD_SEQ = "\033[1m" - BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8)) -else: - RESET_SEQ = "" - COLOR_SEQ = "" - BOLD_SEQ = "" - BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" - -# equivalent of 'python -u' (unbuffered stdout, stderr) -stdout = io.TextIOWrapper( - # pylint: disable=consider-using-with - open(sys.stdout.fileno(), 'wb', 0), - write_through=True, -) -stderr = io.TextIOWrapper( - # pylint: disable=consider-using-with - open(sys.stderr.fileno(), 'wb', 0), - write_through=True, -) - - -# iterator of processors -def iter_processor(engine_name_list): - if len(engine_name_list) > 0: - for name in engine_name_list: - name = engine_shortcuts.get(name, name) - processor = PROCESSORS.get(name) - if processor is not None: - yield name, processor - else: - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}\n') - else: - for name, processor in searx.search.PROCESSORS.items(): - yield name, processor - - -# actual check & display -def run(engine_name_list, verbose): - searx.search.initialize() - name_checker_list = [] - for name, processor in iter_processor(engine_name_list): - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') - if not sys.stdout.isatty(): - stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n') - checker = searx.search.checker.Checker(processor) - checker.run() - name_checker_list.append((name, checker)) - - stdout.write(f'\n== {BOLD_SEQ}Results{RESET_SEQ} ' + '=' * 70 + '\n') - for name, checker in name_checker_list: - if checker.test_results.successful: - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n') - if verbose: - stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') - else: - stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}') - if not verbose: - errors = [test_name + ': ' + error for test_name, error in checker.test_results] - stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n') - else: - stdout.write('\n') - stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n') - for test_name, logs in checker.test_results.logs.items(): - for log in logs: - log = map(lambda l: l if isinstance(l, str) else repr(l), log) - stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n') - - -# call by setup.py -def main(): - parser = argparse.ArgumentParser(description='Check SearXNG engines.') - parser.add_argument( - 'engine_name_list', - metavar='engine name', - type=str, - nargs='*', - help='engines name or shortcut list. Empty for all engines.', - ) - parser.add_argument( - '--verbose', - '-v', - action='store_true', - dest='verbose', - help='Display details about the test results', - default=False, - ) - args = parser.parse_args() - run(args.engine_name_list, args.verbose) - - -if __name__ == '__main__': - main() diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py deleted file mode 100644 index 1890c77d5..000000000 --- a/searx/search/checker/background.py +++ /dev/null @@ -1,168 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring, cyclic-import - -import json -import time -import threading -import os -import signal -from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union - -import valkey.exceptions - -from searx import logger, settings, sxng_debug -from searx.valkeydb import client as get_valkey_client -from searx.exceptions import SearxSettingsException -from searx.search.processors import PROCESSORS -from searx.search.checker import Checker -from searx.search.checker.scheduler import scheduler_function - - -VALKEY_RESULT_KEY = 'SearXNG_checker_result' -VALKEY_LOCK_KEY = 'SearXNG_checker_lock' - - -CheckerResult = Union['CheckerOk', 'CheckerErr', 'CheckerOther'] - - -class CheckerOk(TypedDict): - """Checking the engines succeeded""" - - status: Literal['ok'] - engines: Dict[str, 'EngineResult'] - timestamp: int - - -class CheckerErr(TypedDict): - """Checking the engines failed""" - - status: Literal['error'] - timestamp: int - - -class CheckerOther(TypedDict): - """The status is unknown or disabled""" - - status: Literal['unknown', 'disabled'] - - -EngineResult = Union['EngineOk', 'EngineErr'] - - -class EngineOk(TypedDict): - """Checking the engine succeeded""" - - success: Literal[True] - - -class EngineErr(TypedDict): - """Checking the engine failed""" - - success: Literal[False] - errors: Dict[str, List[str]] - - -def _get_interval(every: Any, error_msg: str) -> Tuple[int, int]: - if isinstance(every, int): - return (every, every) - - if ( - not isinstance(every, (tuple, list)) - or len(every) != 2 # type: ignore - or not isinstance(every[0], int) - or not isinstance(every[1], int) - ): - raise SearxSettingsException(error_msg, None) - return (every[0], every[1]) - - -def get_result() -> CheckerResult: - client = get_valkey_client() - if client is None: - # without Valkey, the checker is disabled - return {'status': 'disabled'} - serialized_result: Optional[bytes] = client.get(VALKEY_RESULT_KEY) - if serialized_result is None: - # the Valkey key does not exist - return {'status': 'unknown'} - return json.loads(serialized_result) - - -def _set_result(result: CheckerResult): - client = get_valkey_client() - if client is None: - # without Valkey, the function does nothing - return - client.set(VALKEY_RESULT_KEY, json.dumps(result)) - - -def _timestamp(): - return int(time.time() / 3600) * 3600 - - -def run(): - try: - # use a Valkey lock to make sure there is no checker running at the same time - # (this should not happen, this is a safety measure) - with get_valkey_client().lock(VALKEY_LOCK_KEY, blocking_timeout=60, timeout=3600): - logger.info('Starting checker') - result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()} - for name, processor in PROCESSORS.items(): - logger.debug('Checking %s engine', name) - checker = Checker(processor) - checker.run() - if checker.test_results.successful: - result['engines'][name] = {'success': True} - else: - result['engines'][name] = {'success': False, 'errors': checker.test_results.errors} - - _set_result(result) - logger.info('Check done') - except valkey.exceptions.LockError: - _set_result({'status': 'error', 'timestamp': _timestamp()}) - logger.exception('Error while running the checker') - except Exception: # pylint: disable=broad-except - _set_result({'status': 'error', 'timestamp': _timestamp()}) - logger.exception('Error while running the checker') - - -def _signal_handler(_signum: int, _frame: Any): - t = threading.Thread(target=run) - t.daemon = True - t.start() - - -def initialize(): - if hasattr(signal, 'SIGUSR1'): - # Windows doesn't support SIGUSR1 - logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid()) - signal.signal(signal.SIGUSR1, _signal_handler) - - # special case when debug is activate - if sxng_debug and settings['checker']['off_when_debug']: - logger.info('debug mode: checker is disabled') - return - - # check value of checker.scheduling.every now - scheduling = settings['checker']['scheduling'] - if scheduling is None or not scheduling: - logger.info('Checker scheduler is disabled') - return - - # make sure there is a Valkey connection - if get_valkey_client() is None: - logger.error('The checker requires Valkey') - return - - # start the background scheduler - every_range = _get_interval(scheduling.get('every', (300, 1800)), 'checker.scheduling.every is not a int or list') - start_after_range = _get_interval( - scheduling.get('start_after', (300, 1800)), 'checker.scheduling.start_after is not a int or list' - ) - t = threading.Thread( - target=scheduler_function, - args=(start_after_range[0], start_after_range[1], every_range[0], every_range[1], run), - name='checker_scheduler', - ) - t.daemon = True - t.start() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py deleted file mode 100644 index 354cbadc7..000000000 --- a/searx/search/checker/impl.py +++ /dev/null @@ -1,441 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring, invalid-name - -import gc -import typing -import types -import functools -import itertools -from time import time -from timeit import default_timer -from urllib.parse import urlparse - -import re -import httpx - -from searx import network, logger -from searx.utils import gen_useragent, detect_language -from searx.results import ResultContainer -from searx.search.models import SearchQuery, EngineRef -from searx.search.processors import EngineProcessor -from searx.metrics import counter_inc - - -logger = logger.getChild('searx.search.checker') - -HTML_TAGS = [ - # fmt: off - 'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script', - 'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', - 'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small', - 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt', - 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input', - 'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet', - 'frame', 'frameset' - # fmt: on -] - - -def get_check_no_html(): - rep = ['<' + tag + r'[^\>]*>' for tag in HTML_TAGS] - rep += ['' for tag in HTML_TAGS] - pattern = re.compile('|'.join(rep)) - - def f(text): - return pattern.search(text.lower()) is None - - return f - - -_check_no_html = get_check_no_html() - - -def _is_url(url): - try: - result = urlparse(url) - except ValueError: - return False - if result.scheme not in ('http', 'https'): - return False - return True - - -@functools.lru_cache(maxsize=8192) -def _download_and_check_if_image(image_url: str) -> bool: - """Download an URL and check if the Content-Type starts with "image/" - This function should not be called directly: use _is_url_image - otherwise the cache of functools.lru_cache contains data: URL which might be huge. - """ - retry = 2 - - while retry > 0: - a = time() - try: - # use "image_proxy" (avoid HTTP/2) - network.set_context_network_name('image_proxy') - r, stream = network.stream( - 'GET', - image_url, - timeout=10.0, - allow_redirects=True, - headers={ - 'User-Agent': gen_useragent(), - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US;q=0.5,en;q=0.3', - 'DNT': '1', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'Sec-GPC': '1', - 'Cache-Control': 'max-age=0', - }, - ) - r.close() - if r.status_code == 200: - is_image = r.headers.get('content-type', '').startswith('image/') - else: - is_image = False - del r - del stream - return is_image - except httpx.TimeoutException: - logger.error('Timeout for %s: %i', image_url, int(time() - a)) - retry -= 1 - except httpx.HTTPError: - logger.exception('Exception for %s', image_url) - return False - return False - - -def _is_url_image(image_url) -> bool: - """Normalize image_url""" - if not isinstance(image_url, str): - return False - - if image_url.startswith('//'): - image_url = 'https:' + image_url - - if image_url.startswith('data:'): - return image_url.startswith('data:image/') - - if not _is_url(image_url): - return False - - return _download_and_check_if_image(image_url) - - -def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]: - return { - 'query': search_query.query, - 'lang': search_query.lang, - 'pageno': search_query.pageno, - 'safesearch': search_query.safesearch, - 'time_range': search_query.time_range, - } - - -def _search_query_diff( - sq1: SearchQuery, sq2: SearchQuery -) -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]: - param1 = _search_query_to_dict(sq1) - param2 = _search_query_to_dict(sq2) - common = {} - diff = {} - for k, value1 in param1.items(): - value2 = param2[k] - if value1 == value2: - common[k] = value1 - else: - diff[k] = (value1, value2) - return (common, diff) - - -class TestResults: # pylint: disable=missing-class-docstring - - __slots__ = 'errors', 'logs', 'languages' - - def __init__(self): - self.errors: typing.Dict[str, typing.List[str]] = {} - self.logs: typing.Dict[str, typing.List[typing.Any]] = {} - self.languages: typing.Set[str] = set() - - def add_error(self, test, message, *args): - # message to self.errors - errors_for_test = self.errors.setdefault(test, []) - if message not in errors_for_test: - errors_for_test.append(message) - # (message, *args) to self.logs - logs_for_test = self.logs.setdefault(test, []) - if (message, *args) not in logs_for_test: - logs_for_test.append((message, *args)) - - def add_language(self, language): - self.languages.add(language) - - @property - def successful(self): - return len(self.errors) == 0 - - def __iter__(self): - for test_name, errors in self.errors.items(): - for error in sorted(errors): - yield (test_name, error) - - -class ResultContainerTests: # pylint: disable=missing-class-docstring - - __slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results' - - def __init__( - self, test_results: TestResults, test_name: str, search_query: SearchQuery, result_container: ResultContainer - ): - self.test_name = test_name - self.search_query = search_query - self.result_container = result_container - self.languages: typing.Set[str] = set() - self.test_results = test_results - self.stop_test = False - - @property - def result_urls(self): - results = self.result_container.get_ordered_results() - return [result['url'] for result in results if 'url' in result] - - def _record_error(self, message: str, *args) -> None: - sq = _search_query_to_dict(self.search_query) - sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()]) - self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')') - - def _add_language(self, text: str) -> typing.Optional[str]: - langStr = detect_language(text) - if langStr: - self.languages.add(langStr) - self.test_results.add_language(langStr) - - def _check_result(self, result): - if not _check_no_html(result.get('title', '')): - self._record_error('HTML in title', repr(result.get('title', ''))) - if not _check_no_html(result.get('content', '')): - self._record_error('HTML in content', repr(result.get('content', ''))) - if result.get('url') is None: - self._record_error('url is None') - - self._add_language(result.get('title', '')) - self._add_language(result.get('content', '')) - - template = result.get('template', 'default.html') - if template == 'default.html': - return - if template == 'code.html': - return - if template == 'torrent.html': - return - if template == 'map.html': - return - if template == 'images.html': - thumbnail_src = result.get('thumbnail_src') - if thumbnail_src is not None: - if not _is_url_image(thumbnail_src): - self._record_error('thumbnail_src URL is invalid', thumbnail_src) - elif not _is_url_image(result.get('img_src')): - self._record_error('img_src URL is invalid', result.get('img_src')) - if template == 'videos.html' and not _is_url_image(result.get('thumbnail')): - self._record_error('thumbnail URL is invalid', result.get('img_src')) - - def _check_results(self, results: list): - for result in results: - self._check_result(result) - - def _check_answers(self, answers): - for answer in answers: - if not _check_no_html(answer): - self._record_error('HTML in answer', answer) - - def _check_infoboxes(self, infoboxes): - for infobox in infoboxes: - if not _check_no_html(infobox.get('content', '')): - self._record_error('HTML in infobox content', infobox.get('content', '')) - self._add_language(infobox.get('content', '')) - for attribute in infobox.get('attributes', {}): - if not _check_no_html(attribute.get('value', '')): - self._record_error('HTML in infobox attribute value', attribute.get('value', '')) - - def check_basic(self): - if len(self.result_container.unresponsive_engines) > 0: - for message in self.result_container.unresponsive_engines: - self._record_error(message[1] + ' ' + (message[2] or '')) - self.stop_test = True - return - - results = self.result_container.get_ordered_results() - if len(results) > 0: - self._check_results(results) - - if len(self.result_container.answers) > 0: - self._check_answers(self.result_container.answers) - - if len(self.result_container.infoboxes) > 0: - self._check_infoboxes(self.result_container.infoboxes) - - def has_infobox(self): - """Check the ResultContainer has at least one infobox""" - if len(self.result_container.infoboxes) == 0: - self._record_error('No infobox') - - def has_answer(self): - """Check the ResultContainer has at least one answer""" - if len(self.result_container.answers) == 0: - self._record_error('No answer') - - def has_language(self, lang): - """Check at least one title or content of the results is written in the `lang`. - - Detected using pycld3, may be not accurate""" - if lang not in self.languages: - self._record_error(lang + ' not found') - - def not_empty(self): - """Check the ResultContainer has at least one answer or infobox or result""" - result_types = set() - results = self.result_container.get_ordered_results() - if len(results) > 0: - result_types.add('results') - - if len(self.result_container.answers) > 0: - result_types.add('answers') - - if len(self.result_container.infoboxes) > 0: - result_types.add('infoboxes') - - if len(result_types) == 0: - self._record_error('No result') - - def one_title_contains(self, title: str): - """Check one of the title contains `title` (case insensitive comparison)""" - title = title.lower() - for result in self.result_container.get_ordered_results(): - if title in result['title'].lower(): - return - self._record_error(('{!r} not found in the title'.format(title))) - - -class CheckerTests: # pylint: disable=missing-class-docstring, too-few-public-methods - - __slots__ = 'test_results', 'test_name', 'result_container_tests_list' - - def __init__( - self, test_results: TestResults, test_name: str, result_container_tests_list: typing.List[ResultContainerTests] - ): - self.test_results = test_results - self.test_name = test_name - self.result_container_tests_list = result_container_tests_list - - def unique_results(self): - """Check the results of each ResultContainer is unique""" - urls_list = [rct.result_urls for rct in self.result_container_tests_list] - if len(urls_list[0]) > 0: - # results on the first page - for i, urls_i in enumerate(urls_list): - for j, urls_j in enumerate(urls_list): - if i < j and urls_i == urls_j: - common, diff = _search_query_diff( - self.result_container_tests_list[i].search_query, - self.result_container_tests_list[j].search_query, - ) - common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()]) - diff1_str = ', '.join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()]) - diff2_str = ', '.join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()]) - self.test_results.add_error( - self.test_name, - 'results are identical for {} and {} ({})'.format(diff1_str, diff2_str, common_str), - ) - - -class Checker: # pylint: disable=missing-class-docstring - - __slots__ = 'processor', 'tests', 'test_results' - - def __init__(self, processor: EngineProcessor): - self.processor = processor - self.tests = self.processor.get_tests() - self.test_results = TestResults() - - @property - def engineref_list(self): - engine_name = self.processor.engine_name - engine_category = self.processor.engine.categories[0] - return [EngineRef(engine_name, engine_category)] - - @staticmethod - def search_query_matrix_iterator(engineref_list, matrix): - p = [] - for name, values in matrix.items(): - if isinstance(values, (tuple, list)): - l = [(name, value) for value in values] - else: - l = [(name, values)] - p.append(l) - - for kwargs in itertools.product(*p): - kwargs = dict(kwargs) - query = kwargs['query'] - params = dict(kwargs) - del params['query'] - yield SearchQuery(query, engineref_list, **params) - - def call_test(self, obj, test_description): - if isinstance(test_description, (tuple, list)): - method, args = test_description[0], test_description[1:] - else: - method = test_description - args = () - if isinstance(method, str) and hasattr(obj, method): - getattr(obj, method)(*args) - elif isinstance(method, types.FunctionType): - method(*args) - else: - self.test_results.add_error( - obj.test_name, - 'method {!r} ({}) not found for {}'.format(method, method.__class__.__name__, obj.__class__.__name__), - ) - - def call_tests(self, obj, test_descriptions): - for test_description in test_descriptions: - self.call_test(obj, test_description) - - def search(self, search_query: SearchQuery) -> ResultContainer: - result_container = ResultContainer() - engineref_category = search_query.engineref_list[0].category - params = self.processor.get_params(search_query, engineref_category) - if params is not None: - counter_inc('engine', search_query.engineref_list[0].name, 'search', 'count', 'sent') - self.processor.search(search_query.query, params, result_container, default_timer(), 5) - return result_container - - def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests: - result_container = self.search(search_query) - result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container) - result_container_check.check_basic() - return result_container_check - - def run_test(self, test_name): - test_parameters = self.tests[test_name] - search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix'])) - rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list] - stop_test = False - if 'result_container' in test_parameters: - for rct in rct_list: - stop_test = stop_test or rct.stop_test - if not rct.stop_test: - self.call_tests(rct, test_parameters['result_container']) - if not stop_test: - if 'test' in test_parameters: - checker_tests = CheckerTests(self.test_results, test_name, rct_list) - self.call_tests(checker_tests, test_parameters['test']) - - def run(self): - for test_name in self.tests: - self.run_test(test_name) - # clear cache - _download_and_check_if_image.cache_clear() - # force a garbage collector - gc.collect() diff --git a/searx/search/checker/scheduler.lua b/searx/search/checker/scheduler.lua deleted file mode 100644 index ec318ddd6..000000000 --- a/searx/search/checker/scheduler.lua +++ /dev/null @@ -1,36 +0,0 @@ --- SPDX-License-Identifier: AGPL-3.0-or-later --- --- This script is not a string in scheduler.py, so editors can provide syntax highlighting. - --- The Valkey KEY is defined here and not in Python on purpose: --- only this LUA script can read and update this key to avoid lock and concurrency issues. -local valkey_key = 'SearXNG_checker_next_call_ts' - -local now = redis.call('TIME')[1] -local start_after_from = ARGV[1] -local start_after_to = ARGV[2] -local every_from = ARGV[3] -local every_to = ARGV[4] - -local next_call_ts = redis.call('GET', valkey_key) - -if (next_call_ts == false or next_call_ts == nil) then - -- the scheduler has never run on this Valkey instance, so: - -- 1/ the scheduler does not run now - -- 2/ the next call is a random time between start_after_from and start_after_to - local initial_delay = math.random(start_after_from, start_after_to) - redis.call('SET', valkey_key, now + initial_delay) - return { false, initial_delay } -end - --- next_call_ts is defined --- --> if now is lower than next_call_ts then we don't run the embedded checker --- --> if now is higher then we update next_call_ts and ask to run the embedded checker now. -local call_now = next_call_ts <= now -if call_now then - -- the checker runs now, define the timestamp of the next call: - -- this is a random delay between every_from and every_to - local periodic_delay = math.random(every_from, every_to) - next_call_ts = redis.call('INCRBY', valkey_key, periodic_delay) -end -return { call_now, next_call_ts - now } diff --git a/searx/search/checker/scheduler.py b/searx/search/checker/scheduler.py deleted file mode 100644 index b093a9ab7..000000000 --- a/searx/search/checker/scheduler.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# pylint: disable=missing-module-docstring -"""Lame scheduler which use Valkey as a source of truth: -* the Valkey key SearXNG_checker_next_call_ts contains the next time the embedded checker should run. -* to avoid lock, a unique Valkey script reads and updates the Valkey key SearXNG_checker_next_call_ts. -* this Valkey script returns a list of two elements: - * the first one is a boolean. If True, the embedded checker must run now in this worker. - * the second element is the delay in second to wait before the next call to the Valkey script. - -This scheduler is not generic on purpose: if more feature are required, a dedicate scheduler must be used -(= a better scheduler should not use the web workers) -""" - -import logging -import time -from pathlib import Path -from typing import Callable - -from searx.valkeydb import client as get_valkey_client -from searx.valkeylib import lua_script_storage - - -logger = logging.getLogger('searx.search.checker') - -SCHEDULER_LUA = Path(__file__).parent / "scheduler.lua" - - -def scheduler_function(start_after_from: int, start_after_to: int, every_from: int, every_to: int, callback: Callable): - """Run the checker periodically. The function never returns. - - Parameters: - * start_after_from and start_after_to: when to call "callback" for the first on the Valkey instance - * every_from and every_to: after the first call, how often to call "callback" - - There is no issue: - * to call this function is multiple workers - * to kill workers at any time as long there is one at least one worker - """ - scheduler_now_script = SCHEDULER_LUA.open().read() - while True: - # ask the Valkey script what to do - # the script says - # * if the checker must run now. - # * how to long to way before calling the script again (it can be call earlier, but not later). - script = lua_script_storage(get_valkey_client(), scheduler_now_script) - call_now, wait_time = script(args=[start_after_from, start_after_to, every_from, every_to]) - - # does the worker run the checker now? - if call_now: - # run the checker - try: - callback() - except Exception: # pylint: disable=broad-except - logger.exception("Error calling the embedded checker") - # only worker display the wait_time - logger.info("Next call to the checker in %s seconds", wait_time) - # wait until the next call - time.sleep(wait_time) diff --git a/searx/settings.yml b/searx/settings.yml index af2bad478..6fb24adf8 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -277,53 +277,6 @@ plugins: # '(.*\.)?youtu\.be$': 'yt.example.com' # -checker: - # disable checker when in debug mode - off_when_debug: true - - # use "scheduling: {}" to disable scheduling - # scheduling: interval or int - - # to activate the scheduler: - # * uncomment "scheduling" section - # * add "cache2 = name=searxngcache,items=2000,blocks=2000,blocksize=4096,bitmap=1" - # to your uwsgi.ini - - # scheduling: - # start_after: [300, 1800] # delay to start the first run of the checker - # every: [86400, 90000] # how often the checker runs - - # additional tests: only for the YAML anchors (see the engines section) - # - additional_tests: - rosebud: &test_rosebud - matrix: - query: rosebud - lang: en - result_container: - - not_empty - - ['one_title_contains', 'citizen kane'] - test: - - unique_results - - android: &test_android - matrix: - query: ['android'] - lang: ['en', 'de', 'fr', 'zh-CN'] - result_container: - - not_empty - - ['one_title_contains', 'google'] - test: - - unique_results - - # tests: only for the YAML anchors (see the engines section) - tests: - infobox: &tests_infobox - infobox: - matrix: - query: ["linux", "new york", "bbc"] - result_container: - - has_infobox categories_as_tabs: general: @@ -746,7 +699,6 @@ engines: shortcut: ddd weight: 2 disabled: true - tests: *tests_infobox # cloudflare protected # - name: digbt @@ -820,7 +772,6 @@ engines: weight: 2 # add "list" to the array to get results in the results list display_type: ["infobox"] - tests: *tests_infobox categories: [general] - name: duckduckgo @@ -1053,32 +1004,18 @@ engines: - name: google engine: google shortcut: go - # additional_tests: - # android: *test_android - name: google images engine: google_images shortcut: goi - # additional_tests: - # android: *test_android - # dali: - # matrix: - # query: ['Dali Christ'] - # lang: ['en', 'de', 'fr', 'zh-CN'] - # result_container: - # - ['one_title_contains', 'Salvador'] - name: google news engine: google_news shortcut: gon - # additional_tests: - # android: *test_android - name: google videos engine: google_videos shortcut: gov - # additional_tests: - # android: *test_android - name: google scholar engine: google_scholar @@ -1784,8 +1721,6 @@ engines: shortcut: qw categories: [general, web] disabled: true - additional_tests: - rosebud: *test_rosebud - name: qwant news qwant_categ: news @@ -2021,8 +1956,6 @@ engines: shortcut: sp startpage_categ: web categories: [general, web] - additional_tests: - rosebud: *test_rosebud - name: startpage news engine: startpage @@ -2245,8 +2178,6 @@ engines: base_url: "https://{language}.wikiquote.org/" search_type: text disabled: true - additional_tests: - rosebud: *test_rosebud about: website: https://www.wikiquote.org/ wikidata_id: Q369 @@ -2273,16 +2204,6 @@ engines: about: website: https://species.wikimedia.org/ wikidata_id: Q13679 - tests: - wikispecies: - matrix: - query: "Campbell, L.I. et al. 2011: MicroRNAs" - lang: en - result_container: - - not_empty - - ['one_title_contains', 'Tardigrada'] - test: - - unique_results - name: wiktionary engine: mediawiki diff --git a/searx/settings_defaults.py b/searx/settings_defaults.py index 5c17b248e..2b82dce14 100644 --- a/searx/settings_defaults.py +++ b/searx/settings_defaults.py @@ -267,10 +267,6 @@ SCHEMA: dict[str, t.Any] = { 'networks': {}, }, 'plugins': SettingsValue(dict, {}), - 'checker': { - 'off_when_debug': SettingsValue(bool, True, None), - 'scheduling': SettingsValue((None, dict), None, None), - }, 'categories_as_tabs': SettingsValue(dict, CATEGORIES_AS_TABS), 'engines': SettingsValue(list, []), 'doi_resolvers': {}, diff --git a/searx/templates/simple/new_issue.html b/searx/templates/simple/new_issue.html index 142da727a..f2d99c5f9 100644 --- a/searx/templates/simple/new_issue.html +++ b/searx/templates/simple/new_issue.html @@ -54,10 +54,6 @@ or manually by executing the searx/webapp.py file? --> {{' '}}* Function: `{{ error.function }}` {{' '}}* Code: `{{ error.code }}` {{'\n'-}} -{%- endfor -%} -{%- for test_name, results in engine_reliability.checker.items() -%} -{%- if loop.first %}Checker{% endif -%} -{{-'\n '}}* {{ test_name }}: {% for result in results%}`{{ result }}`,{% endfor -%} {%- endfor -%} diff --git a/searx/templates/simple/preferences.html b/searx/templates/simple/preferences.html index ca3a3f087..a496bd3ae 100644 --- a/searx/templates/simple/preferences.html +++ b/searx/templates/simple/preferences.html @@ -64,7 +64,7 @@ {%- if search_engine.enable_http -%}

{{- icon_big('exclamation-sign', 'No HTTPS') -}}{{- _('No HTTPS')-}}

{% endif -%} - {%- if reliabilities.get(search_engine.name, {}).errors or reliabilities.get(search_engine.name, {}).checker -%} + {%- if reliabilities.get(search_engine.name, {}).errors -%} {{- _('View error logs and submit a bug report') -}} @@ -109,7 +109,6 @@ {%- macro engine_reliability(engine_name) -%} {%- set r = reliabilities.get(engine_name, {}).get('reliability', None) -%} - {%- set checker_result = reliabilities.get(engine_name, {}).get('checker', []) -%} {%- set errors = reliabilities.get(engine_name, {}).get('errors', []) -%} {%- if r != None -%} {%- if r <= 50 -%} @@ -124,7 +123,7 @@ {% else %} {%- set r = '' -%} {%- endif -%} - {%- if checker_result or errors -%} + {%- if errors -%} {{- '' -}} {{- '' -}} @@ -132,9 +131,6 @@ {{- '' -}} {{- '' -}} {% endif %} diff --git a/searx/webapp.py b/searx/webapp.py index d965cf280..23dd56e66 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -118,7 +118,6 @@ from searx.valkeydb import initialize as valkey_initialize from searx.sxng_locales import sxng_locales import searx.search from searx.network import stream as http_stream, set_context_network_name -from searx.search.checker import get_result as checker_get_result logger = logger.getChild('webapp') @@ -929,23 +928,11 @@ def preferences(): # reliabilities reliabilities = {} engine_errors = get_engine_errors(filtered_engines) - checker_results = checker_get_result() - checker_results = ( - checker_results['engines'] if checker_results['status'] == 'ok' and 'engines' in checker_results else {} - ) for _, e in filtered_engines.items(): - checker_result = checker_results.get(e.name, {}) - checker_success = checker_result.get('success', True) errors = engine_errors.get(e.name) or [] if counter('engine', e.name, 'search', 'count', 'sent') == 0: # no request reliability = None - elif checker_success and not errors: - reliability = 100 - elif 'simple' in checker_result.get('errors', {}): - # the basic (simple) test doesn't work: the engine is broken according to the checker - # even if there is no exception - reliability = 0 else: # pylint: disable=consider-using-generator reliability = 100 - sum([error['percentage'] for error in errors if not error.get('secondary')]) @@ -953,10 +940,7 @@ def preferences(): reliabilities[e.name] = { 'reliability': reliability, 'errors': [], - 'checker': checker_results.get(e.name, {}).get('errors', {}).keys(), } - # keep the order of the list checker_results[e.name]['errors'] and deduplicate. - # the first element has the highest percentage rate. reliabilities_errors = [] for error in errors: error_user_text = None @@ -977,13 +961,6 @@ def preferences(): ) safesearch = e.safesearch time_range_support = e.time_range_support - for checker_test_name in checker_results.get(e.name, {}).get('errors', {}): - if supports_selected_language and checker_test_name.startswith('lang_'): - supports_selected_language = '?' - elif safesearch and checker_test_name == 'safesearch': - safesearch = '?' - elif time_range_support and checker_test_name == 'time_range': - time_range_support = '?' supports[e.name] = { 'supports_selected_language': supports_selected_language, 'safesearch': safesearch, @@ -1133,13 +1110,8 @@ def stats(): else: filtered_engines = [selected_engine_name] - checker_results = checker_get_result() - checker_results = ( - checker_results['engines'] if checker_results['status'] == 'ok' and 'engines' in checker_results else {} - ) - engine_stats = get_engines_stats(filtered_engines) - engine_reliabilities = get_reliabilities(filtered_engines, checker_results) + engine_reliabilities = get_reliabilities(filtered_engines) if sort_order not in STATS_SORT_PARAMETERS: sort_order = 'name' @@ -1194,12 +1166,6 @@ def stats_errors(): return jsonify(result) -@app.route('/stats/checker', methods=['GET']) -def stats_checker(): - result = checker_get_result() - return jsonify(result) - - @app.route('/metrics') def stats_open_metrics(): password = settings['general'].get("open_metrics") @@ -1212,13 +1178,8 @@ def stats_open_metrics(): filtered_engines = dict(filter(lambda kv: sxng_request.preferences.validate_token(kv[1]), engines.items())) - checker_results = checker_get_result() - checker_results = ( - checker_results['engines'] if checker_results['status'] == 'ok' and 'engines' in checker_results else {} - ) - engine_stats = get_engines_stats(filtered_engines) - engine_reliabilities = get_reliabilities(filtered_engines, checker_results) + engine_reliabilities = get_reliabilities(filtered_engines) metrics_text = openmetrics(engine_stats, engine_reliabilities) return Response(metrics_text, mimetype='text/plain') @@ -1394,7 +1355,7 @@ def init(): searx.plugins.initialize(app) metrics: bool = get_setting("general.enable_metrics") # type: ignore - searx.search.initialize(enable_checker=True, check_network=True, enable_metrics=metrics) + searx.search.initialize(check_network=True, enable_metrics=metrics) limiter.initialize(app, settings) favicons.init() diff --git a/setup.py b/setup.py index 2f3667b6e..0518ef42d 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ setup( ], project_urls={"Code": GIT_URL, "Issue tracker": get_setting('brand.issue_url')}, entry_points={ - 'console_scripts': ['searxng-run = searx.webapp:run', 'searxng-checker = searx.search.checker.__main__:main'] + 'console_scripts': ['searxng-run = searx.webapp:run'] }, packages=find_packages( include=[ @@ -54,7 +54,6 @@ setup( 'settings.yml', '*.toml', '*.msg', - 'search/checker/scheduler.lua', 'data/*.json', 'data/*.txt', 'data/*.ftz', diff --git a/tests/__init__.py b/tests/__init__.py index 9c176aedf..8b35161d9 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -82,7 +82,6 @@ class SearxTestCase(aiounittest.AsyncTestCase): # - initialize searx.network, searx.metrics, searx.processors and searx.search.checker searx.search.initialize( - enable_checker=True, check_network=True, enable_metrics=searx.get_setting("general.enable_metrics"), # type: ignore )