[mod] drop SearXNG's checker (#5767)

To date, there is no analysis for the checker that could be evaluated in any
meaningful way.

- https://github.com/searxng/searxng/issues/3407
- https://github.com/searxng/searxng/pull/3312

The checker would need to be completely redesigned, but even then, its
usefulness and the maintenance required for it would be disproportionate.

TBH: In its current form, it is useless and only consumes resources and
causes the engines to be blocked, because these tests (query terms) come
from *hundreds* of instances and could be interpreted as bot attacks.

Related issues: [search.checker](https://github.com/searxng/searxng/issues?q=label%3A%22search.checker%22)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser
2026-02-22 14:44:21 +01:00
committed by GitHub
parent 5054e69844
commit 191818b865
19 changed files with 8 additions and 1037 deletions
-7
View File
@@ -17,7 +17,6 @@ help:
@echo 'install - developer install of SearxNG into virtualenv'
@echo 'uninstall - uninstall developer installation'
@echo 'clean - clean up working tree'
@echo 'search.checker - check search engines'
@echo 'test - run shell & CI tests'
@echo 'test.shell - test shell scripts'
@echo 'ci.test - run CI tests'
@@ -39,12 +38,6 @@ clean: py.clean docs.clean node.clean nvm.clean go.clean test.clean
$(Q)find . -name '*~' -exec rm -f {} +
$(Q)find . -name '*.bak' -exec rm -f {} +
PHONY += search.checker search.checker.%
search.checker: install
$(Q)./manage pyenv.cmd searxng-checker -v
search.checker.%: install
$(Q)./manage pyenv.cmd searxng-checker -v "$(subst _, ,$(patsubst search.checker.%,%,$@))"
PHONY += test ci.test test.shell
test: test.yamllint test.black test.pyright_modified test.pylint test.unit test.robot test.rst test.shell test.shfmt
-32
View File
@@ -291,38 +291,6 @@ Pylint_ is known as one of the best source-code, bug and quality checker for the
Python programming language. The pylint profile used in the SearXNG project is
found in project's root folder :origin:`.pylintrc`.
.. _make search.checker:
``make search.checker.{engine name}``
=====================================
To check all engines::
make search.checker
To check an engine with whitespace in its name, like *google news*, replace the space
with an underscore::
make search.checker.google_news
To see HTTP requests and more use SEARXNG_DEBUG::
make SEARXNG_DEBUG=1 search.checker.google_news
.. _3xx: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#3xx_redirection
To filter out HTTP redirects (3xx_)::
make SEARXNG_DEBUG=1 search.checker.google_news | grep -A1 "HTTP/1.1\" 3[0-9][0-9]"
...
Engine google news Checking
https://news.google.com:443 "GET /search?q=life&hl=en&lr=lang_en&ie=utf8&oe=utf8&ceid=US%3Aen&gl=US HTTP/1.1" 302 0
https://news.google.com:443 "GET /search?q=life&hl=en-US&lr=lang_en&ie=utf8&oe=utf8&ceid=US:en&gl=US HTTP/1.1" 200 None
--
https://news.google.com:443 "GET /search?q=computer&hl=en&lr=lang_en&ie=utf8&oe=utf8&ceid=US%3Aen&gl=US HTTP/1.1" 302 0
https://news.google.com:443 "GET /search?q=computer&hl=en-US&lr=lang_en&ie=utf8&oe=utf8&ceid=US:en&gl=US HTTP/1.1" 200 None
--
.. _make themes:
``make themes.*``
+1 -1
View File
@@ -60,7 +60,7 @@ paging = False
time_range_support = False
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# False here, otherwise checker will report safesearch-errors::
# False here.
#
# safesearch : results are identical for safesearch=0 and safesearch=2
safesearch = True
+1 -10
View File
@@ -139,26 +139,18 @@ def get_engine_errors(engline_name_list):
return result
def get_reliabilities(engline_name_list, checker_results):
def get_reliabilities(engline_name_list):
reliabilities = {}
engine_errors = get_engine_errors(engline_name_list)
for engine_name in engline_name_list:
checker_result = checker_results.get(engine_name, {})
checker_success = checker_result.get('success', True)
errors = engine_errors.get(engine_name) or []
sent_count = counter('engine', engine_name, 'search', 'count', 'sent')
if sent_count == 0:
# no request
reliability = None
elif checker_success and not errors:
reliability = 100
elif 'simple' in checker_result.get('errors', {}):
# the basic (simple) test doesn't work: the engine is broken according to the checker
# even if there is no exception
reliability = 0
else:
# pylint: disable=consider-using-generator
reliability = 100 - sum([error['percentage'] for error in errors if not error.get('secondary')])
@@ -167,7 +159,6 @@ def get_reliabilities(engline_name_list, checker_results):
'reliability': reliability,
'sent_count': sent_count,
'errors': errors,
'checker': checker_result.get('errors', {}),
}
return reliabilities
-4
View File
@@ -20,7 +20,6 @@ from searx.external_bang import get_bang_url
from searx.metrics import initialize as initialize_metrics, counter_inc
from searx.network import initialize as initialize_network, check_network_configuration
from searx.results import ResultContainer
from searx.search.checker import initialize as initialize_checker
from searx.search.processors import PROCESSORS
from searx.search.processors.abstract import RequestParams
@@ -33,7 +32,6 @@ logger = logger.getChild('search')
def initialize(
settings_engines: list[dict[str, t.Any]] = None, # pyright: ignore[reportArgumentType]
enable_checker: bool = False,
check_network: bool = False,
enable_metrics: bool = True,
):
@@ -44,8 +42,6 @@ def initialize(
check_network_configuration()
initialize_metrics([engine['name'] for engine in settings_engines], enable_metrics)
PROCESSORS.init(settings_engines)
if enable_checker:
initialize_checker()
class Search:
-7
View File
@@ -1,7 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
from .impl import Checker
from .background import initialize, get_result
__all__ = ('Checker', 'initialize', 'get_result')
-118
View File
@@ -1,118 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
import sys
import io
import os
import argparse
import logging
import searx.search
import searx.search.checker
from searx.search import PROCESSORS
from searx.engines import engine_shortcuts
# configure logging
root = logging.getLogger()
handler = logging.StreamHandler(sys.stdout)
for h in root.handlers:
root.removeHandler(h)
root.addHandler(handler)
# color only for a valid terminal
if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
RESET_SEQ = "\033[0m"
COLOR_SEQ = "\033[1;%dm"
BOLD_SEQ = "\033[1m"
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = map(lambda i: COLOR_SEQ % (30 + i), range(8))
else:
RESET_SEQ = ""
COLOR_SEQ = ""
BOLD_SEQ = ""
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
# equivalent of 'python -u' (unbuffered stdout, stderr)
stdout = io.TextIOWrapper(
# pylint: disable=consider-using-with
open(sys.stdout.fileno(), 'wb', 0),
write_through=True,
)
stderr = io.TextIOWrapper(
# pylint: disable=consider-using-with
open(sys.stderr.fileno(), 'wb', 0),
write_through=True,
)
# iterator of processors
def iter_processor(engine_name_list):
if len(engine_name_list) > 0:
for name in engine_name_list:
name = engine_shortcuts.get(name, name)
processor = PROCESSORS.get(name)
if processor is not None:
yield name, processor
else:
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RED}Engine does not exist{RESET_SEQ}\n')
else:
for name, processor in searx.search.PROCESSORS.items():
yield name, processor
# actual check & display
def run(engine_name_list, verbose):
searx.search.initialize()
name_checker_list = []
for name, processor in iter_processor(engine_name_list):
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
if not sys.stdout.isatty():
stderr.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}Checking\n')
checker = searx.search.checker.Checker(processor)
checker.run()
name_checker_list.append((name, checker))
stdout.write(f'\n== {BOLD_SEQ}Results{RESET_SEQ} ' + '=' * 70 + '\n')
for name, checker in name_checker_list:
if checker.test_results.successful:
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{GREEN}OK{RESET_SEQ}\n')
if verbose:
stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n')
else:
stdout.write(f'{BOLD_SEQ}Engine {name:30}{RESET_SEQ}{RESET_SEQ}{RED}Error{RESET_SEQ}')
if not verbose:
errors = [test_name + ': ' + error for test_name, error in checker.test_results]
stdout.write(f'{RED}Error {str(errors)}{RESET_SEQ}\n')
else:
stdout.write('\n')
stdout.write(f' {"found languages":15}: {" ".join(sorted(list(checker.test_results.languages)))}\n')
for test_name, logs in checker.test_results.logs.items():
for log in logs:
log = map(lambda l: l if isinstance(l, str) else repr(l), log)
stdout.write(f' {test_name:15}: {RED}{" ".join(log)}{RESET_SEQ}\n')
# call by setup.py
def main():
parser = argparse.ArgumentParser(description='Check SearXNG engines.')
parser.add_argument(
'engine_name_list',
metavar='engine name',
type=str,
nargs='*',
help='engines name or shortcut list. Empty for all engines.',
)
parser.add_argument(
'--verbose',
'-v',
action='store_true',
dest='verbose',
help='Display details about the test results',
default=False,
)
args = parser.parse_args()
run(args.engine_name_list, args.verbose)
if __name__ == '__main__':
main()
-168
View File
@@ -1,168 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, cyclic-import
import json
import time
import threading
import os
import signal
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
import valkey.exceptions
from searx import logger, settings, sxng_debug
from searx.valkeydb import client as get_valkey_client
from searx.exceptions import SearxSettingsException
from searx.search.processors import PROCESSORS
from searx.search.checker import Checker
from searx.search.checker.scheduler import scheduler_function
VALKEY_RESULT_KEY = 'SearXNG_checker_result'
VALKEY_LOCK_KEY = 'SearXNG_checker_lock'
CheckerResult = Union['CheckerOk', 'CheckerErr', 'CheckerOther']
class CheckerOk(TypedDict):
"""Checking the engines succeeded"""
status: Literal['ok']
engines: Dict[str, 'EngineResult']
timestamp: int
class CheckerErr(TypedDict):
"""Checking the engines failed"""
status: Literal['error']
timestamp: int
class CheckerOther(TypedDict):
"""The status is unknown or disabled"""
status: Literal['unknown', 'disabled']
EngineResult = Union['EngineOk', 'EngineErr']
class EngineOk(TypedDict):
"""Checking the engine succeeded"""
success: Literal[True]
class EngineErr(TypedDict):
"""Checking the engine failed"""
success: Literal[False]
errors: Dict[str, List[str]]
def _get_interval(every: Any, error_msg: str) -> Tuple[int, int]:
if isinstance(every, int):
return (every, every)
if (
not isinstance(every, (tuple, list))
or len(every) != 2 # type: ignore
or not isinstance(every[0], int)
or not isinstance(every[1], int)
):
raise SearxSettingsException(error_msg, None)
return (every[0], every[1])
def get_result() -> CheckerResult:
client = get_valkey_client()
if client is None:
# without Valkey, the checker is disabled
return {'status': 'disabled'}
serialized_result: Optional[bytes] = client.get(VALKEY_RESULT_KEY)
if serialized_result is None:
# the Valkey key does not exist
return {'status': 'unknown'}
return json.loads(serialized_result)
def _set_result(result: CheckerResult):
client = get_valkey_client()
if client is None:
# without Valkey, the function does nothing
return
client.set(VALKEY_RESULT_KEY, json.dumps(result))
def _timestamp():
return int(time.time() / 3600) * 3600
def run():
try:
# use a Valkey lock to make sure there is no checker running at the same time
# (this should not happen, this is a safety measure)
with get_valkey_client().lock(VALKEY_LOCK_KEY, blocking_timeout=60, timeout=3600):
logger.info('Starting checker')
result: CheckerOk = {'status': 'ok', 'engines': {}, 'timestamp': _timestamp()}
for name, processor in PROCESSORS.items():
logger.debug('Checking %s engine', name)
checker = Checker(processor)
checker.run()
if checker.test_results.successful:
result['engines'][name] = {'success': True}
else:
result['engines'][name] = {'success': False, 'errors': checker.test_results.errors}
_set_result(result)
logger.info('Check done')
except valkey.exceptions.LockError:
_set_result({'status': 'error', 'timestamp': _timestamp()})
logger.exception('Error while running the checker')
except Exception: # pylint: disable=broad-except
_set_result({'status': 'error', 'timestamp': _timestamp()})
logger.exception('Error while running the checker')
def _signal_handler(_signum: int, _frame: Any):
t = threading.Thread(target=run)
t.daemon = True
t.start()
def initialize():
if hasattr(signal, 'SIGUSR1'):
# Windows doesn't support SIGUSR1
logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
signal.signal(signal.SIGUSR1, _signal_handler)
# special case when debug is activate
if sxng_debug and settings['checker']['off_when_debug']:
logger.info('debug mode: checker is disabled')
return
# check value of checker.scheduling.every now
scheduling = settings['checker']['scheduling']
if scheduling is None or not scheduling:
logger.info('Checker scheduler is disabled')
return
# make sure there is a Valkey connection
if get_valkey_client() is None:
logger.error('The checker requires Valkey')
return
# start the background scheduler
every_range = _get_interval(scheduling.get('every', (300, 1800)), 'checker.scheduling.every is not a int or list')
start_after_range = _get_interval(
scheduling.get('start_after', (300, 1800)), 'checker.scheduling.start_after is not a int or list'
)
t = threading.Thread(
target=scheduler_function,
args=(start_after_range[0], start_after_range[1], every_range[0], every_range[1], run),
name='checker_scheduler',
)
t.daemon = True
t.start()
-441
View File
@@ -1,441 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, invalid-name
import gc
import typing
import types
import functools
import itertools
from time import time
from timeit import default_timer
from urllib.parse import urlparse
import re
import httpx
from searx import network, logger
from searx.utils import gen_useragent, detect_language
from searx.results import ResultContainer
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
from searx.metrics import counter_inc
logger = logger.getChild('searx.search.checker')
HTML_TAGS = [
# fmt: off
'embed', 'iframe', 'object', 'param', 'picture', 'source', 'svg', 'math', 'canvas', 'noscript', 'script',
'del', 'ins', 'area', 'audio', 'img', 'map', 'track', 'video', 'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite',
'code', 'data', 'dfn', 'em', 'i', 'kdb', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'small',
'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr', 'style', 'blockquote', 'dd', 'div', 'dl', 'dt',
'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul', 'button', 'datalist', 'fieldset', 'form', 'input',
'label', 'legend', 'meter', 'optgroup', 'option', 'output', 'progress', 'select', 'textarea', 'applet',
'frame', 'frameset'
# fmt: on
]
def get_check_no_html():
rep = ['<' + tag + r'[^\>]*>' for tag in HTML_TAGS]
rep += ['</' + tag + '>' for tag in HTML_TAGS]
pattern = re.compile('|'.join(rep))
def f(text):
return pattern.search(text.lower()) is None
return f
_check_no_html = get_check_no_html()
def _is_url(url):
try:
result = urlparse(url)
except ValueError:
return False
if result.scheme not in ('http', 'https'):
return False
return True
@functools.lru_cache(maxsize=8192)
def _download_and_check_if_image(image_url: str) -> bool:
"""Download an URL and check if the Content-Type starts with "image/"
This function should not be called directly: use _is_url_image
otherwise the cache of functools.lru_cache contains data: URL which might be huge.
"""
retry = 2
while retry > 0:
a = time()
try:
# use "image_proxy" (avoid HTTP/2)
network.set_context_network_name('image_proxy')
r, stream = network.stream(
'GET',
image_url,
timeout=10.0,
allow_redirects=True,
headers={
'User-Agent': gen_useragent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US;q=0.5,en;q=0.3',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-GPC': '1',
'Cache-Control': 'max-age=0',
},
)
r.close()
if r.status_code == 200:
is_image = r.headers.get('content-type', '').startswith('image/')
else:
is_image = False
del r
del stream
return is_image
except httpx.TimeoutException:
logger.error('Timeout for %s: %i', image_url, int(time() - a))
retry -= 1
except httpx.HTTPError:
logger.exception('Exception for %s', image_url)
return False
return False
def _is_url_image(image_url) -> bool:
"""Normalize image_url"""
if not isinstance(image_url, str):
return False
if image_url.startswith('//'):
image_url = 'https:' + image_url
if image_url.startswith('data:'):
return image_url.startswith('data:image/')
if not _is_url(image_url):
return False
return _download_and_check_if_image(image_url)
def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
return {
'query': search_query.query,
'lang': search_query.lang,
'pageno': search_query.pageno,
'safesearch': search_query.safesearch,
'time_range': search_query.time_range,
}
def _search_query_diff(
sq1: SearchQuery, sq2: SearchQuery
) -> typing.Tuple[typing.Dict[str, typing.Any], typing.Dict[str, typing.Any]]:
param1 = _search_query_to_dict(sq1)
param2 = _search_query_to_dict(sq2)
common = {}
diff = {}
for k, value1 in param1.items():
value2 = param2[k]
if value1 == value2:
common[k] = value1
else:
diff[k] = (value1, value2)
return (common, diff)
class TestResults: # pylint: disable=missing-class-docstring
__slots__ = 'errors', 'logs', 'languages'
def __init__(self):
self.errors: typing.Dict[str, typing.List[str]] = {}
self.logs: typing.Dict[str, typing.List[typing.Any]] = {}
self.languages: typing.Set[str] = set()
def add_error(self, test, message, *args):
# message to self.errors
errors_for_test = self.errors.setdefault(test, [])
if message not in errors_for_test:
errors_for_test.append(message)
# (message, *args) to self.logs
logs_for_test = self.logs.setdefault(test, [])
if (message, *args) not in logs_for_test:
logs_for_test.append((message, *args))
def add_language(self, language):
self.languages.add(language)
@property
def successful(self):
return len(self.errors) == 0
def __iter__(self):
for test_name, errors in self.errors.items():
for error in sorted(errors):
yield (test_name, error)
class ResultContainerTests: # pylint: disable=missing-class-docstring
__slots__ = 'test_name', 'search_query', 'result_container', 'languages', 'stop_test', 'test_results'
def __init__(
self, test_results: TestResults, test_name: str, search_query: SearchQuery, result_container: ResultContainer
):
self.test_name = test_name
self.search_query = search_query
self.result_container = result_container
self.languages: typing.Set[str] = set()
self.test_results = test_results
self.stop_test = False
@property
def result_urls(self):
results = self.result_container.get_ordered_results()
return [result['url'] for result in results if 'url' in result]
def _record_error(self, message: str, *args) -> None:
sq = _search_query_to_dict(self.search_query)
sqstr = ' '.join(['{}={!r}'.format(k, v) for k, v in sq.items()])
self.test_results.add_error(self.test_name, message, *args, '(' + sqstr + ')')
def _add_language(self, text: str) -> typing.Optional[str]:
langStr = detect_language(text)
if langStr:
self.languages.add(langStr)
self.test_results.add_language(langStr)
def _check_result(self, result):
if not _check_no_html(result.get('title', '')):
self._record_error('HTML in title', repr(result.get('title', '')))
if not _check_no_html(result.get('content', '')):
self._record_error('HTML in content', repr(result.get('content', '')))
if result.get('url') is None:
self._record_error('url is None')
self._add_language(result.get('title', ''))
self._add_language(result.get('content', ''))
template = result.get('template', 'default.html')
if template == 'default.html':
return
if template == 'code.html':
return
if template == 'torrent.html':
return
if template == 'map.html':
return
if template == 'images.html':
thumbnail_src = result.get('thumbnail_src')
if thumbnail_src is not None:
if not _is_url_image(thumbnail_src):
self._record_error('thumbnail_src URL is invalid', thumbnail_src)
elif not _is_url_image(result.get('img_src')):
self._record_error('img_src URL is invalid', result.get('img_src'))
if template == 'videos.html' and not _is_url_image(result.get('thumbnail')):
self._record_error('thumbnail URL is invalid', result.get('img_src'))
def _check_results(self, results: list):
for result in results:
self._check_result(result)
def _check_answers(self, answers):
for answer in answers:
if not _check_no_html(answer):
self._record_error('HTML in answer', answer)
def _check_infoboxes(self, infoboxes):
for infobox in infoboxes:
if not _check_no_html(infobox.get('content', '')):
self._record_error('HTML in infobox content', infobox.get('content', ''))
self._add_language(infobox.get('content', ''))
for attribute in infobox.get('attributes', {}):
if not _check_no_html(attribute.get('value', '')):
self._record_error('HTML in infobox attribute value', attribute.get('value', ''))
def check_basic(self):
if len(self.result_container.unresponsive_engines) > 0:
for message in self.result_container.unresponsive_engines:
self._record_error(message[1] + ' ' + (message[2] or ''))
self.stop_test = True
return
results = self.result_container.get_ordered_results()
if len(results) > 0:
self._check_results(results)
if len(self.result_container.answers) > 0:
self._check_answers(self.result_container.answers)
if len(self.result_container.infoboxes) > 0:
self._check_infoboxes(self.result_container.infoboxes)
def has_infobox(self):
"""Check the ResultContainer has at least one infobox"""
if len(self.result_container.infoboxes) == 0:
self._record_error('No infobox')
def has_answer(self):
"""Check the ResultContainer has at least one answer"""
if len(self.result_container.answers) == 0:
self._record_error('No answer')
def has_language(self, lang):
"""Check at least one title or content of the results is written in the `lang`.
Detected using pycld3, may be not accurate"""
if lang not in self.languages:
self._record_error(lang + ' not found')
def not_empty(self):
"""Check the ResultContainer has at least one answer or infobox or result"""
result_types = set()
results = self.result_container.get_ordered_results()
if len(results) > 0:
result_types.add('results')
if len(self.result_container.answers) > 0:
result_types.add('answers')
if len(self.result_container.infoboxes) > 0:
result_types.add('infoboxes')
if len(result_types) == 0:
self._record_error('No result')
def one_title_contains(self, title: str):
"""Check one of the title contains `title` (case insensitive comparison)"""
title = title.lower()
for result in self.result_container.get_ordered_results():
if title in result['title'].lower():
return
self._record_error(('{!r} not found in the title'.format(title)))
class CheckerTests: # pylint: disable=missing-class-docstring, too-few-public-methods
__slots__ = 'test_results', 'test_name', 'result_container_tests_list'
def __init__(
self, test_results: TestResults, test_name: str, result_container_tests_list: typing.List[ResultContainerTests]
):
self.test_results = test_results
self.test_name = test_name
self.result_container_tests_list = result_container_tests_list
def unique_results(self):
"""Check the results of each ResultContainer is unique"""
urls_list = [rct.result_urls for rct in self.result_container_tests_list]
if len(urls_list[0]) > 0:
# results on the first page
for i, urls_i in enumerate(urls_list):
for j, urls_j in enumerate(urls_list):
if i < j and urls_i == urls_j:
common, diff = _search_query_diff(
self.result_container_tests_list[i].search_query,
self.result_container_tests_list[j].search_query,
)
common_str = ' '.join(['{}={!r}'.format(k, v) for k, v in common.items()])
diff1_str = ', '.join(['{}={!r}'.format(k, v1) for (k, (v1, v2)) in diff.items()])
diff2_str = ', '.join(['{}={!r}'.format(k, v2) for (k, (v1, v2)) in diff.items()])
self.test_results.add_error(
self.test_name,
'results are identical for {} and {} ({})'.format(diff1_str, diff2_str, common_str),
)
class Checker: # pylint: disable=missing-class-docstring
__slots__ = 'processor', 'tests', 'test_results'
def __init__(self, processor: EngineProcessor):
self.processor = processor
self.tests = self.processor.get_tests()
self.test_results = TestResults()
@property
def engineref_list(self):
engine_name = self.processor.engine_name
engine_category = self.processor.engine.categories[0]
return [EngineRef(engine_name, engine_category)]
@staticmethod
def search_query_matrix_iterator(engineref_list, matrix):
p = []
for name, values in matrix.items():
if isinstance(values, (tuple, list)):
l = [(name, value) for value in values]
else:
l = [(name, values)]
p.append(l)
for kwargs in itertools.product(*p):
kwargs = dict(kwargs)
query = kwargs['query']
params = dict(kwargs)
del params['query']
yield SearchQuery(query, engineref_list, **params)
def call_test(self, obj, test_description):
if isinstance(test_description, (tuple, list)):
method, args = test_description[0], test_description[1:]
else:
method = test_description
args = ()
if isinstance(method, str) and hasattr(obj, method):
getattr(obj, method)(*args)
elif isinstance(method, types.FunctionType):
method(*args)
else:
self.test_results.add_error(
obj.test_name,
'method {!r} ({}) not found for {}'.format(method, method.__class__.__name__, obj.__class__.__name__),
)
def call_tests(self, obj, test_descriptions):
for test_description in test_descriptions:
self.call_test(obj, test_description)
def search(self, search_query: SearchQuery) -> ResultContainer:
result_container = ResultContainer()
engineref_category = search_query.engineref_list[0].category
params = self.processor.get_params(search_query, engineref_category)
if params is not None:
counter_inc('engine', search_query.engineref_list[0].name, 'search', 'count', 'sent')
self.processor.search(search_query.query, params, result_container, default_timer(), 5)
return result_container
def get_result_container_tests(self, test_name: str, search_query: SearchQuery) -> ResultContainerTests:
result_container = self.search(search_query)
result_container_check = ResultContainerTests(self.test_results, test_name, search_query, result_container)
result_container_check.check_basic()
return result_container_check
def run_test(self, test_name):
test_parameters = self.tests[test_name]
search_query_list = list(Checker.search_query_matrix_iterator(self.engineref_list, test_parameters['matrix']))
rct_list = [self.get_result_container_tests(test_name, search_query) for search_query in search_query_list]
stop_test = False
if 'result_container' in test_parameters:
for rct in rct_list:
stop_test = stop_test or rct.stop_test
if not rct.stop_test:
self.call_tests(rct, test_parameters['result_container'])
if not stop_test:
if 'test' in test_parameters:
checker_tests = CheckerTests(self.test_results, test_name, rct_list)
self.call_tests(checker_tests, test_parameters['test'])
def run(self):
for test_name in self.tests:
self.run_test(test_name)
# clear cache
_download_and_check_if_image.cache_clear()
# force a garbage collector
gc.collect()
-36
View File
@@ -1,36 +0,0 @@
-- SPDX-License-Identifier: AGPL-3.0-or-later
--
-- This script is not a string in scheduler.py, so editors can provide syntax highlighting.
-- The Valkey KEY is defined here and not in Python on purpose:
-- only this LUA script can read and update this key to avoid lock and concurrency issues.
local valkey_key = 'SearXNG_checker_next_call_ts'
local now = redis.call('TIME')[1]
local start_after_from = ARGV[1]
local start_after_to = ARGV[2]
local every_from = ARGV[3]
local every_to = ARGV[4]
local next_call_ts = redis.call('GET', valkey_key)
if (next_call_ts == false or next_call_ts == nil) then
-- the scheduler has never run on this Valkey instance, so:
-- 1/ the scheduler does not run now
-- 2/ the next call is a random time between start_after_from and start_after_to
local initial_delay = math.random(start_after_from, start_after_to)
redis.call('SET', valkey_key, now + initial_delay)
return { false, initial_delay }
end
-- next_call_ts is defined
-- --> if now is lower than next_call_ts then we don't run the embedded checker
-- --> if now is higher then we update next_call_ts and ask to run the embedded checker now.
local call_now = next_call_ts <= now
if call_now then
-- the checker runs now, define the timestamp of the next call:
-- this is a random delay between every_from and every_to
local periodic_delay = math.random(every_from, every_to)
next_call_ts = redis.call('INCRBY', valkey_key, periodic_delay)
end
return { call_now, next_call_ts - now }
-58
View File
@@ -1,58 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring
"""Lame scheduler which use Valkey as a source of truth:
* the Valkey key SearXNG_checker_next_call_ts contains the next time the embedded checker should run.
* to avoid lock, a unique Valkey script reads and updates the Valkey key SearXNG_checker_next_call_ts.
* this Valkey script returns a list of two elements:
* the first one is a boolean. If True, the embedded checker must run now in this worker.
* the second element is the delay in second to wait before the next call to the Valkey script.
This scheduler is deliberately not generic: if more features are required, a dedicated scheduler must be used
(= a better scheduler should not run inside the web workers)
"""
import logging
import time
from pathlib import Path
from typing import Callable
from searx.valkeydb import client as get_valkey_client
from searx.valkeylib import lua_script_storage
logger = logging.getLogger('searx.search.checker')
SCHEDULER_LUA = Path(__file__).parent / "scheduler.lua"
def scheduler_function(start_after_from: int, start_after_to: int, every_from: int, every_to: int, callback: Callable):
"""Run the checker periodically. The function never returns.
Parameters:
* start_after_from and start_after_to: when to call "callback" for the first on the Valkey instance
* every_from and every_to: after the first call, how often to call "callback"
There is no issue:
* to call this function is multiple workers
* to kill workers at any time as long there is one at least one worker
"""
scheduler_now_script = SCHEDULER_LUA.open().read()
while True:
# ask the Valkey script what to do
# the script says
# * if the checker must run now.
# * how to long to way before calling the script again (it can be call earlier, but not later).
script = lua_script_storage(get_valkey_client(), scheduler_now_script)
call_now, wait_time = script(args=[start_after_from, start_after_to, every_from, every_to])
# does the worker run the checker now?
if call_now:
# run the checker
try:
callback()
except Exception: # pylint: disable=broad-except
logger.exception("Error calling the embedded checker")
# only worker display the wait_time
logger.info("Next call to the checker in %s seconds", wait_time)
# wait until the next call
time.sleep(wait_time)
-79
View File
@@ -277,53 +277,6 @@ plugins:
# '(.*\.)?youtu\.be$': 'yt.example.com'
#
checker:
# disable checker when in debug mode
off_when_debug: true
# use "scheduling: {}" to disable scheduling
# scheduling: interval or int
# to activate the scheduler:
# * uncomment "scheduling" section
# * add "cache2 = name=searxngcache,items=2000,blocks=2000,blocksize=4096,bitmap=1"
# to your uwsgi.ini
# scheduling:
# start_after: [300, 1800] # delay to start the first run of the checker
# every: [86400, 90000] # how often the checker runs
# additional tests: only for the YAML anchors (see the engines section)
#
additional_tests:
rosebud: &test_rosebud
matrix:
query: rosebud
lang: en
result_container:
- not_empty
- ['one_title_contains', 'citizen kane']
test:
- unique_results
android: &test_android
matrix:
query: ['android']
lang: ['en', 'de', 'fr', 'zh-CN']
result_container:
- not_empty
- ['one_title_contains', 'google']
test:
- unique_results
# tests: only for the YAML anchors (see the engines section)
tests:
infobox: &tests_infobox
infobox:
matrix:
query: ["linux", "new york", "bbc"]
result_container:
- has_infobox
categories_as_tabs:
general:
@@ -746,7 +699,6 @@ engines:
shortcut: ddd
weight: 2
disabled: true
tests: *tests_infobox
# cloudflare protected
# - name: digbt
@@ -820,7 +772,6 @@ engines:
weight: 2
# add "list" to the array to get results in the results list
display_type: ["infobox"]
tests: *tests_infobox
categories: [general]
- name: duckduckgo
@@ -1053,32 +1004,18 @@ engines:
- name: google
engine: google
shortcut: go
# additional_tests:
# android: *test_android
- name: google images
engine: google_images
shortcut: goi
# additional_tests:
# android: *test_android
# dali:
# matrix:
# query: ['Dali Christ']
# lang: ['en', 'de', 'fr', 'zh-CN']
# result_container:
# - ['one_title_contains', 'Salvador']
- name: google news
engine: google_news
shortcut: gon
# additional_tests:
# android: *test_android
- name: google videos
engine: google_videos
shortcut: gov
# additional_tests:
# android: *test_android
- name: google scholar
engine: google_scholar
@@ -1784,8 +1721,6 @@ engines:
shortcut: qw
categories: [general, web]
disabled: true
additional_tests:
rosebud: *test_rosebud
- name: qwant news
qwant_categ: news
@@ -2021,8 +1956,6 @@ engines:
shortcut: sp
startpage_categ: web
categories: [general, web]
additional_tests:
rosebud: *test_rosebud
- name: startpage news
engine: startpage
@@ -2245,8 +2178,6 @@ engines:
base_url: "https://{language}.wikiquote.org/"
search_type: text
disabled: true
additional_tests:
rosebud: *test_rosebud
about:
website: https://www.wikiquote.org/
wikidata_id: Q369
@@ -2273,16 +2204,6 @@ engines:
about:
website: https://species.wikimedia.org/
wikidata_id: Q13679
tests:
wikispecies:
matrix:
query: "Campbell, L.I. et al. 2011: MicroRNAs"
lang: en
result_container:
- not_empty
- ['one_title_contains', 'Tardigrada']
test:
- unique_results
- name: wiktionary
engine: mediawiki
-4
View File
@@ -267,10 +267,6 @@ SCHEMA: dict[str, t.Any] = {
'networks': {},
},
'plugins': SettingsValue(dict, {}),
'checker': {
'off_when_debug': SettingsValue(bool, True, None),
'scheduling': SettingsValue((None, dict), None, None),
},
'categories_as_tabs': SettingsValue(dict, CATEGORIES_AS_TABS),
'engines': SettingsValue(list, []),
'doi_resolvers': {},
-4
View File
@@ -54,10 +54,6 @@ or manually by executing the searx/webapp.py file? -->
{{' '}}* Function: `{{ error.function }}`
{{' '}}* Code: `{{ error.code }}`
{{'\n'-}}
{%- endfor -%}
{%- for test_name, results in engine_reliability.checker.items() -%}
{%- if loop.first %}Checker{% endif -%}
{{-'\n '}}* {{ test_name }}: {% for result in results%}`{{ result }}`,{% endfor -%}
{%- endfor -%}
</textarea>
<input type="checkbox" id="step1">
+2 -6
View File
@@ -64,7 +64,7 @@
{%- if search_engine.enable_http -%}
<p>{{- icon_big('exclamation-sign', 'No HTTPS') -}}{{- _('No HTTPS')-}}</p>
{% endif -%}
{%- if reliabilities.get(search_engine.name, {}).errors or reliabilities.get(search_engine.name, {}).checker -%}
{%- if reliabilities.get(search_engine.name, {}).errors -%}
<a href="{{ url_for('stats', engine=search_engine.name|e) }}" {{- ' ' -}}
title="{{ _('View error logs and submit a bug report') }}">
{{- _('View error logs and submit a bug report') -}}
@@ -109,7 +109,6 @@
{%- macro engine_reliability(engine_name) -%}
{%- set r = reliabilities.get(engine_name, {}).get('reliability', None) -%}
{%- set checker_result = reliabilities.get(engine_name, {}).get('checker', []) -%}
{%- set errors = reliabilities.get(engine_name, {}).get('errors', []) -%}
{%- if r != None -%}
{%- if r <= 50 -%}
@@ -124,7 +123,7 @@
{% else %}
{%- set r = '' -%}
{%- endif -%}
{%- if checker_result or errors -%}
{%- if errors -%}
<td class="{{ label }} column-reliability">{{- '' -}}
<a href="{{ url_for('stats', engine=engine_name|e) }}">{{- '' -}}
<span>
@@ -132,9 +131,6 @@
</span>{{- '' -}}
</a>{{- '' -}}
<div class="engine-tooltip" role="tooltip" id="{{engine_name}}_reliability">
{%- if checker_result -%}
<p>{{ _("Failed checker test(s): ") }} {{ ', '.join(checker_result) }}</p>
{%- endif -%}
{%- if errors -%}<p>{{ _('Errors:') }}</p>{%- endif -%}
{%- for error in errors -%}
<p>{{ error }}</p>{{- '' -}}
-17
View File
@@ -124,23 +124,6 @@
{% endif %}
{% endfor %}
{% endfor %}
{% if engine_reliabilities[selected_engine_name].checker %}
<h3>{{ _('Checker') }}</h3>
<table>
<tr>
<th scope="col" class="failed-test">{{ _('Failed test') }}</th>
<th scope="col">{{ _('Comment(s)') }}</th>
</tr>
{% for test_name, results in engine_reliabilities[selected_engine_name].checker.items() %}
<tr>
<td>{{ test_name }}</td>
<td>
{% for r in results %}<p>{{ r }}</p>{% endfor %}
</td>
</tr>
{% endfor %}
</table>
{% endif %}
{{ new_issue(selected_engine_name, engine_reliabilities[selected_engine_name]) }}
</div>
{% endif %}
+3 -42
View File
@@ -118,7 +118,6 @@ from searx.valkeydb import initialize as valkey_initialize
from searx.sxng_locales import sxng_locales
import searx.search
from searx.network import stream as http_stream, set_context_network_name
from searx.search.checker import get_result as checker_get_result
logger = logger.getChild('webapp')
@@ -929,23 +928,11 @@ def preferences():
# reliabilities
reliabilities = {}
engine_errors = get_engine_errors(filtered_engines)
checker_results = checker_get_result()
checker_results = (
checker_results['engines'] if checker_results['status'] == 'ok' and 'engines' in checker_results else {}
)
for _, e in filtered_engines.items():
checker_result = checker_results.get(e.name, {})
checker_success = checker_result.get('success', True)
errors = engine_errors.get(e.name) or []
if counter('engine', e.name, 'search', 'count', 'sent') == 0:
# no request
reliability = None
elif checker_success and not errors:
reliability = 100
elif 'simple' in checker_result.get('errors', {}):
# the basic (simple) test doesn't work: the engine is broken according to the checker
# even if there is no exception
reliability = 0
else:
# pylint: disable=consider-using-generator
reliability = 100 - sum([error['percentage'] for error in errors if not error.get('secondary')])
@@ -953,10 +940,7 @@ def preferences():
reliabilities[e.name] = {
'reliability': reliability,
'errors': [],
'checker': checker_results.get(e.name, {}).get('errors', {}).keys(),
}
# keep the order of the list checker_results[e.name]['errors'] and deduplicate.
# the first element has the highest percentage rate.
reliabilities_errors = []
for error in errors:
error_user_text = None
@@ -977,13 +961,6 @@ def preferences():
)
safesearch = e.safesearch
time_range_support = e.time_range_support
for checker_test_name in checker_results.get(e.name, {}).get('errors', {}):
if supports_selected_language and checker_test_name.startswith('lang_'):
supports_selected_language = '?'
elif safesearch and checker_test_name == 'safesearch':
safesearch = '?'
elif time_range_support and checker_test_name == 'time_range':
time_range_support = '?'
supports[e.name] = {
'supports_selected_language': supports_selected_language,
'safesearch': safesearch,
@@ -1133,13 +1110,8 @@ def stats():
else:
filtered_engines = [selected_engine_name]
checker_results = checker_get_result()
checker_results = (
checker_results['engines'] if checker_results['status'] == 'ok' and 'engines' in checker_results else {}
)
engine_stats = get_engines_stats(filtered_engines)
engine_reliabilities = get_reliabilities(filtered_engines, checker_results)
engine_reliabilities = get_reliabilities(filtered_engines)
if sort_order not in STATS_SORT_PARAMETERS:
sort_order = 'name'
@@ -1194,12 +1166,6 @@ def stats_errors():
return jsonify(result)
@app.route('/stats/checker', methods=['GET'])
def stats_checker():
result = checker_get_result()
return jsonify(result)
@app.route('/metrics')
def stats_open_metrics():
password = settings['general'].get("open_metrics")
@@ -1212,13 +1178,8 @@ def stats_open_metrics():
filtered_engines = dict(filter(lambda kv: sxng_request.preferences.validate_token(kv[1]), engines.items()))
checker_results = checker_get_result()
checker_results = (
checker_results['engines'] if checker_results['status'] == 'ok' and 'engines' in checker_results else {}
)
engine_stats = get_engines_stats(filtered_engines)
engine_reliabilities = get_reliabilities(filtered_engines, checker_results)
engine_reliabilities = get_reliabilities(filtered_engines)
metrics_text = openmetrics(engine_stats, engine_reliabilities)
return Response(metrics_text, mimetype='text/plain')
@@ -1394,7 +1355,7 @@ def init():
searx.plugins.initialize(app)
metrics: bool = get_setting("general.enable_metrics") # type: ignore
searx.search.initialize(enable_checker=True, check_network=True, enable_metrics=metrics)
searx.search.initialize(check_network=True, enable_metrics=metrics)
limiter.initialize(app, settings)
favicons.init()
+1 -2
View File
@@ -39,7 +39,7 @@ setup(
],
project_urls={"Code": GIT_URL, "Issue tracker": get_setting('brand.issue_url')},
entry_points={
'console_scripts': ['searxng-run = searx.webapp:run', 'searxng-checker = searx.search.checker.__main__:main']
'console_scripts': ['searxng-run = searx.webapp:run']
},
packages=find_packages(
include=[
@@ -54,7 +54,6 @@ setup(
'settings.yml',
'*.toml',
'*.msg',
'search/checker/scheduler.lua',
'data/*.json',
'data/*.txt',
'data/*.ftz',
-1
View File
@@ -82,7 +82,6 @@ class SearxTestCase(aiounittest.AsyncTestCase):
# - initialize searx.network, searx.metrics, searx.processors and searx.search.checker
searx.search.initialize(
enable_checker=True,
check_network=True,
enable_metrics=searx.get_setting("general.enable_metrics"), # type: ignore
)