Files
searxng/searx/results.py
T
Markus Heiser dd27fce3b7 [unbload] drop meaningless field `number_of_results_xpath` from results (#6130)
In the result list, ``number_of_results`` indicates the number of hits in the
index; it does not indicate how many results are in the answer.

In the past, search engines such as google or ddg had an indication on the first
page of a search term of how many hits there were for this term in total in
their index.

This info was added up in SearXNG and delivered under ``number_of_results``.
Nowadays the search engines no longer indicate how many hits there are in the
index and so this field in SearXNG is also superfluous.

- https://github.com/searxng/searxng/issues/2457#issuecomment-2566181574
- https://github.com/searxng/searxng/issues/2987
- https://github.com/searxng/searxng/issues/5034

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-05-25 12:43:02 +02:00

357 lines
13 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
# pylint: disable=missing-module-docstring, missing-class-docstring
import typing as t
import warnings
from collections import defaultdict
from threading import RLock
from searx import logger as log
import searx.engines
from searx.metrics import histogram_observe, counter_add
from searx.result_types import Result, LegacyResult, MainResult
from searx.result_types.answer import AnswerSet, BaseAnswer
def calculate_score(
    result: MainResult | LegacyResult,
    priority: MainResult.PriorityType,
) -> float:
    """Compute the ranking score of a (merged) main result.

    The weight starts at ``1.0``, is multiplied by the ``weight`` of every
    engine listed in ``result['engines']`` (engines without a ``weight``
    attribute, or unknown to ``searx.engines.engines``, are skipped) and is
    finally multiplied by the number of entries in ``result['positions']``.

    :param result: result providing the ``engines`` and ``positions`` items.
    :param priority: ``'low'`` yields a score of ``0.0``; ``'high'`` adds the
        full weight once per position; any other value adds
        ``weight / position`` per position.
    :return: the score, always a ``float``.
    """
    weight = 1.0
    for engine_name in result['engines']:
        engine = searx.engines.engines.get(engine_name)
        # hasattr() is also False for ``None`` (engine not in the registry)
        if hasattr(engine, 'weight'):
            weight *= float(engine.weight)

    positions = result['positions']
    weight *= len(positions)

    # start from a float so that the return type matches the annotation even
    # if nothing is added below
    score = 0.0
    if priority == 'low':
        return score

    for position in positions:
        if priority == 'high':
            score += weight
        else:
            score += weight / position
    return score
class Timing(t.NamedTuple):
    """Timing record of one engine, as appended to ``ResultContainer.timings``
    by ``ResultContainer.add_timing``."""

    # name of the engine the record belongs to
    engine: str
    # filled from the ``engine_time`` argument of ``ResultContainer.add_timing``
    total: float
    # filled from the ``page_load_time`` argument of ``ResultContainer.add_timing``
    load: float
class UnresponsiveEngine(t.NamedTuple):
    """Entry of ``ResultContainer.unresponsive_engines``, created by
    ``ResultContainer.add_unresponsive_engine``."""

    # name of the engine that has been reported
    engine: str
    # error type as given by the caller of ``add_unresponsive_engine``
    error_type: str
    # ``suspended`` flag as given by the caller of ``add_unresponsive_engine``
    suspended: bool
class ResultContainer:
    """In the result container, the results are collected, sorted and duplicates
    will be merged."""

    # pylint: disable=too-many-statements

    main_results_map: dict[int, MainResult | LegacyResult]
    infoboxes: list[LegacyResult]
    suggestions: set[str]
    answers: AnswerSet
    corrections: set[str]

    def __init__(self):
        self.main_results_map = {}
        self.infoboxes = []
        self.suggestions = set()
        self.answers = AnswerSet()
        self.corrections = set()

        self.engine_data: dict[str, dict[str, str]] = defaultdict(dict)
        self._closed: bool = False
        self.paging: bool = False
        self.unresponsive_engines: set[UnresponsiveEngine] = set()
        self.timings: list[Timing] = []
        self.redirect_url: str | None = None
        # Filter hook: a result is only added if the callable returns a true
        # value for it; the default accepts every result.
        self.on_result: t.Callable[[Result | LegacyResult], bool] = lambda _: True
        self._lock: RLock = RLock()
        # Cache of get_ordered_results(); ``None`` means "not computed yet".
        self._main_results_sorted: list[MainResult | LegacyResult] | None = None

    def extend(
        self, engine_name: str | None, results: list[Result | LegacyResult]
    ):  # pylint: disable=too-many-branches
        """Add the ``results`` of the engine ``engine_name`` to the container.

        Typed results (:py:obj:`Result`) are dispatched by their class, all
        other items are converted into a :py:obj:`LegacyResult` and dispatched
        by the keys they contain (``suggestion``, ``answer``, ``correction``,
        ``infobox``, ``engine_data``; everything else is a main result).
        Results rejected by ``self.on_result`` are dropped.  Once the container
        is closed, the call is ignored.

        :raises NotImplementedError: for a typed result that is neither an
            answer nor a main result.
        """
        if self._closed:
            log.debug("container is closed, ignoring results: %s", results)
            return

        # number of main results added by this call, also used as the
        # (1-based) position of a main result in the engine's response
        main_count = 0

        # iterate over a snapshot of the list
        for result in list(results):

            if isinstance(result, Result):
                result.engine = result.engine or engine_name
                result.normalize_result_fields()
                if not self.on_result(result):
                    continue

                if isinstance(result, BaseAnswer):
                    self.answers.add(result)
                elif isinstance(result, MainResult):
                    main_count += 1
                    self._merge_main_result(result, main_count)
                else:
                    # more types need to be implemented in the future ..
                    raise NotImplementedError(f"no handler implemented to process the result of type {result}")

            else:
                result["engine"] = result.get("engine") or engine_name or ""
                result = LegacyResult(result)  # for backward compatibility, will be removed one day
                result.normalize_result_fields()

                if "suggestion" in result:
                    if self.on_result(result):
                        self.suggestions.add(result["suggestion"])
                    continue

                if "answer" in result:
                    if self.on_result(result):
                        warnings.warn(
                            f"answer results from engine {result.engine}"
                            " are without typification / migrate to Answer class.",
                            DeprecationWarning,
                        )
                        self.answers.add(result)  # type: ignore
                    continue

                if "correction" in result:
                    if self.on_result(result):
                        self.corrections.add(result["correction"])
                    continue

                if "infobox" in result:
                    if self.on_result(result):
                        self._merge_infobox(result)
                    continue

                if "engine_data" in result:
                    if self.on_result(result):
                        if result.engine:
                            self.engine_data[result.engine][result["key"]] = result["engine_data"]
                    continue

                # none of the special keys matched: it is a main result
                if self.on_result(result):
                    main_count += 1
                    self._merge_main_result(result, main_count)

        if engine_name in searx.engines.engines:
            eng = searx.engines.engines[engine_name]
            histogram_observe(main_count, "engine", eng.name, "result", "count")
            if not self.paging and eng.paging:
                self.paging = True

    def _merge_infobox(self, new_infobox: LegacyResult):
        """Merge ``new_infobox`` into every stored infobox with the same ``id``
        or append it when there is none (or when it has no ``id``)."""
        new_id = getattr(new_infobox, "id", None)

        # The search for a duplicate and the append form one critical section,
        # otherwise two threads could both miss each other's infobox and both
        # append an infobox with the same id.
        with self._lock:
            add_infobox = True
            if new_id is not None:
                for existing_infobox in self.infoboxes:
                    if new_id == getattr(existing_infobox, "id", None):
                        merge_two_infoboxes(existing_infobox, new_infobox)
                        add_infobox = False
            if add_infobox:
                self.infoboxes.append(new_infobox)

    def _merge_main_result(self, result: MainResult | LegacyResult, position: int):
        """Register ``result`` under its hash or, if a result with the same
        hash is already stored, merge it into the stored one.  In both cases
        ``position`` is recorded in the ``positions`` of the stored result."""
        result_hash = hash(result)

        with self._lock:
            merged = self.main_results_map.get(result_hash)
            if merged is None:
                # if there is no duplicate in the merged results, append result
                result.positions = [position]
                self.main_results_map[result_hash] = result
                return

            merge_two_main_results(merged, result)
            # add the new position
            merged.positions.append(position)

    def close(self):
        """Mark the container as closed and score all main results.  The score
        of a result is also added to the ``score`` counter of each of its
        engines."""
        self._closed = True

        for result in self.main_results_map.values():
            result.score = calculate_score(result, result.priority)
            for eng_name in result.engines:
                counter_add(result.score, 'engine', eng_name, 'score')

    def get_ordered_results(self) -> list[MainResult | LegacyResult]:
        """Returns a sorted list of results to be displayed in the main result
        area (:ref:`result types`).

        The container is closed first if it is not closed already.  The list
        is computed once and cached, an empty list included."""

        if not self._closed:
            self.close()

        if self._main_results_sorted is not None:
            return self._main_results_sorted

        # first pass, sort results by "score" (descending)
        results = sorted(self.main_results_map.values(), key=lambda x: x.score, reverse=True)

        # pass 2 : group results by category and template
        gresults: list[MainResult | LegacyResult] = []
        # per group: the index in gresults at which the next member of the
        # group is inserted and the number of members the group still accepts
        category_positions: dict[str, t.Any] = {}
        max_count = 8
        max_distance = 20

        for res in results:
            # do we need to handle more than one category per engine?
            engine = searx.engines.engines.get(res.engine or "")
            if engine:
                res.category = engine.categories[0] if len(engine.categories) > 0 else ""

            # do we need to handle more than one category per engine?
            category = f"{res.category}:{res.template}:{'img_src' if (res.thumbnail or res.img_src) else ''}"
            grp = category_positions.get(category)

            # group with previous results using the same category, if the group
            # can accept more result and is not too far from the current
            # position
            if (grp is not None) and (grp["count"] > 0) and (len(gresults) - grp["index"] < max_distance):
                # group with the previous results using the same category with
                # this one
                index = grp["index"]
                gresults.insert(index, res)

                # update every index after the current one (including the
                # current one)
                for item in category_positions.values():
                    v = item["index"]
                    if v >= index:
                        item["index"] = v + 1

                # update this category
                grp["count"] -= 1

            else:
                gresults.append(res)
                # (re)start the group right behind the appended result
                category_positions[category] = {"index": len(gresults), "count": max_count}

        self._main_results_sorted = gresults
        return self._main_results_sorted

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        """Record ``engine_name`` as unresponsive, provided the engine has
        ``display_error_messages`` set.  Logs an error and does nothing when
        the container is already closed."""
        with self._lock:
            if self._closed:
                log.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
                return
            if searx.engines.engines[engine_name].display_error_messages:
                self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        """Append a :py:obj:`Timing` record for ``engine_name``.  Logs an error
        and does nothing when the container is already closed."""
        with self._lock:
            if self._closed:
                log.error("call to ResultContainer.add_timing after ResultContainer.close")
                return
            self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self) -> list[Timing]:
        """Return the collected timings.  Before the container is closed an
        error is logged and an empty list is returned."""
        with self._lock:
            if not self._closed:
                log.error("call to ResultContainer.get_timings before ResultContainer.close")
                return []
            return self.timings
def merge_two_infoboxes(origin: LegacyResult, other: LegacyResult):
    """Fold the infobox ``other`` into the infobox ``origin`` (in place).

    - ``engine`` / ``img_src``: the value of ``other`` replaces the one of
      ``origin`` if the engine of ``other`` has the greater ``weight`` (an
      engine without ``weight`` counts as ``1``); a missing ``img_src`` is
      always filled in.
    - ``engines``: union of both sets.
    - ``urls``: URLs of ``other`` are appended unless ``origin`` already has an
      entry with the same ``entity`` or the same ``url``.
    - ``attributes``: attributes of ``other`` are appended unless their
      ``label`` or ``entity`` is already used by an attribute of ``origin``.
    - ``content``: the longer text is kept.
    """
    # pylint: disable=too-many-branches
    registry = searx.engines.engines
    origin_weight = getattr(registry[origin.engine], "weight", 1)
    other_weight = getattr(registry[other.engine], "weight", 1)
    other_outweighs = other_weight > origin_weight

    if other_outweighs:
        origin.engine = other.engine

    origin.engines |= other.engines

    if other.urls:
        merged_urls = origin.get("urls", [])
        for candidate in other.urls:
            candidate_entity = candidate.get("entity")
            already_known = any(
                (candidate_entity is not None and candidate_entity == known.get("entity"))
                or known.get("url") == candidate.get("url")
                for known in origin.get("urls", [])
            )
            if not already_known:
                merged_urls.append(candidate)
        origin.urls = merged_urls

    if other.img_src and (not origin.img_src or other_outweighs):
        origin.img_src = other.img_src

    if other.attributes:
        if origin.attributes:
            # labels and entities that are already taken in origin
            taken: set[str] = set()
            for attribute in origin.attributes:
                for key in ("label", "entity"):
                    value = attribute.get(key)
                    if value:
                        taken.add(value)

            for attribute in other.attributes:
                if attribute.get("label") in taken or attribute.get('entity') in taken:
                    continue
                origin.attributes.append(attribute)
        else:
            origin.attributes = other.attributes

    if other.content and (not origin.content or len(other.content) > len(origin.content)):
        origin.content = other.content
def merge_two_main_results(origin: MainResult | LegacyResult, other: MainResult | LegacyResult):
    """Fold the main result ``other`` into the main result ``origin`` (in
    place): keep the longer ``content`` and ``title``, take over fields missing
    in ``origin``, register the engine of ``other`` and upgrade the URL scheme
    of ``origin`` if ``other`` offers a scheme ending in ``s``."""

    # prefer the text with more characters
    if len(origin.content or "") < len(other.content or ""):
        origin.content = other.content
    if len(origin.title or "") < len(other.title or ""):
        origin.title = other.title

    # fields not set in origin are taken from other, but only if both
    # results are of the same kind
    both_typed = isinstance(other, MainResult) and isinstance(origin, MainResult)
    if both_typed or (isinstance(other, LegacyResult) and isinstance(origin, LegacyResult)):
        origin.defaults_from(other)

    # remember that the engine of other delivered this result as well
    origin.engines.add(other.engine or "")

    # use https, ftps, .. if possible
    if not origin.parsed_url or origin.parsed_url.scheme.endswith("s"):
        return
    if other.parsed_url and other.parsed_url.scheme.endswith("s"):
        origin.parsed_url = origin.parsed_url._replace(scheme=other.parsed_url.scheme)
        origin.url = origin.parsed_url.geturl()