mirror of
https://github.com/searxng/searxng.git
synced 2026-05-26 21:00:13 +02:00
dd27fce3b7
In the result-list, the ``number_of_results`` indicate the number of hits in the Index, they do not indicate how many results are in the answer. In the past, search engines such as google or ddg had an indication on the first page of a search term of how many hits there were for this term in total in their index. This info was added up in SearXNG and delivered under ``number_of_results``. Nowadays the search engines no longer indicate how many hits there are in the index and so this field in SearXNG is also superfluous. - https://github.com/searxng/searxng/issues/2457#issuecomment-2566181574 - https://github.com/searxng/searxng/issues/2987 - https://github.com/searxng/searxng/issues/5034 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
357 lines
13 KiB
Python
357 lines
13 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
# pylint: disable=missing-module-docstring, missing-class-docstring
|
|
|
|
import typing as t
|
|
|
|
import warnings
|
|
from collections import defaultdict
|
|
from threading import RLock
|
|
|
|
from searx import logger as log
|
|
import searx.engines
|
|
from searx.metrics import histogram_observe, counter_add
|
|
from searx.result_types import Result, LegacyResult, MainResult
|
|
from searx.result_types.answer import AnswerSet, BaseAnswer
|
|
|
|
|
|
def calculate_score(
    result: MainResult | LegacyResult,
    priority: MainResult.PriorityType,
) -> float:
    """Compute the ranking score of a (merged) main result.

    The score is built from two inputs read from ``result``:

    - ``result['engines']``: for every engine registered in
      ``searx.engines.engines`` that has a ``weight`` attribute, the weight is
      multiplied into a common factor (starting at ``1.0``).
    - ``result['positions']``: the factor is multiplied by the number of
      positions, then each position contributes to the score.

    :param result: The result to score; must support item access for the keys
        ``'engines'`` and ``'positions'``.
    :param priority: ``'low'`` yields a score of ``0.0``, ``'high'`` adds the
        full weight for every position, any other value adds
        ``weight / position`` for every position.
    :return: The score, always as a ``float``.
    """
    weight = 1.0

    for engine_name in result['engines']:
        engine = searx.engines.engines.get(engine_name)
        # unknown engines (``None``) and engines without a weight are neutral
        if hasattr(engine, 'weight'):
            weight *= float(engine.weight)

    positions = result['positions']
    weight *= len(positions)

    # low priority results never collect any score, no need to iterate
    if priority == 'low':
        return 0.0

    high_priority = priority == 'high'
    score = 0.0
    for position in positions:
        # explicit accumulation (instead of ``sum``) keeps the float rounding
        # of a plain running total
        score += weight if high_priority else weight / position

    return score
|
|
|
|
|
|
class Timing(t.NamedTuple):
    """Time measurement recorded for one engine."""

    # name of the engine the measurement belongs to
    engine: str

    # overall time spent by the engine
    total: float

    # share of the time that was spent loading the page
    load: float
|
|
|
|
|
|
class UnresponsiveEngine(t.NamedTuple):
    """Record of an engine that did not deliver results."""

    # name of the engine that failed
    engine: str

    # short identifier of the kind of error that occurred
    error_type: str

    # flag telling whether the engine is suspended
    suspended: bool
|
|
|
|
|
|
class ResultContainer:
    """In the result container, the results are collected, sorted and duplicates
    will be merged.

    Results are added with :py:obj:`ResultContainer.extend`.  Once the
    container is closed (:py:obj:`ResultContainer.close`), further results,
    timings and unresponsive-engine records are ignored and the ordered list
    of main results can be fetched with
    :py:obj:`ResultContainer.get_ordered_results`.
    """

    # pylint: disable=too-many-statements

    main_results_map: dict[int, MainResult | LegacyResult]
    infoboxes: list[LegacyResult]
    suggestions: set[str]
    answers: AnswerSet
    corrections: set[str]

    def __init__(self):
        self.main_results_map = {}
        self.infoboxes = []
        self.suggestions = set()
        self.answers = AnswerSet()
        self.corrections = set()

        self.engine_data: dict[str, dict[str, str]] = defaultdict(dict)
        self._closed: bool = False
        self.paging: bool = False
        self.unresponsive_engines: set[UnresponsiveEngine] = set()
        self.timings: list[Timing] = []
        self.redirect_url: str | None = None
        # filter hook: a result is only stored when the callable returns True
        self.on_result: t.Callable[[Result | LegacyResult], bool] = lambda _: True
        self._lock: RLock = RLock()
        # cache of get_ordered_results(); ``None`` means "not computed yet"
        self._main_results_sorted: list[MainResult | LegacyResult] | None = None

    def extend(
        self, engine_name: str | None, results: list[Result | LegacyResult]
    ):  # pylint: disable=too-many-branches
        """Add the ``results`` delivered by engine ``engine_name``.

        Typed results (:py:obj:`Result`) are dispatched by their class, all
        other items are treated as legacy dictionaries and dispatched by the
        keys they contain.  Every result has to pass ``self.on_result`` before
        it is stored.  If the container is already closed, the results are
        ignored.

        :raises NotImplementedError: for a typed result that is neither an
            answer nor a main result.
        """
        if self._closed:
            log.debug("container is closed, ignoring results: %s", results)
            return

        # number of main results accepted from this engine; also used as the
        # (1-based) position of a main result
        main_count = 0

        # iterate over a copy of the list handed in by the caller
        for result in list(results):

            if isinstance(result, Result):
                result.engine = result.engine or engine_name
                result.normalize_result_fields()
                if not self.on_result(result):
                    continue

                if isinstance(result, BaseAnswer):
                    self.answers.add(result)
                elif isinstance(result, MainResult):
                    main_count += 1
                    self._merge_main_result(result, main_count)
                else:
                    # more types need to be implemented in the future ..
                    raise NotImplementedError(f"no handler implemented to process the result of type {result}")
                continue

            # legacy (dict based) results
            result["engine"] = result.get("engine") or engine_name or ""
            result = LegacyResult(result)  # for backward compatibility, will be removed one day
            result.normalize_result_fields()

            # the first matching key decides the kind of the legacy result
            if "suggestion" in result:
                if self.on_result(result):
                    self.suggestions.add(result["suggestion"])

            elif "answer" in result:
                if self.on_result(result):
                    warnings.warn(
                        f"answer results from engine {result.engine}"
                        " are without typification / migrate to Answer class.",
                        DeprecationWarning,
                    )
                    self.answers.add(result)  # type: ignore

            elif "correction" in result:
                if self.on_result(result):
                    self.corrections.add(result["correction"])

            elif "infobox" in result:
                if self.on_result(result):
                    self._merge_infobox(result)

            elif "engine_data" in result:
                if self.on_result(result) and result.engine:
                    self.engine_data[result.engine][result["key"]] = result["engine_data"]

            elif self.on_result(result):
                main_count += 1
                self._merge_main_result(result, main_count)

        if engine_name in searx.engines.engines:
            eng = searx.engines.engines[engine_name]
            histogram_observe(main_count, "engine", eng.name, "result", "count")
            if not self.paging and eng.paging:
                self.paging = True

    def _merge_infobox(self, new_infobox: LegacyResult):
        """Merge ``new_infobox`` into every stored infobox with the same ``id``
        or append it when there is none (or when it has no ``id``).

        Lookup and append are done under one lock so that two threads adding
        infoboxes with the same ``id`` can not both decide to append.
        """
        new_id = getattr(new_infobox, "id", None)

        with self._lock:
            merged = False
            if new_id is not None:
                for existing_infobox in self.infoboxes:
                    if new_id == getattr(existing_infobox, "id", None):
                        merge_two_infoboxes(existing_infobox, new_infobox)
                        merged = True
            if not merged:
                self.infoboxes.append(new_infobox)

    def _merge_main_result(self, result: MainResult | LegacyResult, position: int):
        """Store ``result`` under its hash or, if a result with the same hash is
        already stored, merge it into the stored one and record ``position``
        as an additional position."""
        result_hash = hash(result)

        with self._lock:

            merged = self.main_results_map.get(result_hash)
            if not merged:
                # if there is no duplicate in the merged results, append result
                result.positions = [position]
                self.main_results_map[result_hash] = result
                return

            merge_two_main_results(merged, result)
            # add the new position
            merged.positions.append(position)

    def close(self):
        """Close the container and calculate the score of every main result.

        The score of a result is added to the ``score`` counter of each of its
        engines.  NOTE: there is no guard against repeated calls, each call
        adds the scores to the counters again.
        """
        self._closed = True

        for result in self.main_results_map.values():
            result.score = calculate_score(result, result.priority)
            for eng_name in result.engines:
                counter_add(result.score, 'engine', eng_name, 'score')

    def get_ordered_results(self) -> list[MainResult | LegacyResult]:
        """Returns a sorted list of results to be displayed in the main result
        area (:ref:`result types`).

        The container is closed if that has not happened yet.  The list is
        computed once and cached, later calls return the same list object.
        """

        if not self._closed:
            self.close()

        # compare with None: an empty list is a valid (cached) outcome
        if self._main_results_sorted is not None:
            return self._main_results_sorted

        # first pass, sort results by "score" (descending)
        results = sorted(self.main_results_map.values(), key=lambda x: x.score, reverse=True)

        # pass 2 : group results by category and template
        gresults: list[MainResult | LegacyResult] = []
        # per group: "index" is the position in gresults right behind the
        # group's last member, "count" is the number of results the group can
        # still accept
        category_positions: dict[str, dict[str, int]] = {}
        max_count = 8
        max_distance = 20

        for res in results:
            # do we need to handle more than one category per engine?
            engine = searx.engines.engines.get(res.engine or "")
            if engine:
                res.category = engine.categories[0] if engine.categories else ""

            # do we need to handle more than one category per engine?
            category = f"{res.category}:{res.template}:{'img_src' if (res.thumbnail or res.img_src) else ''}"
            grp = category_positions.get(category)

            # group with previous results using the same category, if the group
            # can accept more result and is not too far from the current
            # position
            if (grp is not None) and (grp["count"] > 0) and (len(gresults) - grp["index"] < max_distance):
                # group with the previous results using the same category with
                # this one
                index = grp["index"]
                gresults.insert(index, res)

                # update every index after the current one (including the
                # current one)
                for item in category_positions.values():
                    if item["index"] >= index:
                        item["index"] += 1

                # update this category
                grp["count"] -= 1

            else:
                gresults.append(res)
                # start a new group right behind the result just appended
                category_positions[category] = {"index": len(gresults), "count": max_count}

        self._main_results_sorted = gresults
        return self._main_results_sorted

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        """Record that engine ``engine_name`` failed with ``error_type``.

        The record is only stored for engines with ``display_error_messages``
        enabled.  Calls after :py:obj:`ResultContainer.close` are logged as an
        error and ignored.
        """
        with self._lock:
            if self._closed:
                log.error("call to ResultContainer.add_unresponsive_engine after ResultContainer.close")
                return
            if searx.engines.engines[engine_name].display_error_messages:
                self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        """Record the timing of engine ``engine_name``.  Calls after
        :py:obj:`ResultContainer.close` are logged as an error and ignored."""
        with self._lock:
            if self._closed:
                log.error("call to ResultContainer.add_timing after ResultContainer.close")
                return
            self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self) -> list[Timing]:
        """Return the recorded timings.  Before :py:obj:`ResultContainer.close`
        has been called, an error is logged and an empty list is returned."""
        with self._lock:
            if not self._closed:
                log.error("call to ResultContainer.get_timings before ResultContainer.close")
                return []
            return self.timings
|
|
|
|
|
|
def merge_two_infoboxes(origin: LegacyResult, other: LegacyResult):
    """Merges the values from ``other`` into ``origin``.

    - ``engine``: taken from ``other`` when its engine has the higher weight
      (an engine without a ``weight`` attribute counts as ``1``).
    - ``engines``: union of both.
    - ``urls``: URLs of ``other`` are appended unless an URL with the same
      ``entity`` or the same ``url`` is already in the list.
    - ``img_src``: taken from ``other`` when ``origin`` has none or when the
      engine of ``other`` has the higher weight.
    - ``attributes``: attributes of ``other`` are appended unless their
      ``label`` or ``entity`` is already used by an attribute of ``origin``.
    - ``content``: the longer text wins.

    Both engines are looked up by subscript in ``searx.engines.engines``;
    presumably this raises ``KeyError`` for an engine that is not registered.
    """
    # pylint: disable=too-many-branches
    weight1 = getattr(searx.engines.engines[origin.engine], "weight", 1)
    weight2 = getattr(searx.engines.engines[other.engine], "weight", 1)
    other_outweighs = weight2 > weight1

    if other_outweighs:
        origin.engine = other.engine

    origin.engines |= other.engines

    if other.urls:
        url_items = origin.get("urls", [])

        for url2 in other.urls:
            entity_url2 = url2.get("entity")
            link_url2 = url2.get("url")

            # Compare with the list that is being built: when origin has no
            # "urls" yet, the URLs appended so far have to be considered too,
            # otherwise duplicates inside of other.urls would slip through.
            is_duplicate = any(
                (entity_url2 is not None and entity_url2 == url1.get("entity")) or url1.get("url") == link_url2
                for url1 in url_items
            )
            if not is_duplicate:
                url_items.append(url2)

        origin.urls = url_items

    if other.img_src and (not origin.img_src or other_outweighs):
        origin.img_src = other.img_src

    if other.attributes:
        if not origin.attributes:
            origin.attributes = other.attributes
        else:
            # labels and entities already used by the attributes of origin
            known_names: set[str] = set()
            for attr in origin.attributes:
                known_names.update(name for name in (attr.get("label"), attr.get("entity")) if name)

            for attr in other.attributes:
                if attr.get("label") not in known_names and attr.get('entity') not in known_names:
                    origin.attributes.append(attr)

    if other.content and (not origin.content or len(other.content) > len(origin.content)):
        origin.content = other.content
|
|
|
|
|
|
def merge_two_main_results(origin: MainResult | LegacyResult, other: MainResult | LegacyResult):
    """Merges the values from ``other`` into ``origin``.

    ``content`` and ``title`` are replaced when ``other`` has the longer text,
    fields missing in ``origin`` are filled from ``other`` (only when both are
    of the same result kind), the engine of ``other`` is added to the engines
    of ``origin`` and the URL scheme of ``origin`` is replaced by the one of
    ``other`` when only the latter ends with an ``s``.
    """

    # prefer the text with more characters, first for content, then for title
    for field_name in ("content", "title"):
        if len(getattr(other, field_name) or "") > len(getattr(origin, field_name) or ""):
            setattr(origin, field_name, getattr(other, field_name))

    # merge all result's parameters not found in origin; typed and legacy
    # results are never mixed
    both_typed = isinstance(other, MainResult) and isinstance(origin, MainResult)
    if both_typed or (isinstance(other, LegacyResult) and isinstance(origin, LegacyResult)):
        origin.defaults_from(other)

    # add engine to list of result-engines
    origin.engines.add(other.engine or "")

    # use https, ftps, .. if possible
    if (
        origin.parsed_url
        and not origin.parsed_url.scheme.endswith("s")
        and other.parsed_url
        and other.parsed_url.scheme.endswith("s")
    ):
        origin.parsed_url = origin.parsed_url._replace(scheme=other.parsed_url.scheme)
        origin.url = origin.parsed_url.geturl()
|