Files
searxng/searx/engines/ahmia.py
T
Markus Heiser dd27fce3b7 [unbload] drop meaningless field `number_of_results_xpath` from results (#6130)
In the result-list, the ``number_of_results`` indicate the number of hits in the
Index, they do not indicate how many results are in the answer.

In the past, search engines such as google or ddg had an indication on the first
page of a search term of how many hits there were for this term in total in
their index.

This info was added up in SearXNG and delivered under ``number_of_results``.
Nowadays the search engines no longer indicate how many hits there are in the
index and so this field in SearXNG is also superfluous.

- https://github.com/searxng/searxng/issues/2457#issuecomment-2566181574
- https://github.com/searxng/searxng/issues/2987
- https://github.com/searxng/searxng/issues/5034

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-05-25 12:43:02 +02:00

114 lines
3.8 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Ahmia (Onions)
"""
import typing as t
from urllib.parse import urlencode, urlparse, parse_qs
from lxml.html import fromstring
from searx.utils import gen_useragent, ElementType
from searx.engines.xpath import extract_url, extract_text, eval_xpath_list, eval_xpath
from searx.network import get
from searx.enginelib import EngineCache
# about
about = {
"website": 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion',
"wikidata_id": 'Q18693938',
"official_api_documentation": None,
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
# engine config
categories = ['onions']
paging = True
page_size = 10
# search url
base_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion'
search_url = 'http://juhanurmihxlp77nkq76byazcldy2hlmovfu2epvl5ankdibsot4csyd.onion/search/?{query}'
time_range_support = True
time_range_dict = {'day': 1, 'week': 7, 'month': 30}
# xpaths
results_xpath = '//li[@class="result"]'
url_xpath = './h4/a/@href'
title_xpath = './h4/a[1]'
content_xpath = './/p[1]'
correction_xpath = '//*[@id="didYouMean"]//a'
name_token_xpath = '//form[@id="searchForm"]/input[@type="hidden"]/@name'
value_token_xpath = '//form[@id="searchForm"]/input[@type="hidden"]/@value'
CACHE: EngineCache
def setup(engine_settings: dict[str, t.Any]) -> bool:
global CACHE # pylint: disable=global-statement
CACHE = EngineCache(engine_settings["name"])
return True
def _get_tokens(dom: ElementType | None = None) -> str:
"""
The tokens are hidden in a hidden input field.
They update every minute, but allow up to 1 hour old tokens to be used.
To spend the least amount of requests, it is best to always get the newest
tokens from each request. In worst case if it has expired, it would
need to do a total of 2 requests (over tor, might be ridiculously slow).
"""
if dom is None:
resp = get(base_url, headers={'User-Agent': gen_useragent()})
dom = fromstring(resp.text)
name_token = extract_text(dom.xpath(name_token_xpath))
value_token = extract_text(dom.xpath(value_token_xpath))
return f"{name_token}:{value_token}"
def request(query, params):
token_str: str | None = CACHE.get('ahmia-tokens')
if not token_str:
token_str = _get_tokens()
CACHE.set('ahmia-tokens', token_str, expire=60 * 60)
name_token, value_token = token_str.split(":")
params['url'] = search_url.format(query=urlencode({'q': query, name_token: value_token}))
if params['time_range'] in time_range_dict:
params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})
return params
def response(resp):
results = []
dom = fromstring(resp.text)
# trim results so there's not way too many at once
first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
all_results = eval_xpath_list(dom, results_xpath)
trimmed_results = all_results[first_result_index : first_result_index + page_size]
# get results
for result in trimmed_results:
# remove ahmia url and extract the actual url for the result
raw_url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
title = extract_text(eval_xpath(result, title_xpath))
content = extract_text(eval_xpath(result, content_xpath))
results.append({'url': cleaned_url, 'title': title, 'content': content, 'is_onion': True})
# get spelling corrections
for correction in eval_xpath_list(dom, correction_xpath):
results.append({'correction': extract_text(correction)})
# Update the tokens to the newest ones
token_str = _get_tokens(dom)
CACHE.set('ahmia-tokens', token_str, expire=60 * 60)
return results