mirror of
https://github.com/searxng/searxng.git
synced 2026-05-26 04:40:12 +02:00
efc305b7f9
[mod] normalize variable name for the max number of results per request In the past, we have used different names for the variable that specifies the maximum number of hits in the outgoing request. - ``page_size`` - ``number_of_results`` - ``nb_per_page`` Since *page_size* is the most accurate term and is also used in the XPath engines, all other engines are adjusted accordingly within this patch .. documentation adjusted accordingly. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
171 lines
4.9 KiB
Python
171 lines
4.9 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""The MediaWiki engine is a *generic* engine to **query** Wikimedia wikis by
|
|
the `MediaWiki Action API`_. For a `query action`_ all Wikimedia wikis have
|
|
endpoints that follow this pattern::
|
|
|
|
https://{base_url}/w/api.php?action=query&list=search&format=json
|
|
|
|
.. note::
|
|
|
|
In its actual state, this engine is implemented to parse JSON result
|
|
(`format=json`_) from a search query (`list=search`_). If you need other
|
|
``action`` and ``list`` types ask SearXNG developers to extend the
|
|
implementation according to your needs.
|
|
|
|
.. _MediaWiki Action API: https://www.mediawiki.org/wiki/API:Main_page
|
|
.. _query action: https://www.mediawiki.org/w/api.php?action=help&modules=query
|
|
.. _`list=search`: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bsearch
|
|
.. _`format=json`: https://www.mediawiki.org/w/api.php?action=help&modules=json
|
|
|
|
Configuration
|
|
=============
|
|
|
|
Request:
|
|
|
|
- :py:obj:`base_url`
|
|
- :py:obj:`search_type`
|
|
- :py:obj:`srenablerewrites`
|
|
- :py:obj:`srsort`
|
|
- :py:obj:`srprop`
|
|
|
|
Implementations
|
|
===============
|
|
|
|
"""
|
|
|
|
from datetime import datetime
|
|
from urllib.parse import urlencode, quote
|
|
|
|
from searx.utils import html_to_text
|
|
|
|
# about
|
|
about = {
|
|
"website": None,
|
|
"wikidata_id": None,
|
|
"official_api_documentation": 'https://www.mediawiki.org/w/api.php?action=help&modules=query',
|
|
"use_official_api": True,
|
|
"require_api_key": False,
|
|
"results": 'JSON',
|
|
}
|
|
|
|
# engine dependent config
|
|
categories = ['general']
|
|
paging = True
|
|
page_size = 5
|
|
|
|
search_type: str = 'nearmatch'
|
|
"""Which type of search to perform. One of the following values: ``nearmatch``,
|
|
``text`` or ``title``.
|
|
|
|
See ``srwhat`` argument in `list=search`_ documentation.
|
|
"""
|
|
|
|
srenablerewrites: bool = True
|
|
"""Enable internal query rewriting (Type: boolean). Some search backends can
|
|
rewrite the query into another which is thought to provide better results, for
|
|
instance by correcting spelling errors.
|
|
|
|
See ``srenablerewrites`` argument in `list=search`_ documentation.
|
|
"""
|
|
|
|
srsort: str = 'relevance'
|
|
"""Set the sort order of returned results. One of the following values:
|
|
``create_timestamp_asc``, ``create_timestamp_desc``, ``incoming_links_asc``,
|
|
``incoming_links_desc``, ``just_match``, ``last_edit_asc``, ``last_edit_desc``,
|
|
``none``, ``random``, ``relevance``, ``user_random``.
|
|
|
|
See ``srenablerewrites`` argument in `list=search`_ documentation.
|
|
"""
|
|
|
|
srprop: str = 'sectiontitle|snippet|timestamp|categorysnippet'
|
|
"""Which properties to return.
|
|
|
|
See ``srprop`` argument in `list=search`_ documentation.
|
|
"""
|
|
|
|
base_url: str = 'https://{language}.wikipedia.org/'
|
|
"""Base URL of the Wikimedia wiki.
|
|
|
|
``{language}``:
|
|
ISO 639-1 language code (en, de, fr ..) of the search language.
|
|
"""
|
|
|
|
api_path: str = 'w/api.php'
|
|
"""The path the PHP api is listening on.
|
|
|
|
The default path should work fine usually.
|
|
"""
|
|
|
|
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
|
|
"""The longhand version of MediaWiki time strings."""
|
|
|
|
|
|
def request(query, params):
|
|
|
|
# write search-language back to params, required in response
|
|
|
|
if params['language'] == 'all':
|
|
params['language'] = 'en'
|
|
else:
|
|
params['language'] = params['language'].split('-')[0]
|
|
|
|
api_url = f"{base_url.rstrip('/')}/{api_path}?".format(language=params['language'])
|
|
offset = (params['pageno'] - 1) * page_size
|
|
|
|
args = {
|
|
'action': 'query',
|
|
'list': 'search',
|
|
'format': 'json',
|
|
'srsearch': query,
|
|
'sroffset': offset,
|
|
'srlimit': page_size,
|
|
'srwhat': search_type,
|
|
'srprop': srprop,
|
|
'srsort': srsort,
|
|
}
|
|
if srenablerewrites:
|
|
args['srenablerewrites'] = '1'
|
|
|
|
params['url'] = api_url + urlencode(args)
|
|
return params
|
|
|
|
|
|
# get response from search-request
|
|
def response(resp):
|
|
|
|
results = []
|
|
search_results = resp.json()
|
|
|
|
# return empty array if there are no results
|
|
if not search_results.get('query', {}).get('search'):
|
|
return []
|
|
|
|
for result in search_results['query']['search']:
|
|
|
|
if result.get('snippet', '').startswith('#REDIRECT'):
|
|
continue
|
|
|
|
title = result['title']
|
|
sectiontitle = result.get('sectiontitle')
|
|
content = html_to_text(result.get('snippet', ''))
|
|
metadata = html_to_text(result.get('categorysnippet', ''))
|
|
timestamp = result.get('timestamp')
|
|
|
|
url = (
|
|
base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(title.replace(' ', '_').encode())
|
|
)
|
|
if sectiontitle:
|
|
# in case of sectiontitle create a link to the section in the wiki page
|
|
url += '#' + quote(sectiontitle.replace(' ', '_').encode())
|
|
title += ' / ' + sectiontitle
|
|
|
|
item = {'url': url, 'title': title, 'content': content, 'metadata': metadata}
|
|
|
|
if timestamp:
|
|
item['publishedDate'] = datetime.strptime(timestamp, timestamp_format)
|
|
|
|
results.append(item)
|
|
|
|
# return results
|
|
return results
|