Files
searxng/searx/engines/wikicommons.py
T
Markus Heiser efc305b7f9 [mod] normalize variable name for the max number of results per request (#6131)
[mod] normalize variable name for the max number of results per request

In the past, we have used different names for the variable that specifies the
maximum number of hits in the outgoing request.

- ``page_size``
- ``number_of_results``
- ``nb_per_page``

Since *page_size* is the most accurate term and is also used in the XPath
engines, all other engines are adjusted accordingly within this
patch .. documentation adjusted accordingly.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-05-25 12:41:31 +02:00

209 lines
6.1 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""`Wikimedia Commons`_ is a collection of more than 120 million freely usable
media files to which anyone can contribute.

This engine uses the `MediaWiki query API`_, with which engines can be configured
for searching images, videos, audio, and other files on Wikimedia Commons.

.. _MediaWiki query API: https://commons.wikimedia.org/w/api.php?action=help&modules=query
.. _Wikimedia Commons: https://commons.wikimedia.org/

Configuration
=============

The engine has the following additional settings:

- :py:obj:`wc_search_type`: one of ``image``, ``video``, ``audio`` or ``file``

.. code:: yaml

   - name: wikicommons.images
     engine: wikicommons
     wc_search_type: image

   - name: wikicommons.videos
     engine: wikicommons
     wc_search_type: video

   - name: wikicommons.audio
     engine: wikicommons
     wc_search_type: audio

   - name: wikicommons.files
     engine: wikicommons
     wc_search_type: file

Implementations
===============

"""
import typing as t
import datetime
import pathlib
from urllib.parse import urlencode, unquote
from searx.utils import html_to_text, humanize_bytes
from searx.result_types import EngineResults
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Descriptive metadata about the upstream service and its API.
about = {
    "website": "https://commons.wikimedia.org/",
    "wikidata_id": "Q565",
    "official_api_documentation": "https://commons.wikimedia.org/w/api.php",
    "use_official_api": True,
    "require_api_key": False,
    "results": "JSON",
}

# No default categories are set here -- presumably they are assigned per
# engine instance in the settings (TODO confirm).
categories: list[str] = []

# Paging: ``page_size`` results are requested per page (sent as ``gsrlimit``,
# and used to compute ``gsroffset`` in ``request``).
paging = True
page_size = 10

# Endpoint of the MediaWiki API all requests are sent to.
wc_api_url = "https://commons.wikimedia.org/w/api.php"

# Maps the configurable search type to the value(s) placed behind the
# ``filetype:`` keyword of the search query.
SEARCH_TYPES: dict[str, str] = {
    "image": "bitmap|drawing",
    "video": "video",
    "audio": "audio",
    "file": "multimedia|office|archive|3d",
}

# Kind of media this engine instance searches for; has to be one of the keys
# of ``SEARCH_TYPES`` (checked in ``setup``).  The empty default is not valid.
wc_search_type: str = ""

# Kept for reference, not active:
# FileType = t.Literal["bitmap", "drawing", "video", "audio", "multimedia", "office", "archive", "3d"]
# FILE_TYPES = list(t.get_args(FileType))
def setup(engine_settings: dict[str, t.Any]) -> bool:
    """Check the engine configuration before the engine is used.

    The value found under ``wc_search_type`` in *engine_settings* has to be one
    of the keys of :py:obj:`SEARCH_TYPES`.

    :param engine_settings: settings of this engine instance.
    :return: ``True`` if the search type is valid; ``False`` (after logging an
        error that lists the valid choices) otherwise.
    """

    configured = engine_settings.get("wc_search_type")
    if configured in SEARCH_TYPES:
        return True

    # NOTE(review): ``logger`` is not defined in this module's visible source;
    # presumably it is injected into the engine module by the loader.
    logger.error(
        "wc_search_type: %s isn't a valid file type (%s)",
        configured,
        ",".join(SEARCH_TYPES),
    )
    return False
def request(query: str, params: "OnlineParams") -> None:
    """Assemble the MediaWiki API request for *query*.

    The interface language (``uselang``) is the language part of
    ``params["searxng_locale"]``, or ``en`` when the locale is ``all``.  Paging
    is mapped to ``gsrlimit`` / ``gsroffset`` using :py:obj:`page_size` and
    ``params["pageno"]``.  The search is restricted to the file types that
    belong to the configured :py:obj:`wc_search_type`.

    The finished URL is stored in ``params["url"]``; nothing is returned.
    """

    locale = params["searxng_locale"]
    uselang: str = "en" if locale == "all" else locale.split("-")[0]

    file_filter = SEARCH_TYPES[wc_search_type]
    offset = page_size * (params["pageno"] - 1)

    args = {
        # https://commons.wikimedia.org/w/api.php
        "format": "json",
        "uselang": uselang,
        "action": "query",
        # https://commons.wikimedia.org/w/api.php?action=help&modules=query
        "prop": "info|imageinfo",
        # generator (gsr options) https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bsearch
        "generator": "search",
        "gsrnamespace": "6",  # https://www.mediawiki.org/wiki/Help:Namespaces#Renaming_namespaces
        "gsrprop": "snippet",
        "gsrlimit": page_size,
        "gsroffset": offset,
        "gsrsearch": f"filetype:{file_filter} {query}",
        # imageinfo: https://commons.wikimedia.org/w/api.php?action=help&modules=query%2Bimageinfo
        "iiprop": "url|size|mime",
        "iiurlheight": "180",  # needed for the thumb url
    }

    # ':' and '|' are left unescaped in the query string
    query_string = urlencode(args, safe=":|")
    params["url"] = f"{wc_api_url}?{query_string}"
def response(resp: "SXNG_Response") -> EngineResults:
    """Convert the JSON answer of the MediaWiki API into engine results.

    Every entry of ``query.pages`` that carries a non-empty ``imageinfo`` list
    becomes one result; entries without it are skipped.  The kind of result
    depends on the configured :py:obj:`wc_search_type`:

    - ``file``:  ``res.types.File``
    - ``image``: ``res.types.LegacyResult`` using the ``images.html`` template
    - ``video``: ``res.types.LegacyResult`` using the ``videos.html`` template
    - ``audio``: ``res.types.MainResult`` using the ``default.html`` template

    For any other search type nothing is added.

    :param resp: HTTP response of the request built by ``request``.
    :return: the collected results (possibly empty).
    """

    res = EngineResults()
    json_data = resp.json()
    pages = json_data.get("query", {}).get("pages", {}).values()

    for item in pages:
        if not item.get("imageinfo"):
            continue
        imageinfo = item["imageinfo"][0]

        # Strip only the leading namespace prefix; a plain ``replace`` would
        # also remove a "File:" that happens to occur inside the title.  The
        # file extension (text after the last dot) is dropped as well.
        title: str = item["title"].removeprefix("File:").rsplit(".", 1)[0]

        # The snippet is display-only: a missing one must not discard the
        # whole result page with a KeyError.
        snippet = item.get("snippet")
        content = html_to_text(snippet) if snippet else ""

        url: str = imageinfo["descriptionurl"]
        media_url: str = imageinfo["url"]
        mimetype: str = imageinfo["mime"]
        # NOTE(review): assumes the API always delivers a ``thumburl`` when
        # ``iiurlheight`` is requested -- confirm for files without a preview.
        thumbnail: str = imageinfo["thumburl"]

        size = imageinfo.get("size")
        if size:
            size = humanize_bytes(size)

        duration = None
        seconds = imageinfo.get("duration")
        if seconds:
            try:
                duration = datetime.timedelta(seconds=int(seconds))
            except (OverflowError, ValueError, TypeError):
                # Unusable duration value: keep the result, just without a
                # length, instead of failing the whole response.
                duration = None

        if wc_search_type == "file":
            res.add(
                res.types.File(
                    title=title,
                    url=url,
                    content=content,
                    size=size,
                    mimetype=mimetype,
                    filename=unquote(pathlib.Path(media_url).name),
                    embedded=media_url,
                    thumbnail=thumbnail,
                )
            )
        elif wc_search_type == "image":
            res.add(
                res.types.LegacyResult(
                    template="images.html",
                    title=title,
                    url=url,
                    content=content,
                    img_src=media_url,
                    thumbnail_src=thumbnail,
                    resolution=f"{imageinfo['width']} x {imageinfo['height']}",
                    img_format=mimetype,
                    filesize=size,
                )
            )
        elif wc_search_type == "video":
            res.add(
                res.types.LegacyResult(
                    template="videos.html",
                    title=title,
                    url=url,
                    content=content,
                    iframe_src=media_url,
                    length=duration,
                )
            )
        elif wc_search_type == "audio":
            res.add(
                res.types.MainResult(
                    template="default.html",
                    title=title,
                    url=url,
                    content=content,
                    audio_src=media_url,
                    length=duration,
                )
            )

    return res