Files
searxng/searx/engines/pubmed.py
T
Markus Heiser efc305b7f9 [mod] normalize variable name for the max number of results per request (#6131)
[mod] normalize variable name for the max number of results per request

In the past, we have used different names for the variable that specifies the
maximum number of hits in the outgoing request.

- ``page_size``
- ``number_of_results``
- ``nb_per_page``

Since *page_size* is the most accurate term and is also used in the XPath
engines, all other engines are adjusted accordingly within this
patch; the documentation is adjusted as well.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-05-25 12:41:31 +02:00

152 lines
4.8 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""PubMed_ comprises more than 39 million citations for biomedical literature
from MEDLINE, life science journals, and online books. Citations may include
links to full text content from PubMed Central and publisher web sites.
.. _PubMed: https://pubmed.ncbi.nlm.nih.gov/
Configuration
=============
.. code:: yaml
- name: pubmed
engine: pubmed
shortcut: pub
Implementations
===============
"""
import typing as t
from datetime import datetime
from urllib.parse import urlencode
from lxml import etree
from searx.result_types import EngineResults
from searx.network import get
from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
ElementType,
)
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Engine metadata.
about = {
    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
    "wikidata_id": "Q1540899",
    "official_api_documentation": {
        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
    "results": "XML",
}

categories = ["science", "scientific publications"]

# Base URL of the NCBI E-utilities; the ``esearch.fcgi`` and ``efetch.fcgi``
# endpoints are appended to it when the requests are built.
eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# engine dependent config

# Number of hits per result page; also used to compute the paging offset.
page_size = 10

# Prefix to which an article's PMID is appended to form the result URL.
pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
def request(query: str, params: "OnlineParams") -> None:
    """Resolve *query* to PubMed IDs and prepare the ``efetch`` request.

    Two E-utilities calls are involved:

    1. ``esearch.fcgi`` is called right here (with a 3 second timeout) to get
       the PubMed IDs (PMIDs) of the requested result page.
    2. The ``efetch.fcgi`` URL for exactly these PMIDs is stored in
       ``params["url"]``.

    :param query: search term, passed to ESearch as ``term``.
    :param params: request parameters; ``params["pageno"]`` (1-based) is read
        and ``params["url"]`` is set.
    """
    args = urlencode(
        {
            "db": "pubmed",
            "term": query,
            "retstart": (params["pageno"] - 1) * page_size,
            # ``retmax`` is the ESearch parameter that caps the number of
            # returned IDs (default: 20).  It has to match the step width of
            # ``retstart``, otherwise consecutive pages overlap.
            "retmax": page_size,
        }
    )
    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"

    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
    esearch_resp: "SXNG_Response" = get(esearch_url, timeout=3)
    pmids_results = etree.XML(esearch_resp.content)

    # Skip <Id> elements without text so that ",".join() never sees ``None``.
    pmids: list[str] = [
        id_elem.text for id_elem in pmids_results.xpath("//eSearchResult/IdList/Id") if id_elem.text
    ]

    # send efetch request with the IDs from esearch response
    args = urlencode(
        {
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(pmids),
        }
    )
    params["url"] = f"{eutils_api}/efetch.fcgi?{args}"
def _field_txt(xml: ElementType, xpath_str: str) -> str:
    """Return the first match of *xpath_str* below *xml* as text, ``""`` if there is none."""
    elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
    return extract_text(elem, allow_none=True) or ""


def _element_text(elem: ElementType) -> str:
    """Return the complete text below *elem*, with whitespace collapsed.

    Unlike ``elem.text`` this includes the text inside and after inline child
    elements (e.g. ``<i>``, ``<sub>``), so titles with markup are not cut off.
    """
    return " ".join("".join(elem.itertext()).split())


def _author_names(medline_citation: ElementType) -> list[str]:
    """Return the "ForeName LastName" strings of the article's ``<Author>`` elements."""
    names: list[str] = []
    for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
        fore_name = eval_xpath_getindex(author, "./ForeName", 0, default=None)
        last_name = eval_xpath_getindex(author, "./LastName", 0, default=None)
        # ``.text`` is None for an empty element; treat it like a missing one
        # instead of rendering the literal string "None".
        parts = [
            (fore_name.text or "") if fore_name is not None else "",
            (last_name.text or "") if last_name is not None else "",
        ]
        author_name = " ".join(parts).strip()
        if author_name:
            names.append(author_name)
    return names


def _accepted_date(pubmed_data: "ElementType | None") -> "datetime | None":
    """Return the date of the ``PubStatus='accepted'`` history entry, if usable.

    ``None`` is returned when there is no ``<PubmedData>``, no accepted entry,
    or when year / month / day are missing or do not form a valid date.
    """
    if pubmed_data is None:
        return None
    accepted = eval_xpath_getindex(
        pubmed_data, "./History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
    )
    if accepted is None:
        return None
    try:
        # A missing part yields "" and int("") raises ValueError, so missing
        # and malformed values are handled by the same except clause.
        return datetime(
            year=int(_field_txt(accepted, "./Year/text()")),
            month=int(_field_txt(accepted, "./Month/text()")),
            day=int(_field_txt(accepted, "./Day/text()")),
        )
    except ValueError:
        return None


def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
    """Parse the ``efetch`` XML in ``resp.content`` into paper results.

    One ``Paper`` result is added per ``<PubmedArticle>`` element.

    :param resp: response of the ``efetch`` request.
    :return: the collected results.
    """
    # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd

    # parse efetch response
    efetch_xml = etree.XML(resp.content)
    res = EngineResults()

    for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):
        medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)
        # <PubmedData> is optional in the DTD; a record without it must not
        # abort the parsing of all other records.
        pubmed_data = eval_xpath_getindex(pubmed_article, "./PubmedData", 0, default=None)

        title: str = _element_text(eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0))
        pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
        url: str = pubmed_url + pmid

        # Only the first matching text node of the abstract is used as content.
        content = _field_txt(medline_citation, ".//Abstract/AbstractText//text()")
        doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
        journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
        issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                # no ISSN -> empty list, not a list holding an empty string
                issn=[issn] if issn else [],
                authors=_author_names(medline_citation),
                doi=doi,
                publishedDate=_accepted_date(pubmed_data),
            )
        )

    return res