mirror of
https://github.com/searxng/searxng.git
synced 2026-05-26 21:00:13 +02:00
efc305b7f9
[mod] normalize variable name for the max number of results per request In the past, we have used different names for the variable that specifies the maximum number of hits in the outgoing request. - ``page_size`` - ``number_of_results`` - ``nb_per_page`` Since *page_size* is the most accurate term and is also used in the XPath engines, all other engines are adjusted accordingly within this patch .. documentation adjusted accordingly. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
152 lines
4.8 KiB
Python
152 lines
4.8 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
"""PubMed_ comprises more than 39 million citations for biomedical literature
from MEDLINE, life science journals, and online books.  Citations may include
links to full text content from PubMed Central and publisher web sites.

.. _PubMed: https://pubmed.ncbi.nlm.nih.gov/

Configuration
=============

.. code:: yaml

  - name: pubmed
    engine: pubmed
    shortcut: pub

Implementations
===============

"""
|
|
|
|
import typing as t
|
|
|
|
from datetime import datetime
|
|
from urllib.parse import urlencode
|
|
|
|
from lxml import etree
|
|
|
|
from searx.result_types import EngineResults
|
|
from searx.network import get
|
|
from searx.utils import (
|
|
eval_xpath_getindex,
|
|
eval_xpath_list,
|
|
extract_text,
|
|
ElementType,
|
|
)
|
|
|
|
if t.TYPE_CHECKING:
|
|
from searx.extended_types import SXNG_Response
|
|
from searx.search.processors import OnlineParams
|
|
|
|
|
|
# Engine metadata: where the service lives, how it is identified on Wikidata,
# and which kind of API / result format this module talks to.
about = {
    # public landing page of the service
    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
    "wikidata_id": "Q1540899",
    # pointer to the developer documentation of the API used below
    "official_api_documentation": {
        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
    # responses are parsed as XML
    "results": "XML",
}
|
|
|
|
# Categories under which this engine is listed.
categories = ["science", "scientific publications"]

# Common prefix of the E-utilities endpoints (esearch.fcgi / efetch.fcgi).
eutils_api = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# engine dependent config

# Number of results per page; also used to compute the paging offset.
page_size = 10
# Prefix that, followed by a PMID, forms the URL of a result.
pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"
|
|
|
|
|
|
def request(query: str, params: "OnlineParams") -> None:
    """Prepare the efetch request for *query* and store it in ``params["url"]``.

    Two E-utilities calls are involved:

    1. An *esearch* request, sent synchronously from within this function
       (timeout 3 seconds), maps the query and page number to a list of PMIDs.
    2. An *efetch* URL for exactly those PMIDs is built and written to
       ``params["url"]``.

    :param query: search term, passed to esearch as ``term``.
    :param params: request parameters; ``params["pageno"]`` is read (1-based)
        and ``params["url"]`` is set.
    """

    args = urlencode(
        {
            "db": "pubmed",
            "term": query,
            "retstart": (params["pageno"] - 1) * page_size,
            # ``retmax`` is the ESearch parameter that limits the number of
            # returned UIDs.  Without it the API uses its own default, which
            # does not match the ``page_size`` step used for ``retstart`` and
            # makes consecutive pages overlap.
            "retmax": page_size,
        }
    )
    esearch_url = f"{eutils_api}/esearch.fcgi?{args}"
    # DTD: https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd
    esearch_resp: "SXNG_Response" = get(esearch_url, timeout=3)
    pmids_results = etree.XML(esearch_resp.content)
    # skip <Id> nodes without text so that the join below never sees None
    pmids: list[str] = [i.text for i in pmids_results.xpath("//eSearchResult/IdList/Id") if i.text]

    # send efetch request with the IDs from esearch response
    # NOTE(review): when esearch finds nothing, ``id`` is empty here -- confirm
    # how efetch / the caller behaves for an empty ID list.
    args = urlencode(
        {
            "db": "pubmed",
            "retmode": "xml",
            "id": ",".join(pmids),
        }
    )
    efetch_url = f"{eutils_api}/efetch.fcgi?{args}"
    params["url"] = efetch_url
|
|
|
|
|
|
def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
    """Parse the efetch XML in ``resp.content`` into ``Paper`` results.

    One result is added per ``<PubmedArticle>`` element.  The result URL is
    ``pubmed_url`` followed by the article's PMID; the date with
    ``PubStatus='accepted'`` from the article history (if present and valid) is
    used as ``publishedDate``.

    :param resp: response of the efetch request.
    :return: the collected results.
    """

    # DTD: https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd

    # parse efetch response
    efetch_xml = etree.XML(resp.content)
    res = EngineResults()

    def _field_txt(xml: ElementType, xpath_str: str) -> str:
        """Return the text of the first match of *xpath_str* or an empty string."""
        elem = eval_xpath_getindex(xml, xpath_str, 0, default="")
        return extract_text(elem, allow_none=True) or ""

    def _full_txt(elem: ElementType) -> str:
        """Return all text below *elem* (inline markup included), whitespace collapsed."""
        return " ".join("".join(elem.itertext()).split())

    for pubmed_article in eval_xpath_list(efetch_xml, "//PubmedArticle"):

        medline_citation: ElementType = eval_xpath_getindex(pubmed_article, "./MedlineCitation", 0)
        pubmed_data: ElementType = eval_xpath_getindex(pubmed_article, "./PubmedData", 0)

        # ``.text`` would stop at the first inline child element (<i>, <sub>,
        # ...) and is None for an empty element; collect the nested text instead.
        title: str = _full_txt(eval_xpath_getindex(medline_citation, ".//Article/ArticleTitle", 0))
        pmid: str = eval_xpath_getindex(medline_citation, ".//PMID", 0).text
        url: str = pubmed_url + pmid

        # An abstract may consist of several <AbstractText> sections, each of
        # which may contain inline markup: join the full text of all of them
        # instead of using only the very first text node.
        abstract_parts = (
            _full_txt(section) for section in eval_xpath_list(medline_citation, ".//Abstract/AbstractText")
        )
        content = " ".join(part for part in abstract_parts if part)

        doi = _field_txt(medline_citation, ".//ELocationID[@EIdType='doi']/text()")
        journal = _field_txt(medline_citation, "./Article/Journal/Title/text()")
        issn = _field_txt(medline_citation, "./Article/Journal/ISSN/text()")

        authors: list[str] = []

        for author in eval_xpath_list(medline_citation, "./Article/AuthorList/Author"):
            fore_name = eval_xpath_getindex(author, "./ForeName", 0, default=None)
            last_name = eval_xpath_getindex(author, "./LastName", 0, default=None)
            # a missing element or an element without text contributes nothing
            # (avoids the literal string "None" in the author name)
            name_parts = [(part.text or "").strip() for part in (fore_name, last_name) if part is not None]
            author_name = " ".join(part for part in name_parts if part)
            if author_name:
                authors.append(author_name)

        accepted_date = eval_xpath_getindex(
            pubmed_data, "./History//PubMedPubDate[@PubStatus='accepted']", 0, default=None
        )
        pub_date = None
        if accepted_date is not None:
            year = eval_xpath_getindex(accepted_date, "./Year", 0)
            month = eval_xpath_getindex(accepted_date, "./Month", 0)
            day = eval_xpath_getindex(accepted_date, "./Day", 0)
            try:
                pub_date = datetime(year=int(year.text), month=int(month.text), day=int(day.text))
            except (ValueError, TypeError):
                # ValueError: non-numeric or out-of-range date parts;
                # TypeError: empty element (``.text`` is None).  In both cases
                # the result is simply emitted without a date.
                pass

        res.add(
            res.types.Paper(
                url=url,
                title=title,
                content=content,
                journal=journal,
                # no ISSN found: pass an empty list rather than [""]
                issn=[issn] if issn else [],
                authors=authors,
                doi=doi,
                publishedDate=pub_date,
            )
        )
    return res
|