[mod] Google Scholar engine: revision of the engine (Paper result)

Revision of the engine / use of the result type Paper as well as other typifications. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-05-22 02:44:32 +02:00 · 2025-09-10 16:39:24 +02:00
parent 078c9fcb68
commit 599d9488c5
2 changed files with 152 additions and 128 deletions
@@ -11,6 +11,8 @@ engines:
 """
 import typing as t
 import re
 import random
 import string
@@ -28,8 +30,10 @@ from searx.exceptions import SearxEngineCaptchaException
 from searx.enginelib.traits import EngineTraits
 from searx.result_types import EngineResults
 if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams
 # about
 about = {
    "website": 'https://www.google.com',
    "wikidata_id": 'Q9366',
@@ -89,7 +93,7 @@ def ui_async(start: int) -> str:
    return ",".join([arc_id, use_ac, _fmt])
-def get_google_info(params, eng_traits):
+def get_google_info(params: "OnlineParams", eng_traits: EngineTraits) -> dict[str, t.Any]:
    """Composing various (language) properties for the google engines (:ref:`google
    API`).
@@ -144,7 +148,7 @@ def get_google_info(params, eng_traits):
    """
-    ret_val = {
+    ret_val: dict[str, t.Any] = {
        'language': None,
        'country': None,
        'subdomain': None,
@@ -273,7 +277,7 @@ def detect_google_sorry(resp):
        raise SearxEngineCaptchaException()
-def request(query, params):
+def request(query: str, params: "OnlineParams") -> None:
    """Google search request"""
    # pylint: disable=line-too-long
    start = (params['pageno'] - 1) * 10
@@ -317,7 +321,6 @@ def request(query, params):
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
 # =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
@@ -341,7 +344,7 @@ def parse_data_images(text: str):
    return data_image_map
-def response(resp) -> EngineResults:
+def response(resp: "SXNG_Response"):
    """Get response from google's search request"""
    # pylint: disable=too-many-branches, too-many-statements
    detect_google_sorry(resp)
@@ -1,12 +1,29 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""This is the implementation of the Google Scholar engine.
+"""Google Scholar is a freely accessible web search engine that indexes the full
 text or metadata of scholarly literature across an array of publishing formats
 and disciplines.
 Compared to other Google services the Scholar engine has a simple GET REST-API
-and there does not exists `async` API.  Even though the API slightly vintage we
+and there does not exist an ``async`` API.  Even though the API is slightly vintage
-can make use of the :ref:`google API` to assemble the arguments of the GET
+we can make use of the :ref:`google API` to assemble the arguments of the GET
 request.
 Configuration
 =============
 .. code:: yaml
  - name: google scholar
    engine: google_scholar
    shortcut: gos
 Implementations
 ===============
 """
 import typing as t
 from urllib.parse import urlencode
 from datetime import datetime
 from lxml import html
@@ -16,6 +33,7 @@ from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
    ElementType,
 )
 from searx.exceptions import SearxEngineCaptchaException
@@ -26,18 +44,23 @@ from searx.engines.google import (
    time_range_dict,
 )
-# about
+from searx.result_types import EngineResults
 if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams
 about = {
-    "website": 'https://scholar.google.com',
+    "website": "https://scholar.google.com",
-    "wikidata_id": 'Q494817',
+    "wikidata_id": "Q494817",
-    "official_api_documentation": 'https://developers.google.com/custom-search',
+    "official_api_documentation": "https://developers.google.com/custom-search",
    "use_official_api": False,
    "require_api_key": False,
-    "results": 'HTML',
+    "results": "HTML",
 }
 # engine dependent config
-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
 paging = True
 max_page = 50
 """`Google max 50 pages`_
@@ -50,9 +73,97 @@ safesearch = False
 send_accept_language_header = True
-def time_range_args(params):
def request(query: str, params: "OnlineParams") -> None:
    """Build the HTTP GET request for a Google Scholar search.

    Nothing is returned: the request URL and the cookies are stored in
    ``params`` and the headers in ``params["headers"]`` are extended.

    :param query: Search term, sent as the ``q`` argument.
    :param params: Request parameters; ``pageno`` is read here, ``url`` and
        ``cookies`` are set and ``headers`` is updated.
    """
    info = get_google_info(params, traits)

    # subdomain is: scholar.google.xy
    info["subdomain"] = info["subdomain"].replace("www.", "scholar.")

    # Arguments are added one by one in the order they appear in the URL.
    url_args = {"q": query}
    url_args.update(info["params"])
    # offset of the first result: 0 for page 1, 10 for page 2, ...
    url_args["start"] = (params["pageno"] - 1) * 10
    url_args["as_sdt"] = "2007"  # include patents / to disable set "0,5"
    url_args["as_vis"] = "0"  # include citations / to disable set "1"
    url_args.update(time_range_args(params))

    params["url"] = f"https://{info['subdomain']}/scholar?{urlencode(url_args)}"
    params["cookies"] = info["cookies"]
    params["headers"].update(info["headers"])
 def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
    """Parse response from Google Scholar"""
    res = EngineResults()
    dom = html.fromstring(resp.text)
    detect_google_captcha(dom)
    # parse results
    for result in eval_xpath_list(dom, "//div[@data-rp]"):
        title = extract_text(eval_xpath(result, ".//h3[1]//a"))
        if not title:
            # this is a [ZITATION] block
            continue
        pub_type: str = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']")) or ""
        if pub_type:
            pub_type = pub_type[1:-1].lower()
        url: str = eval_xpath_getindex(result, ".//h3[1]//a/@href", 0)
        content: str = extract_text(eval_xpath(result, ".//div[@class='gs_rs']")) or ""
        authors, journal, publisher, publishedDate = parse_gs_a(
            extract_text(eval_xpath(result, ".//div[@class='gs_a']"))
        )
        if publisher in url:
            publisher = ""
        # cited by
        comments: str = (
            extract_text(eval_xpath(result, ".//div[@class='gs_fl']/a[starts-with(@href,'/scholar?cites=')]")) or ""
        )
        # link to the html or pdf document
        html_url: str = ""
        pdf_url: str = ""
        doc_url = eval_xpath_getindex(result, ".//div[@class='gs_or_ggsm']/a/@href", 0, default=None)
        doc_type = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']"))
        if doc_type == "[PDF]":
            pdf_url = doc_url
        else:
            html_url = doc_url
        res.add(
            res.types.Paper(
                type=pub_type,
                url=url,
                title=title,
                authors=authors,
                publisher=publisher,
                journal=journal,
                publishedDate=publishedDate,
                content=content,
                comments=comments,
                html_url=html_url,
                pdf_url=pdf_url,
            )
        )
    # parse suggestion
    for suggestion in eval_xpath(dom, "//div[contains(@class, 'gs_qsuggest_wrap')]//li//a"):
        res.add(res.types.LegacyResult(suggestion=extract_text(suggestion)))
    for correction in eval_xpath(dom, "//div[@class='gs_r gs_pda']/a"):
        res.add(res.types.LegacyResult(correction=extract_text(correction)))
    return res
 def time_range_args(params: "OnlineParams") -> dict[str, int]:
    """Returns a dictionary with time range arguments based on
-    ``params['time_range']``.
+    ``params["time_range"]``.
    Google Scholar supports a detailed search by year.  Searching by *last
    month* or *last week* (as offered by SearXNG) is uncommon for scientific
@@ -60,21 +171,23 @@ def time_range_args(params):
    To limit the result list when the user selects a range, all the SearXNG
    ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range
-    is set an empty dictionary of arguments is returned.  Example;  when
+    is set an empty dictionary of arguments is returned.
-    user selects a time range (current year minus one in 2022):
+
    Example: when the user selects a time range and we find ourselves in the year
    2025 (current year minus one):
    .. code:: python
-        { 'as_ylo' : 2021 }
+        { "as_ylo" : 2024 }
    """
-    ret_val = {}
+    ret_val: dict[str, int] = {}
-    if params['time_range'] in time_range_dict:
+    if params["time_range"] in time_range_dict:
-        ret_val['as_ylo'] = datetime.now().year - 1
+        ret_val["as_ylo"] = datetime.now().year - 1
    return ret_val
-def detect_google_captcha(dom):
+def detect_google_captcha(dom: ElementType):
    """In case of CAPTCHA Google Scholar opens its own *not a Robot* dialog and is
    not redirected to ``sorry.google.com``.
    """
@@ -82,29 +195,7 @@ def detect_google_captcha(dom):
        raise SearxEngineCaptchaException()
-def request(query, params):
+def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]:
    """Google-Scholar search request"""
    google_info = get_google_info(params, traits)
    # subdomain is: scholar.google.xy
    google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
    args = {
        'q': query,
        **google_info['params'],
        'start': (params['pageno'] - 1) * 10,
        'as_sdt': '2007',  # include patents / to disable set '0,5'
        'as_vis': '0',  # include citations / to disable set '1'
    }
    args.update(time_range_args(params))
    params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
    params['cookies'] = google_info['cookies']
    params['headers'].update(google_info['headers'])
    return params
 def parse_gs_a(text: str | None):
    """Parse the text written in green.
    Possible formats:
@@ -113,98 +204,28 @@ def parse_gs_a(text: str | None):
    * "{authors} - {publisher}"
    """
    if text is None or text == "":
-        return None, None, None, None
+        return [], "", "", None
-    s_text = text.split(' - ')
+    s_text = text.split(" - ")
-    authors = s_text[0].split(', ')
+    authors: list[str] = s_text[0].split(", ")
-    publisher = s_text[-1]
+    publisher: str = s_text[-1]
    if len(s_text) != 3:
-        return authors, None, publisher, None
+        return authors, "", publisher, None
    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
    # get journal and year
-    journal_year = s_text[1].split(', ')
+    journal_year = s_text[1].split(", ")
    # journal is optional and may contain commas
    if len(journal_year) > 1:
-        journal = ', '.join(journal_year[0:-1])
+        journal: str = ", ".join(journal_year[0:-1])
-        if journal == '…':
+        if journal == "…":
-            journal = None
+            journal = ""
    else:
-        journal = None
+        journal = ""
    # year
    year = journal_year[-1]
    try:
-        publishedDate = datetime.strptime(year.strip(), '%Y')
+        publishedDate = datetime.strptime(year.strip(), "%Y")
    except ValueError:
        publishedDate = None
    return authors, journal, publisher, publishedDate
 def response(resp):  # pylint: disable=too-many-locals
    """Parse response from Google Scholar"""
    results = []
    # convert the text to dom
    dom = html.fromstring(resp.text)
    detect_google_captcha(dom)
    # parse results
    for result in eval_xpath_list(dom, '//div[@data-rp]'):
        title = extract_text(eval_xpath(result, './/h3[1]//a'))
        if not title:
            # this is a [ZITATION] block
            continue
        pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if pub_type:
            pub_type = pub_type[1:-1].lower()
        url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
        content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
        authors, journal, publisher, publishedDate = parse_gs_a(
            extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
        )
        if publisher in url:
            publisher = None
        # cited by
        comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
        # link to the html or pdf document
        html_url = None
        pdf_url = None
        doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
        doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
        if doc_type == "[PDF]":
            pdf_url = doc_url
        else:
            html_url = doc_url
        results.append(
            {
                'template': 'paper.html',
                'type': pub_type,
                'url': url,
                'title': title,
                'authors': authors,
                'publisher': publisher,
                'journal': journal,
                'publishedDate': publishedDate,
                'content': content,
                'comments': comments,
                'html_url': html_url,
                'pdf_url': pdf_url,
            }
        )
    # parse suggestion
    for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})
    for correction in eval_xpath(dom, '//div[@class="gs_r gs_pda"]/a'):
        results.append({'correction': extract_text(correction)})
    return results