Compare commits

..

4 Commits

Author SHA1 Message Date
Markus Heiser 00f7c68a6f [chore] drop emacs' obsolete .dir-locals template (#6236)
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-06-10 17:38:19 +02:00
Bnyro 41c98b3b41 [chore] devops: add languages config for helix editor
The default Helix configuration for Python is different,
so the pylint warnings aren't shown and the formatter
re-formats files by accident when you edit an existing file.

Therefore, this commit adds `python` language configuration
to ease developing SearXNG with Helix Editor [^1].

[^1]: https://helix-editor.com
2026-06-10 17:38:01 +02:00
Bnyro f4c63c8eb0 [feat] engines: add duckduckgo web engine as alternative to html.duckduckgo.com
html.duckduckgo.com captchas all my IPs very fast. I figured out that using
duckduckgo.com works even if html.duckduckgo.com is captcha-ed, hence adding
support for duckduckgo.com's general web search here.

This implementation fetches the link to the first API page
(i.e. ``links.duckduckgo.com/d.js?...``) from duckduckgo.com and uses the ``n``
parameter of the API to fetch all subsequent pages.

This also means that it's not possible to immediately search for the third
page - the first and the second page would need to be loaded first.

The reason why we can't just normally use the `vqd` value is that the API URLs
require an additional parameter `dp` which seems generated at server-side, so we
can't build it ourselves and must scrape it from the HTML pages.
2026-06-10 16:49:56 +02:00
Markus Heiser 26801e92af [fix] sqlitedb: create DB Schema (DDL) during app initialization (hardening) (#6187)
The initialization of the DB schema ("base schema") has so far been done on
demand, which causes race conditions with competing threads and processes.

The DDL statements for creating the "base schema" are now executed as part of
the initialization of the app.

Further improvements were made to harden the database applications:

- Wikidata & Radio-Browser engine perform their initialization only once (so far
  the initialization was carried out in each thread/process).

- If multiple processes try to set DB's WAL mode when opening the DB at the same
  time, this usually leads to another race condition, which is now also caught.

Related:

- https://github.com/searxng/searxng/issues/6181#issuecomment-4586705

Closes: #6181

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
2026-06-10 15:48:49 +02:00
11 changed files with 245 additions and 180 deletions
-163
View File
@@ -1,163 +0,0 @@
;;; .dir-locals.el
;;
;; Per-Directory Local Variables:
;; https://www.gnu.org/software/emacs/manual/html_node/emacs/Directory-Variables.html
;;
;; For full fledge developer tools install emacs packages:
;;
;; M-x package-install ...
;;
;; magit gitconfig
;; nvm lsp-mode lsp-pyright lsp-eslint
;; pyvenv pylint pip-requirements
;; jinja2-mode
;; json-mode
;; company company-jedi company-quickhelp company-shell
;; realgud
;; sphinx-doc markdown-mode graphviz-dot-mode
;; apache-mode nginx-mode
;;
;; To setup a developer environment, build target::
;;
;; $ make node.env.dev pyenv.install
;;
;; Some buffer locals are referencing the project environment:
;;
;; - prj-root --> <repo>/
;; - nvm-dir --> <repo>/.nvm
;; - python-environment-directory --> <repo>/local
;; - python-environment-default-root-name --> py3
;; - python-shell-virtualenv-root --> <repo>/local/py3
;; When this variable is set with the path of the virtualenv to use,
;; `process-environment' and `exec-path' get proper values in order to run
;; shells inside the specified virtualenv, example::
;; (setq python-shell-virtualenv-root "/path/to/env/")
;; - python-shell-interpreter --> <repo>/local/py3/bin/python
;;
;; Python development:
;;
;; Jedi, flycheck & other python stuff should use the 'python-shell-interpreter'
;; from the local py3 environment.
;;
((nil
. ((fill-column . 80)
(indent-tabs-mode . nil)
(eval . (progn
(add-to-list 'auto-mode-alist '("\\.html\\'" . jinja2-mode))
;; project root folder is where the `.dir-locals.el' is located
(setq-local prj-root
(locate-dominating-file default-directory ".dir-locals.el"))
(setq-local python-environment-directory
(expand-file-name "./local" prj-root))
;; to get in use of NVM environment, install https://github.com/rejeep/nvm.el
(setq-local nvm-dir (expand-file-name "./.nvm" prj-root))
;; use nodejs from the (local) NVM environment (see nvm-dir)
(nvm-use-for-buffer)
(ignore-errors (require 'lsp))
(setq-local lsp-server-install-dir (car (cdr nvm-current-version)))
(setq-local lsp-enable-file-watchers nil)
;; use 'py3' environment as default
(setq-local python-environment-default-root-name
"py3")
(setq-local python-shell-virtualenv-root
(expand-file-name
python-environment-default-root-name python-environment-directory))
(setq-local python-shell-interpreter
(expand-file-name
"bin/python" python-shell-virtualenv-root))))))
(makefile-gmake-mode
. ((indent-tabs-mode . t)))
(yaml-mode
. ((eval . (progn
;; flycheck should use the local py3 environment
(setq-local flycheck-yaml-yamllint-executable
(expand-file-name "bin/yamllint" python-shell-virtualenv-root))
(setq-local flycheck-yamllintrc
(expand-file-name ".yamllint.yml" prj-root))
(flycheck-checker . yaml-yamllint)))))
(json-mode
. ((eval . (progn
(setq-local js-indent-level 4)
(flycheck-checker . json-python-json)))))
(js-mode
. ((eval . (progn
(ignore-errors (require 'lsp-eslint))
(setq-local js-indent-level 2)
;; flycheck should use the eslint checker from developer tools
(setq-local flycheck-javascript-eslint-executable
(expand-file-name "node_modules/.bin/eslint" prj-root))
;; (flycheck-mode)
(if (featurep 'lsp-eslint)
(lsp))
))))
(python-mode
. ((eval . (progn
(ignore-errors (require 'jedi-core))
(ignore-errors (require 'lsp-pyright))
(ignore-errors (sphinx-doc-mode))
(setq-local python-environment-virtualenv
(list (expand-file-name "bin/virtualenv" python-shell-virtualenv-root)
;;"--system-site-packages"
"--quiet"))
(setq-local pylint-command
(expand-file-name "bin/pylint" python-shell-virtualenv-root))
(if (featurep 'lsp-pyright)
(lsp))
;; pylint will find the '.pylintrc' file next to the CWD
;; https://pylint.readthedocs.io/en/latest/user_guide/run.html#command-line-options
(setq-local flycheck-pylintrc
".pylintrc")
;; flycheck & other python stuff should use the local py3 environment
(setq-local flycheck-python-pylint-executable
python-shell-interpreter)
;; use 'M-x jedi:show-setup-info' and 'M-x epc:controller' to inspect jedi server
;; https://tkf.github.io/emacs-jedi/latest/#jedi:environment-root -- You
;; can specify a full path instead of a name (relative path). In that case,
;; python-environment-directory is ignored and Python virtual environment
;; is created at the specified path.
(setq-local jedi:environment-root
python-shell-virtualenv-root)
;; https://tkf.github.io/emacs-jedi/latest/#jedi:server-command
(setq-local jedi:server-command
(list python-shell-interpreter
jedi:server-script))
;; jedi:environment-virtualenv --> see above 'python-environment-virtualenv'
;; is set buffer local! No need to setup jedi:environment-virtualenv:
;;
;; Virtualenv command to use. A list of string. If it is nil,
;; python-environment-virtualenv is used instead. You must set non-nil
;; value to jedi:environment-root in order to make this setting work.
;;
;; https://tkf.github.io/emacs-jedi/latest/#jedi:environment-virtualenv
;;
;; (setq-local jedi:environment-virtualenv
;; (list (expand-file-name "bin/virtualenv" python-shell-virtualenv-root)
;; "--python"
;; "/usr/bin/python3.4"
;; ))
))))
)
+10
View File
@@ -0,0 +1,10 @@
# Helix editor (https://helix-editor.com) language configuration for Python.
#
# Helix's default Python configuration differs from what this project needs:
# with the defaults, pylint warnings are not shown and the formatter
# re-formats existing files when they are edited.
[[language]]
name = "python"
language-servers = ["basedpyright", "pylsp"]
# Format with black, passing the target version, line length and string
# normalization options explicitly.
formatter = { command = "black", args = [
"--target-version",
"py311",
"--line-length",
"120",
"--skip-string-normalization",
] }
+8 -5
View File
@@ -444,12 +444,10 @@ class ExpireCacheSQLite(sqlitedb.SQLiteAppl, ExpireCache):
def get(self, key: str, default: typing.Any = None, ctx: str | None = None) -> typing.Any: def get(self, key: str, default: typing.Any = None, ctx: str | None = None) -> typing.Any:
"""Get value of ``key`` from table given by argument ``ctx``. If """Get value of ``key`` from table given by argument ``ctx``. If
``ctx`` argument is ``None`` (the default), a table name is generated ``ctx`` argument is ``None`` (the default), a table name is generated
from the :py:obj:`ExpireCacheCfg.name`. If ``key`` not exists (in from the :py:obj:`ExpireCacheCfg.name`. If ``key`` not exists in
table), the ``default`` value is returned. the table or the table not exists, the ``default`` value is returned.
""" """
table = ctx table = ctx
self.maintenance()
if not table: if not table:
table = self.normalize_name(self.cfg.name) table = self.normalize_name(self.cfg.name)
@@ -457,6 +455,9 @@ class ExpireCacheSQLite(sqlitedb.SQLiteAppl, ExpireCache):
if table not in self.table_names: if table not in self.table_names:
return default return default
# Before values are taken from the table, a maintenance interval may
# need to be carried out.
self.maintenance()
sql = f"SELECT value FROM {table} WHERE key = ?" sql = f"SELECT value FROM {table} WHERE key = ?"
row = self.DB.execute(sql, (key,)).fetchone() row = self.DB.execute(sql, (key,)).fetchone()
if row is None: if row is None:
@@ -469,12 +470,14 @@ class ExpireCacheSQLite(sqlitedb.SQLiteAppl, ExpireCache):
If ``ctx`` argument is ``None`` (the default), a table name is If ``ctx`` argument is ``None`` (the default), a table name is
generated from the :py:obj:`ExpireCacheCfg.name`.""" generated from the :py:obj:`ExpireCacheCfg.name`."""
table = ctx table = ctx
self.maintenance()
if not table: if not table:
table = self.normalize_name(self.cfg.name) table = self.normalize_name(self.cfg.name)
if table in self.table_names: if table in self.table_names:
# Before values are taken from the table, a maintenance interval may
# need to be carried out.
self.maintenance()
for row in self.DB.execute(f"SELECT key, value FROM {table}"): for row in self.DB.execute(f"SELECT key, value FROM {table}"):
yield row[0], self.deserialize(row[1]) yield row[0], self.deserialize(row[1])
+4 -1
View File
@@ -12,6 +12,7 @@ import typing as t
import sys import sys
import copy import copy
import os
from os.path import realpath, dirname from os.path import realpath, dirname
import types import types
@@ -278,6 +279,8 @@ def load_engines(engine_list: list[dict[str, t.Any]]):
else: else:
# if an engine can't be loaded (if for example the engine is missing # if an engine can't be loaded (if for example the engine is missing
# tor or some other requirements) its set to inactive! # tor or some other requirements) its set to inactive!
logger.error("loading engine %s failed: set engine to inactive!", engine_data.get("name", "???")) logger.error(
f"(PID {os.getpid()}) loading engine %s failed: set engine to inactive!", engine_data.get("name", "???")
)
engine_data["inactive"] = True engine_data["inactive"] = True
return engines return engines
+154
View File
@@ -0,0 +1,154 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""DuckDuckGo Web (general)
This implementation fetches the link to the first API page
(i.e. ``links.duckduckgo.com/d.js?...``) from duckduckgo.com and uses the ``n``
parameter of the API to fetch all subsequent pages.
This also means that it's not possible to immediately search for the third
page - the first and the second page would need to be loaded first.
The reason why we can't just normally use the `vqd` value is that the API URLs
require an additional parameter `dp` which seems generated at server-side, so we
can't build it ourselves and must scrape it from the HTML pages.
"""
import typing as t
from urllib.parse import quote_plus
from lxml import html
from searx.utils import html_to_text, gen_useragent, extract_text, eval_xpath
from searx.result_types import EngineResults
from searx.enginelib import EngineCache
from searx.network import get
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
about = {
"website": "https://duckduckgo.com/",
"wikidata_id": "Q12805",
"use_official_api": False,
"require_api_key": False,
"results": "JSON",
}
# engine dependent config
categories = ["general"]
paging = True
_HTTP_User_Agent: str = gen_useragent()
base_url = "https://duckduckgo.com"
CACHE: EngineCache
"""Cache to store the API URLs for combinations of (query, page)."""
def setup(engine_settings: dict[str, str]):
    """Create the module-wide :py:obj:`CACHE` for this engine.

    The cache is named after the engine's ``name`` setting.  The freshly
    created cache object is also returned to the caller.
    """
    global CACHE  # pylint:disable=global-statement

    engine_name = engine_settings["name"]
    CACHE = EngineCache(engine_name)
    return CACHE
def _fetch_first_page_link(
    query: str,
    headers: dict[str, str],
) -> str:
    """Return the URL of the first API page for ``query``.

    The URL is scraped from the HTML of the duckduckgo.com result page, where
    it is the ``href`` of a preload link::

        <link id="deep_preload_link" rel="preload" as="script"
              href="https://links.duckduckgo.com/d.js?q=rust&t=D&l=us-en&s=0&a=h_&ct=DE&vqd=VQD_VALUE&bing_market=en-US&p_ent=&ex=-1&dp=LONG_TOKEN
        >

    This points to the first page.  A link that has been found is stored in
    :py:obj:`CACHE` (key of page 1) for two hours and served from there on
    subsequent calls.

    :param query: the search term
    :param headers: HTTP headers sent with the request to duckduckgo.com
    :return: the URL of the first API page, or an empty string if the HTTP
        request did not return status 200 or the link is not in the page
    """  # pylint:disable=line-too-long

    cache_key = _cache_key(query, 1)
    cached: str | None = CACHE.get(cache_key)
    if cached:
        return cached

    resp = get(
        url=f"{base_url}/?q={quote_plus(query)}&t=h_&ia=web",
        headers=headers,
        timeout=2,
    )
    if resp.status_code != 200:
        # Do not try to parse an error page: its body may be empty (on which
        # html.fromstring raises) and it can't be trusted to contain the link.
        logger.error("vqd: got HTTP %s from duckduckgo.com", resp.status_code)
        return ""

    dom = html.fromstring(resp.text)
    first_page_link = extract_text(eval_xpath(dom, "//link[@id='deep_preload_link']/@href"))
    if not first_page_link:
        logger.error("vqd: failed to load first page JS url from ddg response (return empty string)")
        return ""

    logger.debug("got link to first page from duckduckgo.com request: '%s'", first_page_link)
    CACHE.set(cache_key, first_page_link, expire=7200)
    return first_page_link
def _cache_key(query: str, pageno: int) -> str:
return f"nextpage_url|{query}|{pageno}"
def request(query: str, params: "OnlineParams") -> None:
    """Set ``params["url"]`` to the API URL for the requested page.

    For the first page the URL is obtained from
    :py:func:`_fetch_first_page_link`; for every following page it is read
    from :py:obj:`CACHE`.  ``params["url"]`` is set to ``None`` if the query
    is too long or no API URL is available for the requested page.
    """
    # DDG does not accept queries with more than 499 chars
    if len(query) >= 500:
        params["url"] = None
        return

    headers = params["headers"]
    header_fields = (
        # The vqd value is generated from the query and the UA header. To be
        # able to reuse the vqd value, the UA header must be static.
        ("User-Agent", _HTTP_User_Agent),
        ("Accept", "*/*"),
        ("Referer", f"{base_url}/"),
        ("Host", "duckduckgo.com"),
        # Sec-Fetch headers are required to not get blocked when sending a
        # Firefox user agent
        ("Sec-Fetch-Dest", "script"),
        ("Sec-Fetch-Mode", "no-cors"),
        ("Sec-Fetch-Site", "same-site"),
    )
    for field_name, field_value in header_fields:
        headers[field_name] = field_value

    if params["pageno"] > 1:
        page_url = CACHE.get(_cache_key(query, params["pageno"]))
    else:
        page_url = _fetch_first_page_link(query, headers)

    # TODO: support safesearch, timerange and engine traits  # pylint:disable=fixme
    params["url"] = page_url.replace("/d.js?", "/d.js?o=json&") if page_url else None
def response(resp: "SXNG_Response"):
    """Convert the JSON answer of the API into :py:obj:`EngineResults`.

    Every item of the ``results`` list that has a ``u`` (URL) field is added
    as a main result, with ``t`` as title and ``a`` as (HTML) content.  If the
    last item of the list has an ``n`` field, it is taken as the path of the
    next page and stored in :py:obj:`CACHE` for one hour, so a following
    request for ``pageno + 1`` can find it.

    :param resp: response of the API request
    :return: the parsed results (empty if the API returned no items)
    """
    res = EngineResults()
    results = resp.json()["results"]

    for result in results:
        if "u" not in result:
            # not a search hit (presumably the trailing next-page entry)
            continue
        res.add(res.types.MainResult(url=result["u"], title=result["t"], content=html_to_text(result["a"])))

    # link to next page; an empty result list has no last item to inspect
    next_page_path = results[-1].get("n") if results else None
    if next_page_path:
        CACHE.set(
            _cache_key(resp.search_params["query"], resp.search_params["pageno"] + 1),
            base_url + next_page_path,
            expire=60 * 60,
        )
    return res
+13
View File
@@ -6,6 +6,7 @@
""" """
import os
import random import random
import socket import socket
from urllib.parse import urlencode from urllib.parse import urlencode
@@ -59,7 +60,19 @@ seconds."""
def init(_): def init(_):
global CACHE # pylint: disable=global-statement global CACHE # pylint: disable=global-statement
CACHE = EngineCache("radio_browser") CACHE = EngineCache("radio_browser")
# In an environment with competing processes, the initial loading of the
# cache is required only once.
eng_state: str | None = CACHE.get("eng_state")
if not eng_state or not eng_state.startswith("STATE:"):
CACHE.set("eng_state", f"STATE: being initialized by PID {os.getpid()}")
try:
server_list() server_list()
except Exception:
CACHE.set("eng_state", f"ERROR: initialization by PID {os.getpid()} failed.")
raise
else:
logger.debug(eng_state)
def server_list() -> list[str]: def server_list() -> list[str]:
+13
View File
@@ -7,6 +7,7 @@ Some implementations are shared from :ref:`wikipedia engine`.
import typing as t import typing as t
import os
from hashlib import md5 from hashlib import md5
from urllib.parse import urlencode, unquote from urllib.parse import urlencode, unquote
from json import loads from json import loads
@@ -827,7 +828,19 @@ def debug_explain_wikidata_query(query: str, method: str = "GET"):
def init(_): def init(_):
global CACHE # pylint: disable=global-statement global CACHE # pylint: disable=global-statement
CACHE = EngineCache("wikidata") CACHE = EngineCache("wikidata")
# In an environment with competing processes, the initial loading of the
# cache is required only once.
eng_state: str | None = CACHE.get("eng_state")
if not eng_state or not eng_state.startswith("STATE:"):
CACHE.set("eng_state", f"STATE: being initialized by PID {os.getpid()}")
try:
init_wikidata_properties() init_wikidata_properties()
except Exception:
CACHE.set("eng_state", f"ERROR: initialization by PID {os.getpid()} failed.")
raise
else:
logger.debug(eng_state)
def init_wikidata_properties(): def init_wikidata_properties():
+4 -1
View File
@@ -3,6 +3,7 @@
import typing as t import typing as t
import os
import logging import logging
import threading import threading
from abc import abstractmethod, ABC from abc import abstractmethod, ABC
@@ -154,7 +155,9 @@ class EngineProcessor(ABC):
try: try:
init_ok = self.engine.init(eng_setting) init_ok = self.engine.init(eng_setting)
except Exception: # pylint: disable=broad-except except Exception: # pylint: disable=broad-except
logger.exception("Init method of engine %s failed due to an exception.", self.engine.name) logger.exception(
f"(PID {os.getpid()}) Init method of engine %s failed due to an exception.", self.engine.name
)
init_ok = False init_ok = False
# In older engines, None is returned from the init method, which is # In older engines, None is returned from the init method, which is
# equivalent to indicating that the initialization was successful. # equivalent to indicating that the initialization was successful.
+7
View File
@@ -803,10 +803,17 @@ engines:
display_type: ["infobox"] display_type: ["infobox"]
categories: [general] categories: [general]
# duckduckgo uses html.duckduckgo.com,
# duckduckgo web uses duckduckgo.com
- name: duckduckgo - name: duckduckgo
engine: duckduckgo engine: duckduckgo
shortcut: ddg shortcut: ddg
- name: duckduckgo web
engine: duckduckgo_web
shortcut: ddgw
disabled: true
- name: duckduckgo images - name: duckduckgo images
engine: duckduckgo_extra engine: duckduckgo_extra
categories: [images] categories: [images]
+13 -4
View File
@@ -121,8 +121,8 @@ class SQLiteAppl(abc.ABC):
.. _WAL: https://sqlite.org/wal.html .. _WAL: https://sqlite.org/wal.html
""" """
SQLITE_CONNECT_ARGS: dict[str,str|int|bool|None] = { SQLITE_CONNECT_ARGS: dict[str, str | float | int | bool | None] = {
# "timeout": 5.0, "timeout": 3.0, # default is 5sec
# "detect_types": 0, # "detect_types": 0,
"check_same_thread": bool(SQLITE_THREADING_MODE != "serialized"), "check_same_thread": bool(SQLITE_THREADING_MODE != "serialized"),
"cached_statements": 0, # https://github.com/python/cpython/issues/118172 "cached_statements": 0, # https://github.com/python/cpython/issues/118172
@@ -195,6 +195,7 @@ class SQLiteAppl(abc.ABC):
self.db_url: str = db_url self.db_url: str = db_url
self.properties: SQLiteProperties = SQLiteProperties(db_url) self.properties: SQLiteProperties = SQLiteProperties(db_url)
self._init_done: bool = False self._init_done: bool = False
self._DB: sqlite3.Connection | None = None
self._compatibility() self._compatibility()
# atexit.register(self.tear_down) # atexit.register(self.tear_down)
@@ -209,7 +210,7 @@ class SQLiteAppl(abc.ABC):
def _compatibility(self): def _compatibility(self):
if self.SQLITE_THREADING_MODE == "serialized": if self.SQLITE_THREADING_MODE == "serialized":
self._DB: sqlite3.Connection | None = None self._DB = None
else: else:
msg = ( msg = (
f"SQLite library is compiled with {self.SQLITE_THREADING_MODE} mode," f"SQLite library is compiled with {self.SQLITE_THREADING_MODE} mode,"
@@ -228,7 +229,13 @@ class SQLiteAppl(abc.ABC):
def _connect(self) -> sqlite3.Connection: def _connect(self) -> sqlite3.Connection:
conn = sqlite3.Connection(self.db_url, **self.SQLITE_CONNECT_ARGS) # type: ignore conn = sqlite3.Connection(self.db_url, **self.SQLITE_CONNECT_ARGS) # type: ignore
try:
with conn:
conn.execute(f"PRAGMA journal_mode={self.SQLITE_JOURNAL_MODE}") conn.execute(f"PRAGMA journal_mode={self.SQLITE_JOURNAL_MODE}")
except sqlite3.OperationalError:
# when database is locked, the journal_mode is already set by
# different but concurrent process (no need to set it once more)
pass
self.register_functions(conn) self.register_functions(conn)
return conn return conn
@@ -312,6 +319,7 @@ class SQLiteAppl(abc.ABC):
# Since more than one instance of SQLiteAppl share the same DB # Since more than one instance of SQLiteAppl share the same DB
# connection, we need to make sure that each SQLiteAppl instance has run # connection, we need to make sure that each SQLiteAppl instance has run
# its init method at least once. # its init method at least once.
with conn:
self.init(conn) self.init(conn)
return conn return conn
@@ -330,6 +338,7 @@ class SQLiteAppl(abc.ABC):
self._init_done = True self._init_done = True
logger.debug("init DB: %s", self.db_url) logger.debug("init DB: %s", self.db_url)
with conn:
self.properties.init(conn) self.properties.init(conn)
ver = self.properties("DB_SCHEMA") ver = self.properties("DB_SCHEMA")
@@ -409,7 +418,7 @@ CREATE TABLE IF NOT EXISTS properties (
self._init_done = True self._init_done = True
logger.debug("init properties of DB: %s", self.db_url) logger.debug("init properties of DB: %s", self.db_url)
res = conn.execute(self.SQL_TABLE_EXISTS) res = conn.execute(self.SQL_TABLE_EXISTS)
if res.fetchone() is None: # DB schema needs to be be created if res.fetchone() is None: # DB schema needs to be created
self.create_schema(conn) self.create_schema(conn)
return True return True
+14 -1
View File
@@ -1348,6 +1348,8 @@ def run():
def init(): def init():
# pylint: disable=import-outside-toplevel
if searx.sxng_debug or app.debug: if searx.sxng_debug or app.debug:
app.debug = True app.debug = True
searx.sxng_debug = True searx.sxng_debug = True
@@ -1358,6 +1360,18 @@ def init():
logger.error("server.secret_key is not changed. Please use something else instead of ultrasecretkey.") logger.error("server.secret_key is not changed. Please use something else instead of ultrasecretkey.")
sys.exit(1) sys.exit(1)
# init database schema first / DB schema is created with the first connect
from searx.data import get_cache
from searx.enginelib import ENGINES_CACHE
conn = get_cache().connect()
conn.close()
conn = ENGINES_CACHE.connect()
conn.close()
favicons.init()
# init application
locales_initialize() locales_initialize()
valkey_initialize() valkey_initialize()
searx.plugins.initialize(app) searx.plugins.initialize(app)
@@ -1366,7 +1380,6 @@ def init():
searx.search.initialize(check_network=True, enable_metrics=metrics) searx.search.initialize(check_network=True, enable_metrics=metrics)
limiter.initialize(app, settings) limiter.initialize(app, settings)
favicons.init()
def static_headers(headers: Headers, _path: str, _url: str) -> None: def static_headers(headers: Headers, _path: str, _url: str) -> None: