hermes-agent/plugins/web/ddgs/provider.py
uzunkuyruk 489b85ee1e fix(ddgs): bound DuckDuckGo search with a wall-clock timeout (#36776)
A single ddgs (DuckDuckGo) search could hang indefinitely and block the
shared agent loop — and therefore every platform (CLI, Telegram, Matrix...).
The DDGS constructor's timeout only bounds individual HTTP requests; ddgs's
multi-engine retry loop has no overall cap, so a slow/rate-limited response
could spin for 20+ minutes with no output and no error.

Run the synchronous ddgs call in a single-worker ThreadPoolExecutor and cap
it with future.result(timeout=_SEARCH_TIMEOUT_SECS=30). On timeout, return a
clear failure ("DuckDuckGo search timed out ... try a different provider")
instead of blocking; the pool is shut down with cancel_futures so a hung
worker is never awaited.

Salvaged from #37422 by @uzunkuyruk (authorship preserved). Re-applied on
current main (the PR's provider.py base had diverged). Added a load-bearing
timeout regression test (the original PR only updated the fake's constructor
and had no timeout-behavior test) — mutation-verified to fail without the cap.

Closes #36776.
2026-06-25 01:45:06 +05:30

158 lines
6.2 KiB
Python

"""DuckDuckGo search — plugin form (via the ``ddgs`` package).
Subclasses the plugin-facing :class:`agent.web_search_provider.WebSearchProvider`.
The legacy in-tree module ``tools.web_providers.ddgs`` was removed in the
same commit that moved this code under ``plugins/``; this file is now the
canonical implementation.
The ``ddgs`` package is an optional dependency. ``is_available()`` reflects
whether the package is importable; the plugin still registers either way so
``hermes tools`` can prompt the user to install it.
"""
from __future__ import annotations
import concurrent.futures as _cf
import logging
from typing import Any, Dict
from agent.web_search_provider import WebSearchProvider
logger = logging.getLogger(__name__)
# Overall wall-clock cap for a single ddgs search. The DDGS constructor's
# ``timeout`` only bounds individual HTTP requests; ddgs's multi-engine retry
# loop has no overall cap, so a slow/rate-limited DuckDuckGo response can hang
# the (single, shared) agent loop indefinitely and block every platform
# (#36776). Enforce a hard cap here via a worker thread.
_SEARCH_TIMEOUT_SECS = 30
def _run_ddgs_search(query: str, safe_limit: int) -> list[dict[str, Any]]:
    """Run the blocking ddgs query and return normalized hits.

    Module-level (not a closure) so tests can patch it directly without
    spawning a real multi-second worker thread. ``DDGS(timeout=...)`` bounds
    each individual HTTP request only; the caller is responsible for
    enforcing an overall wall-clock cap around this function.

    Args:
        query: Search string handed to ``DDGS.text``.
        safe_limit: Maximum number of hits to return. It is passed to ddgs as
            ``max_results`` and also enforced locally.

    Returns:
        One dict per hit with the keys ``title``, ``url``, ``description``
        and ``position`` (1-based rank in the result stream). Missing or
        ``None`` fields are normalized to an empty string.

    Raises:
        ImportError: If the optional ``ddgs`` package is not installed.
        Exception: Whatever ``ddgs`` raises is propagated unchanged.
    """
    from ddgs import DDGS  # type: ignore

    with DDGS(timeout=10) as client:
        hits = client.text(query, max_results=safe_limit)
        # ``range`` is listed first so ``zip`` stops after ``safe_limit`` hits
        # without pulling an extra item; this caps the output even if ddgs
        # ignores the ``max_results`` hint.
        ranked_hits = zip(range(1, safe_limit + 1), hits)
        # ``or ""`` (rather than a ``.get`` default) also covers keys that are
        # present but ``None``, which would otherwise render as "None".
        return [
            {
                "title": str(hit.get("title") or ""),
                "url": str(hit.get("href") or hit.get("url") or ""),
                "description": str(hit.get("body") or ""),
                "position": position,
            }
            for position, hit in ranked_hits
        ]
class DDGSWebSearchProvider(WebSearchProvider):
    """DuckDuckGo HTML-scrape search provider.

    No API key needed. Rate limits are enforced server-side by DuckDuckGo;
    the provider surfaces ``DuckDuckGoSearchException`` and other ddgs errors
    as ``{"success": False, "error": ...}`` rather than raising.
    """

    @property
    def name(self) -> str:
        """Return the provider identifier, ``"ddgs"``."""
        return "ddgs"

    @property
    def display_name(self) -> str:
        """Return the human-readable provider label."""
        return "DuckDuckGo (ddgs)"

    def is_available(self) -> bool:
        """Return True when the ``ddgs`` package is importable.

        Probes the import once; cheap because Python caches the import. Must
        NOT perform network I/O — runs at tool-registration time and on every
        ``hermes tools`` paint.
        """
        try:
            import ddgs  # noqa: F401
        except ImportError:
            return False
        return True

    def supports_search(self) -> bool:
        """Report that this provider implements search."""
        return True

    def supports_extract(self) -> bool:
        """Report that this provider does not implement extraction."""
        return False

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a DuckDuckGo search and return normalized results.

        The synchronous ``ddgs`` call runs on a dedicated daemon thread and is
        awaited for at most ``_SEARCH_TIMEOUT_SECS`` seconds, so a hung search
        cannot block the shared agent loop indefinitely (#36776) and cannot
        hold up interpreter exit either.

        Args:
            query: Search string passed through to ddgs.
            limit: Maximum number of hits wanted; values below 1 are raised
                to 1.

        Returns:
            ``{"success": True, "data": {"web": [...]}}`` on success, or
            ``{"success": False, "error": "<message>"}`` when ddgs is not
            installed, the search raises, or the wall-clock cap is hit.
        """
        import queue
        import threading

        try:
            import ddgs  # type: ignore  # noqa: F401 — availability probe
        except ImportError:
            return {
                "success": False,
                "error": "ddgs package is not installed — run `pip install ddgs`",
            }

        # DDGS().text yields at most `max_results` items; we cap defensively
        # in case the package ignores the hint.
        safe_limit = max(1, int(limit))

        # One-slot mailbox the worker uses to hand back either its result or
        # the exception it raised, tagged with a success flag.
        outbox: queue.Queue = queue.Queue(maxsize=1)

        def _worker() -> None:
            try:
                hits = _run_ddgs_search(query, safe_limit)
            except BaseException as exc:  # noqa: BLE001 — relayed to the caller
                outbox.put((False, exc))
            else:
                outbox.put((True, hits))

        # A fresh thread per call (rather than a shared pool) is intentional:
        # on timeout the blocking ddgs call cannot be cancelled and keeps
        # running, so a shared worker would serialise every later search
        # behind the hung one. The thread is a daemon because
        # ThreadPoolExecutor workers are joined at interpreter exit even after
        # ``shutdown(wait=False)``, which would let a hung search stall
        # process shutdown; the abandoned worker writes nothing shared, so
        # dropping it is safe.
        worker = threading.Thread(target=_worker, name="ddgs-search", daemon=True)
        try:
            worker.start()
            try:
                succeeded, payload = outbox.get(timeout=_SEARCH_TIMEOUT_SECS)
            except queue.Empty:
                logger.warning(
                    "DDGS search timed out after %ds for query: %r",
                    _SEARCH_TIMEOUT_SECS, query,
                )
                return {
                    "success": False,
                    "error": (
                        f"DuckDuckGo search timed out after {_SEARCH_TIMEOUT_SECS}s — "
                        "DuckDuckGo may be rate-limiting or slow. Try again later "
                        "or switch to a different search provider."
                    ),
                }
            if not succeeded:
                # Re-raised outside the ``queue.Empty`` handler so an exception
                # coming from ddgs itself is never mistaken for the wall-clock
                # timeout.
                raise payload
            web_results = payload
        except Exception as exc:  # noqa: BLE001 — ddgs raises its own exceptions
            logger.warning("DDGS search error: %s", exc)
            return {"success": False, "error": f"DuckDuckGo search failed: {exc}"}

        logger.info("DDGS search '%s': %d results (limit %d)", query, len(web_results), limit)
        return {"success": True, "data": {"web": web_results}}

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return the setup metadata for this provider's row in the picker."""
        return {
            "name": "DuckDuckGo (ddgs)",
            "badge": "free · no key · search only",
            "tag": "Search via the ddgs Python package — no API key (pair with any extract provider)",
            "env_vars": [],
            # Trigger `_run_post_setup("ddgs")` after the user picks this row
            # so the ddgs Python package gets pip-installed on first selection.
            "post_setup": "ddgs",
        }