refactor(web): remove legacy in-tree provider modules

Deletes tools/web_providers/{brave_free,ddgs,searxng}.py — the three
providers that moved to plugins/web/ in prior commits. tools/web_tools.py
no longer imports them (registry dispatch as of d8735963f), so removing
them is purely a cleanup pass.

Also migrates the existing tests to the new import paths:
  tests/tools/test_web_providers_brave_free.py
  tests/tools/test_web_providers_ddgs.py
  tests/tools/test_web_providers_searxng.py

Mechanical rewrites:
  - `from tools.web_providers.X import YSearchProvider`
      -> `from plugins.web.X.provider import YWebSearchProvider`
  - `.is_configured()` -> `.is_available()`        (legacy method  -> new method)
  - `.provider_name()` -> `.name`                  (legacy method  -> new property)
  - `from tools.web_providers.base import WebSearchProvider`
      -> `from agent.web_search_provider import WebSearchProvider`
      (the subclass-check asserts membership in the new plugin-facing ABC)
  - `sys.modules.delitem("tools.web_providers.ddgs")` updated to point at
    `plugins.web.ddgs.provider` (cache-busting for lazy ddgs imports)

The TestXBackendWiring / TestXSearchOnlyErrors classes (covering
_is_backend_available, _get_backend, check_web_api_key, and the
"search-only" error paths in web_extract/web_crawl) are untouched —
those still test web_tools.py's backend-selection logic, which continues
to recognize the names "brave-free" / "ddgs" / "searxng" even after the
modules behind them moved to plugins.

tools/web_providers/base.py is intentionally NOT deleted by this commit
— it's the parent ABC of the legacy modules and shares its name with
agent/web_search_provider.py::WebSearchProvider. Removing it surfaces the
naming collision (see PR description Finding 0); the real migration PR
deletes it in the same commit that drops the _WEB_PLUGIN_SKIPLIST
guards in hermes_cli/tools_config.py.

Test results:
  bash scripts/run_tests.sh tests/tools/test_web_providers_*.py
  -> 65 passed in 3.41s (all rewritten unit tests + unchanged integration tests)
  bash scripts/run_tests.sh tests/tools/test_web_*.py
  -> 141 passed in 4.70s (full web test set, post-deletion)
This commit is contained in:
kshitijk4poor 2026-05-13 23:52:02 +05:30 committed by Teknium
parent 714630110b
commit 6b219f5af6
9 changed files with 105 additions and 462 deletions

View file

@ -1,130 +0,0 @@
"""Brave Search web search provider (free tier).
Brave Search's Data-for-Search API offers a free tier (2,000 queries/mo at the
time of writing) after signing up at https://brave.com/search/api/. This
provider implements ``WebSearchProvider`` only — the Data-for-Search endpoint
returns search results; it does not extract/crawl arbitrary URLs.
Configuration::
# ~/.hermes/.env
BRAVE_SEARCH_API_KEY=your-subscription-token
# ~/.hermes/config.yaml
web:
search_backend: "brave-free"
extract_backend: "firecrawl" # pair with an extract provider if needed
The API uses the ``X-Subscription-Token`` header. Free-tier keys are rate
limited (1 qps) and capped at 2k queries/month; see the Brave dashboard for
current quotas.
"""
from __future__ import annotations
import logging
import os
from typing import Any, Dict
from tools.web_providers.base import WebSearchProvider
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Brave Search web-search URL that the provider's ``search`` sends GET requests to.
_BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
class BraveFreeSearchProvider(WebSearchProvider):
    """Search via the Brave Search API (free tier).

    Requires ``BRAVE_SEARCH_API_KEY`` to be set. The value is passed as the
    ``X-Subscription-Token`` header. No extract capability — pair with
    Firecrawl/Tavily/Exa/Parallel when you also need ``web_extract``.
    """

    def provider_name(self) -> str:
        """Return the backend identifier ``"brave-free"``."""
        return "brave-free"

    def is_configured(self) -> bool:
        """Return True when ``BRAVE_SEARCH_API_KEY`` is set to a non-empty value."""
        return bool(os.getenv("BRAVE_SEARCH_API_KEY", "").strip())

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a search against the Brave Search API.

        Args:
            query: Search terms, sent as the ``q`` request parameter.
            limit: Maximum number of results wanted. It is clamped to the
                range 1..20 and the clamped value is used both for the
                request's ``count`` parameter and for truncating the
                response, so the two can never disagree.

        Returns:
            Normalized results::

                {
                    "success": True,
                    "data": {
                        "web": [
                            {
                                "title": str,
                                "url": str,
                                "description": str,
                                "position": int,
                            },
                            ...
                        ]
                    }
                }

            On a missing API key, HTTP error, transport error, or
            unparseable body, returns ``{"success": False, "error": str}``.

        Raises:
            TypeError, ValueError: If ``limit`` cannot be converted with
                ``int()``.
        """
        # Function-scope import: httpx is only needed once a search runs.
        import httpx

        api_key = os.getenv("BRAVE_SEARCH_API_KEY", "").strip()
        if not api_key:
            return {"success": False, "error": "BRAVE_SEARCH_API_KEY is not set"}

        # Brave's `count` is capped at 20.
        count = max(1, min(int(limit), 20))

        try:
            resp = httpx.get(
                _BRAVE_ENDPOINT,
                params={"q": query, "count": count},
                headers={
                    "X-Subscription-Token": api_key,
                    "Accept": "application/json",
                },
                timeout=15,
            )
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            logger.warning("Brave Search HTTP error: %s", exc)
            return {
                "success": False,
                "error": f"Brave Search returned HTTP {exc.response.status_code}",
            }
        except httpx.RequestError as exc:
            logger.warning("Brave Search request error: %s", exc)
            return {"success": False, "error": f"Could not reach Brave Search: {exc}"}

        try:
            data = resp.json()
        except Exception as exc:  # noqa: BLE001
            logger.warning("Brave Search response parse error: %s", exc)
            return {"success": False, "error": "Could not parse Brave Search response as JSON"}

        # Either "web" or "results" may be absent or null; treat both as empty.
        raw_results = (data.get("web") or {}).get("results", []) or []

        # Truncate with the clamped `count`, not the raw `limit`: a zero or
        # negative `limit` would otherwise yield an empty or tail-dropped
        # slice even though `count` results were requested.
        truncated = raw_results[:count]
        web_results = [
            {
                "title": str(r.get("title", "")),
                "url": str(r.get("url", "")),
                "description": str(r.get("description", "")),
                "position": i + 1,
            }
            for i, r in enumerate(truncated)
        ]

        logger.info(
            "Brave Search '%s': %d results (from %d raw, limit %d)",
            query,
            len(web_results),
            len(raw_results),
            limit,
        )
        return {"success": True, "data": {"web": web_results}}

View file

@ -1,98 +0,0 @@
"""DuckDuckGo web search provider via the ``ddgs`` Python package.
DuckDuckGo does not provide an official programmatic search API. The
community-maintained `ddgs <https://pypi.org/project/ddgs/>`_ package (the
renamed successor of ``duckduckgo-search``) scrapes DuckDuckGo's HTML results
page and normalizes them. It implements ``WebSearchProvider`` only — there is
no extract capability.
Configuration::
# No API key required. Enable by installing the package and pointing the
# web backend at ddgs:
pip install ddgs
# ~/.hermes/config.yaml
web:
search_backend: "ddgs"
extract_backend: "firecrawl" # pair with an extract provider if needed
Rate limits are enforced server-side by DuckDuckGo. Expect intermittent
``DuckDuckGoSearchException`` / 202 responses under heavy use; this provider
surfaces them as ``{"success": False, "error": ...}`` rather than crashing
the tool call.
See https://duckduckgo.com/?q=duckduckgo+tos for terms of use.
"""
from __future__ import annotations
import logging
from typing import Any, Dict
from tools.web_providers.base import WebSearchProvider
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class DDGSSearchProvider(WebSearchProvider):
    """DuckDuckGo search provider built on the ``ddgs`` package.

    Needs no API key: the provider counts as "configured" as soon as the
    ``ddgs`` package can be imported — nothing else has to be set up.
    """

    def provider_name(self) -> str:
        """Return the backend identifier ``"ddgs"``."""
        return "ddgs"

    def is_configured(self) -> bool:
        """Report whether the ``ddgs`` package can be imported.

        Only attempts the import; it must not perform network I/O.
        """
        try:
            import ddgs  # noqa: F401
        except ImportError:
            return False
        return True

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Run a DuckDuckGo text search and normalize the hits.

        Args:
            query: Search terms handed to ``DDGS().text``.
            limit: Maximum number of results wanted; values below 1 are
                raised to 1.

        Returns:
            ``{"success": True, "data": {"web": [...]}}`` where each entry
            has ``title``, ``url``, ``description`` and a 1-based
            ``position``; or ``{"success": False, "error": str}`` when the
            package is missing or the search raises.
        """
        try:
            from ddgs import DDGS  # type: ignore
        except ImportError:
            return {
                "success": False,
                "error": "ddgs package is not installed — run `pip install ddgs`",
            }

        # ``max_results`` is only a hint to the package, so the loop below
        # enforces the same cap itself.
        cap = max(1, int(limit))
        collected = []

        try:
            with DDGS() as session:
                hits = session.text(query, max_results=cap)
                for position, entry in enumerate(hits, start=1):
                    if position > cap:
                        break
                    # Prefer "href", fall back to "url", else empty string.
                    link = entry.get("href") or entry.get("url") or ""
                    collected.append(
                        {
                            "title": str(entry.get("title", "")),
                            "url": str(link),
                            "description": str(entry.get("body", "")),
                            "position": position,
                        }
                    )
        except Exception as failure:  # noqa: BLE001 — ddgs raises its own exceptions
            logger.warning("DDGS search error: %s", failure)
            return {"success": False, "error": f"DuckDuckGo search failed: {failure}"}

        logger.info("DDGS search '%s': %d results (limit %d)", query, len(collected), limit)
        return {"success": True, "data": {"web": collected}}

View file

@ -1,132 +0,0 @@
"""SearXNG web search provider.
SearXNG is a free, self-hosted, privacy-respecting metasearch engine.
It implements ``WebSearchProvider`` only — there is no extract capability.
Configuration::
# ~/.hermes/.env
SEARXNG_URL=http://localhost:8080
# Use SearXNG for search, pair with any extract provider:
# ~/.hermes/config.yaml
web:
search_backend: "searxng"
extract_backend: "firecrawl"
Public SearXNG instances are listed at https://searx.space/ but self-hosting
is recommended for production use (rate limits and availability vary per
public instance).
"""
from __future__ import annotations
import logging
import os
from typing import Any, Dict
from tools.web_providers.base import WebSearchProvider
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class SearXNGSearchProvider(WebSearchProvider):
    """Search via a SearXNG instance.

    Requires ``SEARXNG_URL`` to be set (e.g. ``http://localhost:8080``).
    No API key needed — SearXNG is open-source and self-hosted.

    Uses the SearXNG JSON API (``/search?format=json``). Results are
    sorted by SearXNG's own score and truncated to *limit*.
    """

    def provider_name(self) -> str:
        """Return the backend identifier ``"searxng"``."""
        return "searxng"

    def is_configured(self) -> bool:
        """Return True when ``SEARXNG_URL`` is set to a non-empty value."""
        return bool(os.getenv("SEARXNG_URL", "").strip())

    @staticmethod
    def _score(result: Dict[str, Any]) -> float:
        """Return ``result["score"]`` as a float; 0.0 if missing, null, or non-numeric."""
        try:
            return float(result.get("score") or 0)
        except (TypeError, ValueError):
            return 0.0

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a search against the configured SearXNG instance.

        Args:
            query: Search terms, sent as the ``q`` request parameter.
            limit: Maximum number of results wanted; values below 1 are
                raised to 1.

        Returns:
            Normalized results::

                {
                    "success": True,
                    "data": {
                        "web": [
                            {
                                "title": str,
                                "url": str,
                                "description": str,
                                "position": int,
                            },
                            ...
                        ]
                    }
                }

            On a missing ``SEARXNG_URL``, HTTP error, transport error, or
            unparseable body, returns ``{"success": False, "error": str}``.

        Raises:
            TypeError, ValueError: If ``limit`` cannot be converted with
                ``int()``.
        """
        # Function-scope import: httpx is only needed once a search runs.
        import httpx

        base_url = os.getenv("SEARXNG_URL", "").strip().rstrip("/")
        if not base_url:
            return {"success": False, "error": "SEARXNG_URL is not set"}

        # Clamp so a zero or negative limit cannot turn the slice below
        # into an empty or tail-dropping one.
        safe_limit = max(1, int(limit))

        params: Dict[str, Any] = {
            "q": query,
            "format": "json",
            "pageno": 1,
        }

        try:
            resp = httpx.get(
                f"{base_url}/search",
                params=params,
                timeout=15,
                headers={"Accept": "application/json"},
            )
            resp.raise_for_status()
        except httpx.HTTPStatusError as exc:
            logger.warning("SearXNG HTTP error: %s", exc)
            return {"success": False, "error": f"SearXNG returned HTTP {exc.response.status_code}"}
        except httpx.RequestError as exc:
            logger.warning("SearXNG request error: %s", exc)
            return {"success": False, "error": f"Could not reach SearXNG at {base_url}: {exc}"}

        try:
            data = resp.json()
        except Exception as exc:  # noqa: BLE001
            logger.warning("SearXNG response parse error: %s", exc)
            return {"success": False, "error": "Could not parse SearXNG response as JSON"}

        # "results" may be absent or explicitly null; treat both as empty.
        raw_results = data.get("results") or []

        # SearXNG may return a score field; sort descending and cap to limit.
        # sorted() is stable, so results with equal scores keep their order.
        sorted_results = sorted(raw_results, key=self._score, reverse=True)[:safe_limit]

        web_results = [
            {
                "title": str(r.get("title", "")),
                "url": str(r.get("url", "")),
                "description": str(r.get("content", "")),
                "position": i + 1,
            }
            for i, r in enumerate(sorted_results)
        ]

        logger.info(
            "SearXNG search '%s': %d results (from %d raw, limit %d)",
            query,
            len(web_results),
            len(raw_results),
            limit,
        )
        return {"success": True, "data": {"web": web_results}}