From cd2cbc73b7c56f0c19f41a6bb21808239078653c Mon Sep 17 00:00:00 2001 From: kshitij <82637225+kshitijk4poor@users.noreply.github.com> Date: Wed, 6 May 2026 09:16:25 -0700 Subject: [PATCH] refactor(web): per-capability backend selection for search/extract split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the foundation for independently selecting web search and extract backends — enabling future combinations like SearXNG for search + Firecrawl for extract. Architecture: - tools/web_providers/base.py: WebSearchProvider and WebExtractProvider ABCs with normalized result contracts (mirrors CloudBrowserProvider) - tools/web_tools.py: _get_search_backend() and _get_extract_backend() read per-capability config keys, fall through to shared web.backend - hermes_cli/config.py: web.search_backend and web.extract_backend in DEFAULT_CONFIG (empty = inherit from web.backend) Behavioral change: - web_search_tool() now dispatches via _get_search_backend() - web_extract_tool() now dispatches via _get_extract_backend() - When per-capability keys are empty (default), behavior is identical to before — _get_search_backend() falls through to _get_backend() This is purely structural — no new backends are added. SearXNG and other search-only/extract-only providers can now be added as simple drop-in modules in follow-up PRs. 12 new tests, 49 existing tests pass with zero regressions. Ref: #19198 --- hermes_cli/config.py | 8 +- tests/tools/test_web_providers.py | 194 ++++++++++++++++++++++++++++ tools/web_providers/ARCHITECTURE.md | 73 +++++++++++ tools/web_providers/__init__.py | 6 + tools/web_providers/base.py | 89 +++++++++++++ tools/web_tools.py | 46 ++++++- 6 files changed, 411 insertions(+), 5 deletions(-) create mode 100644 tests/tools/test_web_providers.py create mode 100644 tools/web_providers/ARCHITECTURE.md create mode 100644 tools/web_providers/__init__.py create mode 100644 tools/web_providers/base.py diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 571381f4e3..76bb3f07af 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -544,7 +544,13 @@ DEFAULT_CONFIG = { # via TERMINAL_LOCAL_PERSISTENT env var. "persistent_shell": True, }, - + + "web": { + "backend": "", # shared fallback — applies to both search and extract + "search_backend": "", # per-capability override for web_search (e.g. "searxng") + "extract_backend": "", # per-capability override for web_extract (e.g. "native") + }, + "browser": { "inactivity_timeout": 120, "command_timeout": 30, # Timeout for browser commands in seconds (screenshot, navigate, etc.) diff --git a/tests/tools/test_web_providers.py b/tests/tools/test_web_providers.py new file mode 100644 index 0000000000..3c0abb307b --- /dev/null +++ b/tests/tools/test_web_providers.py @@ -0,0 +1,194 @@ +"""Tests for the web tools provider architecture. + +Covers: +- WebSearchProvider / WebExtractProvider ABC enforcement +- Per-capability backend selection (_get_search_backend, _get_extract_backend) +- Backward compatibility (web.backend still works as shared fallback) +- Config keys merge correctly via DEFAULT_CONFIG +""" +from __future__ import annotations + +import json +from typing import Any, Dict, List + +import pytest + + +# --------------------------------------------------------------------------- +# ABC enforcement +# --------------------------------------------------------------------------- + + +class TestWebProviderABCs: + """The ABCs enforce the interface contract.""" + + def test_cannot_instantiate_search_provider(self): + from tools.web_providers.base import WebSearchProvider + + with pytest.raises(TypeError): + WebSearchProvider() # type: ignore[abstract] + + def test_cannot_instantiate_extract_provider(self): + from tools.web_providers.base import WebExtractProvider + + with pytest.raises(TypeError): + WebExtractProvider() # type: ignore[abstract] + + def test_concrete_search_provider_works(self): + from tools.web_providers.base import WebSearchProvider + + class Dummy(WebSearchProvider): + def provider_name(self) -> str: + return "dummy" + def is_configured(self) -> bool: + return True + def search(self, query: str, limit: int = 5) -> Dict[str, Any]: + return {"success": True, "data": {"web": []}} + + d = Dummy() + assert d.provider_name() == "dummy" + assert d.is_configured() is True + assert d.search("test")["success"] is True + + def test_concrete_extract_provider_works(self): + from tools.web_providers.base import WebExtractProvider + + class Dummy(WebExtractProvider): + def provider_name(self) -> str: + return "dummy" + def is_configured(self) -> bool: + return True + def extract(self, urls: List[str], **kwargs) -> Dict[str, Any]: + return {"success": True, "data": [{"url": urls[0], "content": "x"}]} + + d = Dummy() + assert d.provider_name() == "dummy" + assert d.extract(["https://example.com"])["success"] is True + + +# --------------------------------------------------------------------------- +# Per-capability backend selection +# --------------------------------------------------------------------------- + + +class TestPerCapabilityBackendSelection: + """_get_search_backend and _get_extract_backend read per-capability config.""" + + def test_search_backend_overrides_generic(self, monkeypatch): + from tools import web_tools + + monkeypatch.setattr(web_tools, "_load_web_config", lambda: { + "backend": "firecrawl", + "search_backend": "tavily", + }) + monkeypatch.setenv("TAVILY_API_KEY", "test-key") + assert web_tools._get_search_backend() == "tavily" + + def test_extract_backend_overrides_generic(self, monkeypatch): + from tools import web_tools + + monkeypatch.setattr(web_tools, "_load_web_config", lambda: { + "backend": "tavily", + "extract_backend": "exa", + }) + monkeypatch.setenv("EXA_API_KEY", "test-key") + assert web_tools._get_extract_backend() == "exa" + + def test_falls_back_to_generic_backend_when_search_backend_empty(self, monkeypatch): + from tools import web_tools + + monkeypatch.setattr(web_tools, "_load_web_config", lambda: { + "backend": "tavily", + "search_backend": "", + }) + monkeypatch.setenv("TAVILY_API_KEY", "test-key") + assert web_tools._get_search_backend() == "tavily" + + def test_falls_back_to_generic_backend_when_extract_backend_empty(self, monkeypatch): + from tools import web_tools + + monkeypatch.setattr(web_tools, "_load_web_config", lambda: { + "backend": "parallel", + "extract_backend": "", + }) + monkeypatch.setenv("PARALLEL_API_KEY", "test-key") + assert web_tools._get_extract_backend() == "parallel" + + def test_search_backend_ignored_when_not_available(self, monkeypatch): + from tools import web_tools + + monkeypatch.setattr(web_tools, "_load_web_config", lambda: { + "backend": "firecrawl", + "search_backend": "exa", # set but no EXA_API_KEY + }) + monkeypatch.delenv("EXA_API_KEY", raising=False) + monkeypatch.setenv("FIRECRAWL_API_KEY", "fc-key") + # Should fall back to firecrawl since exa isn't configured + assert web_tools._get_search_backend() == "firecrawl" + + def test_fully_backward_compatible_with_web_backend_only(self, monkeypatch): + from tools import web_tools + + monkeypatch.setattr(web_tools, "_load_web_config", lambda: { + "backend": "tavily", + }) + monkeypatch.setenv("TAVILY_API_KEY", "test-key") + # No search_backend or extract_backend set — both fall through + assert web_tools._get_search_backend() == "tavily" + assert web_tools._get_extract_backend() == "tavily" + + +# --------------------------------------------------------------------------- +# Config key presence in DEFAULT_CONFIG +# --------------------------------------------------------------------------- + + +class TestDefaultConfig: + """The web section exists in DEFAULT_CONFIG with per-capability keys.""" + + def test_web_section_in_default_config(self): + from hermes_cli.config import DEFAULT_CONFIG + + assert "web" in DEFAULT_CONFIG + web = DEFAULT_CONFIG["web"] + assert "backend" in web + assert "search_backend" in web + assert "extract_backend" in web + # All empty string by default (no override) + assert web["backend"] == "" + assert web["search_backend"] == "" + assert web["extract_backend"] == "" + + +# --------------------------------------------------------------------------- +# web_search_tool uses _get_search_backend +# --------------------------------------------------------------------------- + + +class TestWebSearchUsesSearchBackend: + """web_search_tool dispatches through _get_search_backend not _get_backend.""" + + def test_search_tool_calls_search_backend(self, monkeypatch): + from tools import web_tools + + called_with = [] + original_get_search = web_tools._get_search_backend + + def tracking_get_search(): + result = original_get_search() + called_with.append(("search", result)) + return result + + monkeypatch.setattr(web_tools, "_get_search_backend", tracking_get_search) + monkeypatch.setattr(web_tools, "_load_web_config", lambda: {"backend": "firecrawl"}) + monkeypatch.setenv("FIRECRAWL_API_KEY", "fake") + + # The function will fail at Firecrawl client level but we just + # need to verify _get_search_backend was called + try: + web_tools.web_search_tool("test", 1) + except Exception: + pass + + assert len(called_with) > 0 + assert called_with[0][0] == "search" diff --git a/tools/web_providers/ARCHITECTURE.md b/tools/web_providers/ARCHITECTURE.md new file mode 100644 index 0000000000..f4a7b335e8 --- /dev/null +++ b/tools/web_providers/ARCHITECTURE.md @@ -0,0 +1,73 @@ +# Web Tools Provider Architecture + +## Overview + +Web tools (`web_search`, `web_extract`) use a **per-capability backend selection** system that allows different providers for search and extract independently. + +## Config Keys + +```yaml +web: + backend: "firecrawl" # Shared fallback — applies to both if specific keys not set + search_backend: "" # Per-capability override for web_search + extract_backend: "" # Per-capability override for web_extract +``` + +**Selection priority (per capability):** +1. `web.search_backend` / `web.extract_backend` (explicit per-capability) +2. `web.backend` (shared fallback) +3. Auto-detect from environment variables + +When per-capability keys are empty (default), behavior is identical to the legacy single-backend selection. + +## Architecture + +``` +web_search_tool() + └─ _get_search_backend() + ├─ web.search_backend (if set + available) + └─ _get_backend() fallback + +web_extract_tool() + └─ _get_extract_backend() + ├─ web.extract_backend (if set + available) + └─ _get_backend() fallback +``` + +## Provider ABCs + +New providers implement these interfaces in `tools/web_providers/`: + +```python +from tools.web_providers.base import WebSearchProvider, WebExtractProvider + +class MySearchProvider(WebSearchProvider): + def provider_name(self) -> str: ... + def is_configured(self) -> bool: ... + def search(self, query: str, limit: int = 5) -> Dict[str, Any]: ... + +class MyExtractProvider(WebExtractProvider): + def provider_name(self) -> str: ... + def is_configured(self) -> bool: ... + def extract(self, urls: List[str], **kwargs) -> Dict[str, Any]: ... +``` + +## Adding a New Search Provider + +1. Create `tools/web_providers/your_provider.py` implementing `WebSearchProvider` +2. Add availability check to `_is_backend_available()` in `web_tools.py` +3. Add dispatch branch in `web_search_tool()` +4. Add provider to `hermes tools` picker in `tools_config.py` +5. Add env var to `OPTIONAL_ENV_VARS` in `config.py` (if needed) +6. Write tests in `tests/tools/` + +Search-only providers (like SearXNG) don't need to implement `WebExtractProvider`. +Extract-only providers don't need to implement `WebSearchProvider`. + +## hermes tools UX + +The provider picker uses **progressive disclosure**: +- **Default path** (90% of users): Pick one provider → sets `web.backend` for both. One selection, done. +- **Advanced path**: "Configure separately" option at bottom → two-step sub-picker for search + extract independently. + +See `.hermes/plans/2026-05-03-web-tools-provider-architecture.md` for the full UX flow diagram. diff --git a/tools/web_providers/__init__.py b/tools/web_providers/__init__.py new file mode 100644 index 0000000000..15134175d2 --- /dev/null +++ b/tools/web_providers/__init__.py @@ -0,0 +1,6 @@ +"""Web capability providers — search, extract, crawl. + +Each capability has an ABC in ``base.py`` and vendor implementations in +sibling modules. Provider registries in ``web_tools.py`` map config names +to provider classes. +""" diff --git a/tools/web_providers/base.py b/tools/web_providers/base.py new file mode 100644 index 0000000000..2177218919 --- /dev/null +++ b/tools/web_providers/base.py @@ -0,0 +1,89 @@ +"""Abstract base classes for web capability providers.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, List + + +class WebSearchProvider(ABC): + """Interface for web search backends (Firecrawl, Tavily, Exa, etc.). + + Implementations live in sibling modules. The user selects a provider + via ``hermes tools``; the choice is persisted as + ``config["web"]["search_backend"]`` (falling back to + ``config["web"]["backend"]``). + + Search providers return results in a normalized format:: + + { + "success": True, + "data": { + "web": [ + {"title": str, "url": str, "description": str, "position": int}, + ... + ] + } + } + + On failure:: + + {"success": False, "error": str} + """ + + @abstractmethod + def provider_name(self) -> str: + """Short, human-readable name shown in logs and diagnostics.""" + + @abstractmethod + def is_configured(self) -> bool: + """Return True when all required env vars / credentials are present. + + Called at tool-registration time to gate availability. + Must be cheap — no network calls. + """ + + @abstractmethod + def search(self, query: str, limit: int = 5) -> Dict[str, Any]: + """Execute a web search and return normalized results.""" + + +class WebExtractProvider(ABC): + """Interface for web content extraction backends. + + Implementations live in sibling modules. The user selects a provider + via ``hermes tools``; the choice is persisted as + ``config["web"]["extract_backend"]`` (falling back to + ``config["web"]["backend"]``). + + Extract providers return results in a normalized format:: + + { + "success": True, + "data": [ + {"url": str, "title": str, "content": str, + "raw_content": str, "metadata": dict}, + ... + ] + } + + On failure:: + + {"success": False, "error": str} + """ + + @abstractmethod + def provider_name(self) -> str: + """Short, human-readable name shown in logs and diagnostics.""" + + @abstractmethod + def is_configured(self) -> bool: + """Return True when all required env vars / credentials are present. + + Called at tool-registration time to gate availability. + Must be cheap — no network calls. + """ + + @abstractmethod + def extract(self, urls: List[str], **kwargs) -> Dict[str, Any]: + """Extract content from the given URLs and return normalized results.""" diff --git a/tools/web_tools.py b/tools/web_tools.py index e24ace2f87..b5eb111685 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -119,7 +119,7 @@ def _load_web_config() -> dict: return {} def _get_backend() -> str: - """Determine which web backend to use. + """Determine which web backend to use (shared fallback). Reads ``web.backend`` from config.yaml (set by ``hermes tools``). Falls back to whichever API key is present for users who configured @@ -145,6 +145,44 @@ def _get_backend() -> str: return "firecrawl" # default (backward compat) +def _get_search_backend() -> str: + """Determine which backend to use for web_search specifically. + + Selection priority: + 1. ``web.search_backend`` (per-capability override) + 2. ``web.backend`` (shared fallback — existing behavior) + 3. Auto-detect from env vars + + This enables using different providers for search vs extract + (e.g. SearXNG for search + Firecrawl for extract). + """ + return _get_capability_backend("search") + + +def _get_extract_backend() -> str: + """Determine which backend to use for web_extract specifically. + + Selection priority: + 1. ``web.extract_backend`` (per-capability override) + 2. ``web.backend`` (shared fallback — existing behavior) + 3. Auto-detect from env vars + """ + return _get_capability_backend("extract") + + +def _get_capability_backend(capability: str) -> str: + """Shared helper for per-capability backend selection. + + Reads ``web.{capability}_backend`` from config; if set and available, + uses it. Otherwise falls through to the shared ``_get_backend()``. + """ + cfg = _load_web_config() + specific = (cfg.get(f"{capability}_backend") or "").lower().strip() + if specific and _is_backend_available(specific): + return specific + return _get_backend() + + def _is_backend_available(backend: str) -> bool: """Return True when the selected backend is currently usable.""" if backend == "exa": @@ -1129,8 +1167,8 @@ def web_search_tool(query: str, limit: int = 5) -> str: if is_interrupted(): return tool_error("Interrupted", success=False) - # Dispatch to the configured backend - backend = _get_backend() + # Dispatch to the configured search backend + backend = _get_search_backend() if backend == "parallel": response_data = _parallel_search(query, limit) debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", [])) @@ -1286,7 +1324,7 @@ async def web_extract_tool( if not safe_urls: results = [] else: - backend = _get_backend() + backend = _get_extract_backend() if backend == "parallel": results = await _parallel_extract(safe_urls)