Mirror of https://github.com/NousResearch/hermes-agent.git (synced 2026-05-08 03:01:47 +00:00)
refactor(web): per-capability backend selection for search/extract split
Introduce the foundation for independently selecting web search and extract backends — enabling future combinations like SearXNG for search + Firecrawl for extract.

Architecture:
- tools/web_providers/base.py: WebSearchProvider and WebExtractProvider ABCs with normalized result contracts (mirrors CloudBrowserProvider)
- tools/web_tools.py: _get_search_backend() and _get_extract_backend() read per-capability config keys, fall through to shared web.backend
- hermes_cli/config.py: web.search_backend and web.extract_backend in DEFAULT_CONFIG (empty = inherit from web.backend)

Behavioral change:
- web_search_tool() now dispatches via _get_search_backend()
- web_extract_tool() now dispatches via _get_extract_backend()
- When per-capability keys are empty (default), behavior is identical to before — _get_search_backend() falls through to _get_backend()

This is purely structural — no new backends are added. SearXNG and other search-only/extract-only providers can now be added as simple drop-in modules in follow-up PRs.

12 new tests, 49 existing tests pass with zero regressions.

Ref: #19198
This commit is contained in: parent 6388aafbd6 · commit cd2cbc73b7
6 changed files with 411 additions and 5 deletions
hermes_cli/config.py:

```diff
@@ -544,7 +544,13 @@ DEFAULT_CONFIG = {
         # via TERMINAL_LOCAL_PERSISTENT env var.
         "persistent_shell": True,
     },
+
+    "web": {
+        "backend": "",  # shared fallback — applies to both search and extract
+        "search_backend": "",  # per-capability override for web_search (e.g. "searxng")
+        "extract_backend": "",  # per-capability override for web_extract (e.g. "native")
+    },
 
     "browser": {
         "inactivity_timeout": 120,
         "command_timeout": 30,  # Timeout for browser commands in seconds (screenshot, navigate, etc.)
```
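For illustration, a user config exercising the new keys might look like the sketch below. This is a hypothetical config.yaml, not part of this commit; "searxng" is only the planned follow-up backend named in the commit message.

```yaml
# Hypothetical user config.yaml (illustrative only; "searxng" is a
# planned follow-up backend, not one this commit adds).
web:
  backend: "firecrawl"        # shared fallback: web_extract keeps using Firecrawl
  search_backend: "searxng"   # per-capability override for web_search only
  extract_backend: ""         # empty: inherits web.backend
```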
tests/tools/test_web_providers.py · 194 lines · Normal file
```python
"""Tests for the web tools provider architecture.

Covers:
- WebSearchProvider / WebExtractProvider ABC enforcement
- Per-capability backend selection (_get_search_backend, _get_extract_backend)
- Backward compatibility (web.backend still works as shared fallback)
- Config keys merge correctly via DEFAULT_CONFIG
"""
from __future__ import annotations

import json
from typing import Any, Dict, List

import pytest


# ---------------------------------------------------------------------------
# ABC enforcement
# ---------------------------------------------------------------------------


class TestWebProviderABCs:
    """The ABCs enforce the interface contract."""

    def test_cannot_instantiate_search_provider(self):
        from tools.web_providers.base import WebSearchProvider

        with pytest.raises(TypeError):
            WebSearchProvider()  # type: ignore[abstract]

    def test_cannot_instantiate_extract_provider(self):
        from tools.web_providers.base import WebExtractProvider

        with pytest.raises(TypeError):
            WebExtractProvider()  # type: ignore[abstract]

    def test_concrete_search_provider_works(self):
        from tools.web_providers.base import WebSearchProvider

        class Dummy(WebSearchProvider):
            def provider_name(self) -> str:
                return "dummy"

            def is_configured(self) -> bool:
                return True

            def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
                return {"success": True, "data": {"web": []}}

        d = Dummy()
        assert d.provider_name() == "dummy"
        assert d.is_configured() is True
        assert d.search("test")["success"] is True

    def test_concrete_extract_provider_works(self):
        from tools.web_providers.base import WebExtractProvider

        class Dummy(WebExtractProvider):
            def provider_name(self) -> str:
                return "dummy"

            def is_configured(self) -> bool:
                return True

            def extract(self, urls: List[str], **kwargs) -> Dict[str, Any]:
                return {"success": True, "data": [{"url": urls[0], "content": "x"}]}

        d = Dummy()
        assert d.provider_name() == "dummy"
        assert d.extract(["https://example.com"])["success"] is True


# ---------------------------------------------------------------------------
# Per-capability backend selection
# ---------------------------------------------------------------------------


class TestPerCapabilityBackendSelection:
    """_get_search_backend and _get_extract_backend read per-capability config."""

    def test_search_backend_overrides_generic(self, monkeypatch):
        from tools import web_tools

        monkeypatch.setattr(web_tools, "_load_web_config", lambda: {
            "backend": "firecrawl",
            "search_backend": "tavily",
        })
        monkeypatch.setenv("TAVILY_API_KEY", "test-key")
        assert web_tools._get_search_backend() == "tavily"

    def test_extract_backend_overrides_generic(self, monkeypatch):
        from tools import web_tools

        monkeypatch.setattr(web_tools, "_load_web_config", lambda: {
            "backend": "tavily",
            "extract_backend": "exa",
        })
        monkeypatch.setenv("EXA_API_KEY", "test-key")
        assert web_tools._get_extract_backend() == "exa"

    def test_falls_back_to_generic_backend_when_search_backend_empty(self, monkeypatch):
        from tools import web_tools

        monkeypatch.setattr(web_tools, "_load_web_config", lambda: {
            "backend": "tavily",
            "search_backend": "",
        })
        monkeypatch.setenv("TAVILY_API_KEY", "test-key")
        assert web_tools._get_search_backend() == "tavily"

    def test_falls_back_to_generic_backend_when_extract_backend_empty(self, monkeypatch):
        from tools import web_tools

        monkeypatch.setattr(web_tools, "_load_web_config", lambda: {
            "backend": "parallel",
            "extract_backend": "",
        })
        monkeypatch.setenv("PARALLEL_API_KEY", "test-key")
        assert web_tools._get_extract_backend() == "parallel"

    def test_search_backend_ignored_when_not_available(self, monkeypatch):
        from tools import web_tools

        monkeypatch.setattr(web_tools, "_load_web_config", lambda: {
            "backend": "firecrawl",
            "search_backend": "exa",  # set but no EXA_API_KEY
        })
        monkeypatch.delenv("EXA_API_KEY", raising=False)
        monkeypatch.setenv("FIRECRAWL_API_KEY", "fc-key")
        # Should fall back to firecrawl since exa isn't configured
        assert web_tools._get_search_backend() == "firecrawl"

    def test_fully_backward_compatible_with_web_backend_only(self, monkeypatch):
        from tools import web_tools

        monkeypatch.setattr(web_tools, "_load_web_config", lambda: {
            "backend": "tavily",
        })
        monkeypatch.setenv("TAVILY_API_KEY", "test-key")
        # No search_backend or extract_backend set — both fall through
        assert web_tools._get_search_backend() == "tavily"
        assert web_tools._get_extract_backend() == "tavily"


# ---------------------------------------------------------------------------
# Config key presence in DEFAULT_CONFIG
# ---------------------------------------------------------------------------


class TestDefaultConfig:
    """The web section exists in DEFAULT_CONFIG with per-capability keys."""

    def test_web_section_in_default_config(self):
        from hermes_cli.config import DEFAULT_CONFIG

        assert "web" in DEFAULT_CONFIG
        web = DEFAULT_CONFIG["web"]
        assert "backend" in web
        assert "search_backend" in web
        assert "extract_backend" in web
        # All empty string by default (no override)
        assert web["backend"] == ""
        assert web["search_backend"] == ""
        assert web["extract_backend"] == ""


# ---------------------------------------------------------------------------
# web_search_tool uses _get_search_backend
# ---------------------------------------------------------------------------


class TestWebSearchUsesSearchBackend:
    """web_search_tool dispatches through _get_search_backend not _get_backend."""

    def test_search_tool_calls_search_backend(self, monkeypatch):
        from tools import web_tools

        called_with = []
        original_get_search = web_tools._get_search_backend

        def tracking_get_search():
            result = original_get_search()
            called_with.append(("search", result))
            return result

        monkeypatch.setattr(web_tools, "_get_search_backend", tracking_get_search)
        monkeypatch.setattr(web_tools, "_load_web_config", lambda: {"backend": "firecrawl"})
        monkeypatch.setenv("FIRECRAWL_API_KEY", "fake")

        # The function will fail at Firecrawl client level but we just
        # need to verify _get_search_backend was called
        try:
            web_tools.web_search_tool("test", 1)
        except Exception:
            pass

        assert len(called_with) > 0
        assert called_with[0][0] == "search"
```
tools/web_providers/ARCHITECTURE.md · 73 lines · Normal file
# Web Tools Provider Architecture

## Overview

Web tools (`web_search`, `web_extract`) use a **per-capability backend selection** system that allows different providers for search and extract independently.

## Config Keys

```yaml
web:
  backend: "firecrawl"   # Shared fallback — applies to both if specific keys not set
  search_backend: ""     # Per-capability override for web_search
  extract_backend: ""    # Per-capability override for web_extract
```

**Selection priority (per capability):**

1. `web.search_backend` / `web.extract_backend` (explicit per-capability)
2. `web.backend` (shared fallback)
3. Auto-detect from environment variables

When per-capability keys are empty (default), behavior is identical to the legacy single-backend selection.

## Architecture

```
web_search_tool()
  └─ _get_search_backend()
        ├─ web.search_backend (if set + available)
        └─ _get_backend() fallback

web_extract_tool()
  └─ _get_extract_backend()
        ├─ web.extract_backend (if set + available)
        └─ _get_backend() fallback
```

## Provider ABCs

New providers implement these interfaces in `tools/web_providers/`:

```python
from tools.web_providers.base import WebSearchProvider, WebExtractProvider

class MySearchProvider(WebSearchProvider):
    def provider_name(self) -> str: ...
    def is_configured(self) -> bool: ...
    def search(self, query: str, limit: int = 5) -> Dict[str, Any]: ...

class MyExtractProvider(WebExtractProvider):
    def provider_name(self) -> str: ...
    def is_configured(self) -> bool: ...
    def extract(self, urls: List[str], **kwargs) -> Dict[str, Any]: ...
```

## Adding a New Search Provider

1. Create `tools/web_providers/your_provider.py` implementing `WebSearchProvider`
2. Add availability check to `_is_backend_available()` in `web_tools.py`
3. Add dispatch branch in `web_search_tool()`
4. Add provider to `hermes tools` picker in `tools_config.py`
5. Add env var to `OPTIONAL_ENV_VARS` in `config.py` (if needed)
6. Write tests in `tests/tools/`

Search-only providers (like SearXNG) don't need to implement `WebExtractProvider`.
Extract-only providers don't need to implement `WebSearchProvider`.

## hermes tools UX

The provider picker uses **progressive disclosure**:

- **Default path** (90% of users): Pick one provider → sets `web.backend` for both. One selection, done.
- **Advanced path**: "Configure separately" option at bottom → two-step sub-picker for search + extract independently.

See `.hermes/plans/2026-05-03-web-tools-provider-architecture.md` for the full UX flow diagram.
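As a concrete illustration of the "Adding a New Search Provider" steps above, here is a minimal sketch of a search-only SearXNG provider. None of this is part of the commit: the `SearxngSearchProvider` name, the `SEARXNG_BASE_URL` env var, and the exact SearXNG query parameters are assumptions for illustration.

```python
# Hypothetical sketch — not part of this commit. Assumes a SearXNG
# instance with its JSON output format enabled, reachable via the
# (illustrative) SEARXNG_BASE_URL env var.
from __future__ import annotations

import os
from typing import Any, Dict

import requests

from tools.web_providers.base import WebSearchProvider


class SearxngSearchProvider(WebSearchProvider):
    """Search-only provider backed by a self-hosted SearXNG instance."""

    def provider_name(self) -> str:
        return "searxng"

    def is_configured(self) -> bool:
        # Cheap env-var check only — no network calls, per the ABC contract.
        return bool(os.environ.get("SEARXNG_BASE_URL"))

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        base = os.environ["SEARXNG_BASE_URL"].rstrip("/")
        try:
            resp = requests.get(
                f"{base}/search",
                params={"q": query, "format": "json"},
                timeout=15,
            )
            resp.raise_for_status()
        except requests.RequestException as exc:
            return {"success": False, "error": str(exc)}
        # Map SearXNG's result fields onto the normalized search contract.
        web = [
            {
                "title": r.get("title", ""),
                "url": r.get("url", ""),
                "description": r.get("content", ""),
                "position": i + 1,
            }
            for i, r in enumerate(resp.json().get("results", [])[:limit])
        ]
        return {"success": True, "data": {"web": web}}
```

Because it implements only `WebSearchProvider`, `_get_extract_backend()` would keep falling through to `web.backend` for extraction, exactly as the fall-through design intends.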
tools/web_providers/__init__.py · 6 lines · Normal file
```python
"""Web capability providers — search, extract, crawl.

Each capability has an ABC in ``base.py`` and vendor implementations in
sibling modules. Provider registries in ``web_tools.py`` map config names
to provider classes.
"""
```
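The docstring mentions provider registries in `web_tools.py` that map config names to provider classes; the commit itself adds no registries or concrete providers. A minimal sketch of that shape (the `SEARCH_PROVIDERS`/`EXTRACT_PROVIDERS` names are assumptions, not code from this commit):

```python
# Hypothetical registry shape; names are illustrative only.
from typing import Dict, Type

from tools.web_providers.base import WebExtractProvider, WebSearchProvider

# Map config backend names (e.g. the value of web.search_backend) to
# provider classes, so dispatch can stay a simple dict lookup.
SEARCH_PROVIDERS: Dict[str, Type[WebSearchProvider]] = {
    # "searxng": SearxngSearchProvider,  # drop-in module in a follow-up PR
}
EXTRACT_PROVIDERS: Dict[str, Type[WebExtractProvider]] = {
    # "native": NativeExtractProvider,   # drop-in module in a follow-up PR
}
```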
tools/web_providers/base.py · 89 lines · Normal file
```python
"""Abstract base classes for web capability providers."""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any, Dict, List


class WebSearchProvider(ABC):
    """Interface for web search backends (Firecrawl, Tavily, Exa, etc.).

    Implementations live in sibling modules. The user selects a provider
    via ``hermes tools``; the choice is persisted as
    ``config["web"]["search_backend"]`` (falling back to
    ``config["web"]["backend"]``).

    Search providers return results in a normalized format::

        {
            "success": True,
            "data": {
                "web": [
                    {"title": str, "url": str, "description": str, "position": int},
                    ...
                ]
            }
        }

    On failure::

        {"success": False, "error": str}
    """

    @abstractmethod
    def provider_name(self) -> str:
        """Short, human-readable name shown in logs and diagnostics."""

    @abstractmethod
    def is_configured(self) -> bool:
        """Return True when all required env vars / credentials are present.

        Called at tool-registration time to gate availability.
        Must be cheap — no network calls.
        """

    @abstractmethod
    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a web search and return normalized results."""


class WebExtractProvider(ABC):
    """Interface for web content extraction backends.

    Implementations live in sibling modules. The user selects a provider
    via ``hermes tools``; the choice is persisted as
    ``config["web"]["extract_backend"]`` (falling back to
    ``config["web"]["backend"]``).

    Extract providers return results in a normalized format::

        {
            "success": True,
            "data": [
                {"url": str, "title": str, "content": str,
                 "raw_content": str, "metadata": dict},
                ...
            ]
        }

    On failure::

        {"success": False, "error": str}
    """

    @abstractmethod
    def provider_name(self) -> str:
        """Short, human-readable name shown in logs and diagnostics."""

    @abstractmethod
    def is_configured(self) -> bool:
        """Return True when all required env vars / credentials are present.

        Called at tool-registration time to gate availability.
        Must be cheap — no network calls.
        """

    @abstractmethod
    def extract(self, urls: List[str], **kwargs) -> Dict[str, Any]:
        """Extract content from the given URLs and return normalized results."""
```
tools/web_tools.py:

```diff
@@ -119,7 +119,7 @@ def _load_web_config() -> dict:
         return {}
 
 def _get_backend() -> str:
-    """Determine which web backend to use.
+    """Determine which web backend to use (shared fallback).
 
     Reads ``web.backend`` from config.yaml (set by ``hermes tools``).
     Falls back to whichever API key is present for users who configured
@@ -145,6 +145,44 @@ def _get_backend() -> str:
     return "firecrawl"  # default (backward compat)
 
 
+def _get_search_backend() -> str:
+    """Determine which backend to use for web_search specifically.
+
+    Selection priority:
+    1. ``web.search_backend`` (per-capability override)
+    2. ``web.backend`` (shared fallback — existing behavior)
+    3. Auto-detect from env vars
+
+    This enables using different providers for search vs extract
+    (e.g. SearXNG for search + Firecrawl for extract).
+    """
+    return _get_capability_backend("search")
+
+
+def _get_extract_backend() -> str:
+    """Determine which backend to use for web_extract specifically.
+
+    Selection priority:
+    1. ``web.extract_backend`` (per-capability override)
+    2. ``web.backend`` (shared fallback — existing behavior)
+    3. Auto-detect from env vars
+    """
+    return _get_capability_backend("extract")
+
+
+def _get_capability_backend(capability: str) -> str:
+    """Shared helper for per-capability backend selection.
+
+    Reads ``web.{capability}_backend`` from config; if set and available,
+    uses it. Otherwise falls through to the shared ``_get_backend()``.
+    """
+    cfg = _load_web_config()
+    specific = (cfg.get(f"{capability}_backend") or "").lower().strip()
+    if specific and _is_backend_available(specific):
+        return specific
+    return _get_backend()
+
+
 def _is_backend_available(backend: str) -> bool:
     """Return True when the selected backend is currently usable."""
     if backend == "exa":
@@ -1129,8 +1167,8 @@ def web_search_tool(query: str, limit: int = 5) -> str:
     if is_interrupted():
         return tool_error("Interrupted", success=False)
 
-    # Dispatch to the configured backend
-    backend = _get_backend()
+    # Dispatch to the configured search backend
+    backend = _get_search_backend()
     if backend == "parallel":
         response_data = _parallel_search(query, limit)
         debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
@@ -1286,7 +1324,7 @@ async def web_extract_tool(
     if not safe_urls:
         results = []
     else:
-        backend = _get_backend()
+        backend = _get_extract_backend()
 
         if backend == "parallel":
             results = await _parallel_extract(safe_urls)
```