mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Self-review of the plugin migration surfaced one warning and a handful of
doc/dead-code cleanups. None affect production behaviour through the main
dispatcher (which always calls `tools.web_tools._get_backend()` first and
preserves the full 7-provider walk), but direct callers of
`agent.web_search_registry.get_active_*_provider()` previously diverged
from the legacy order and could return `None` for users with credentials
but no explicit `web.backend` config key.
Changes
-------
1. `_LEGACY_PREFERENCE` was shipped as a 4-tuple
`("brave-free", "firecrawl", "searxng", "ddgs")` while the PR
description and the legacy `_get_backend()` candidate order both
call for the 7-tuple
`(firecrawl, parallel, tavily, exa, searxng, brave-free, ddgs)`.
Replaced with the 7-tuple. Verified empirically: with TAVILY+EXA keys
and no config, `get_active_search_provider()` now returns tavily
(was None); with EXA+PARALLEL it returns parallel (was None); with
BRAVE+FIRECRAWL it returns firecrawl (was brave-free).
2. `agent/web_search_registry.py` — module docstring, `_resolve` step-3
docstring, and inline comment all listed the old 4-tuple and claimed
"brave-free first because it was the shipped default". The legacy
default is `"firecrawl"`. Rewritten to match the new ordering and
reference `tools.web_tools._get_backend()` as the source of truth.
3. `agent/web_search_registry.py` — `get_active_crawl_provider`
docstring said "only Tavily implements it among built-in providers".
Firecrawl also advertises `supports_crawl=True` after the previous
commit. Updated to "Tavily and Firecrawl".
4. `plugins/web/tavily/provider.py` — module docstring said "Tavily is
the only built-in backend that natively crawls". Updated.
5. `agent/web_search_provider.py` — ABC docstring mentioned only
`search` / `extract` capabilities. Added `crawl` for accuracy.
6. `plugins/web/{firecrawl,parallel,exa}/provider.py` — dead plugin-level
cache globals (`_firecrawl_client`, `_parallel_client`,
`_async_parallel_client`, `_exa_client`) were declared but never read
(all reads/writes go through `_wt.*` per the `extracting-inline-
helpers-to-plugins` recipe). Removed the dead declarations; the
reset-for-tests helpers in firecrawl + parallel now clear the
canonical `_wt._<name>` slots, matching the pattern exa already used.
Tests
-----
218/218 web-targeted tests still pass (no test changes needed). 4910/4910
in `tests/tools/` still green.
262 lines
9.7 KiB
Python
262 lines
9.7 KiB
Python
"""
|
|
Web Search Provider Registry
|
|
============================
|
|
|
|
Central map of registered web providers. Populated by plugins at import-time
|
|
via :meth:`PluginContext.register_web_search_provider`; consumed by the
|
|
``web_search`` and ``web_extract`` tool wrappers in :mod:`tools.web_tools` to
|
|
dispatch each call to the active backend.
|
|
|
|
Active selection
|
|
----------------
|
|
The active provider is chosen by configuration with this precedence:
|
|
|
|
1. ``web.search_backend`` / ``web.extract_backend`` / ``web.crawl_backend``
|
|
(per-capability override).
|
|
2. ``web.backend`` (shared fallback).
|
|
3. If exactly one capability-eligible provider is registered AND available,
|
|
use it.
|
|
4. Legacy preference order — ``firecrawl`` → ``parallel`` → ``tavily`` →
|
|
``exa`` → ``searxng`` → ``brave-free`` → ``ddgs`` — filtered by
|
|
availability. Matches the historic ``tools.web_tools._get_backend()``
|
|
candidate order so installs that never set a config key keep landing
|
|
on the same provider they did before the plugin migration.
|
|
5. Otherwise ``None`` — the tool surfaces a helpful error pointing at
|
|
``hermes tools``.
|
|
|
|
The capability filter (``supports_search`` / ``supports_extract`` /
|
|
``supports_crawl``) is applied at every step so a search-only provider
|
|
(``brave-free``) configured as ``web.extract_backend`` correctly falls
|
|
through to an extract-capable backend.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import threading
|
|
from typing import Dict, List, Optional
|
|
|
|
from agent.web_search_provider import WebSearchProvider
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Every read and write of ``_providers`` in this module happens while
# holding this lock.
_lock = threading.Lock()

# Registry storage: provider name -> registered provider instance.
_providers: Dict[str, WebSearchProvider] = {}
|
|
|
|
|
|
def register_provider(provider: WebSearchProvider) -> None:
    """Register a web search/extract/crawl provider.

    The provider is stored under ``provider.name`` with surrounding
    whitespace removed. :func:`get_provider` trims the name it is asked for,
    and config values are trimmed when read, so keying the registry on the
    trimmed name keeps a provider with a padded ``name`` reachable.

    Re-registration (same name) overwrites the previous entry and logs a
    debug message — makes hot-reload scenarios (tests, dev loops) behave
    predictably.

    Raises:
        TypeError: *provider* is not a :class:`WebSearchProvider` instance.
        ValueError: ``provider.name`` is not a string, or is empty /
            whitespace-only.
    """
    if not isinstance(provider, WebSearchProvider):
        raise TypeError(
            f"register_provider() expects a WebSearchProvider instance, "
            f"got {type(provider).__name__}"
        )

    name = provider.name
    if not isinstance(name, str) or not name.strip():
        raise ValueError("Web provider .name must be a non-empty string")

    # Normalise the key the same way lookups do (see get_provider).
    key = name.strip()

    with _lock:
        existing = _providers.get(key)
        _providers[key] = provider

    # Logging happens outside the lock; it only needs the captured values.
    if existing is not None:
        logger.debug(
            "Web provider '%s' re-registered (was %r)",
            key, type(existing).__name__,
        )
    else:
        logger.debug(
            "Registered web provider '%s' (%s)",
            key, type(provider).__name__,
        )
|
|
|
|
|
|
def list_providers() -> List[WebSearchProvider]:
    """Return a new list of every registered provider, ordered by ``name``.

    The registry is copied while holding the lock; sorting happens on the
    copy afterwards.
    """
    with _lock:
        registered = [*_providers.values()]
    registered.sort(key=lambda entry: entry.name)
    return registered
|
|
|
|
|
|
def get_provider(name: str) -> Optional[WebSearchProvider]:
    """Look up a registered provider by name.

    Surrounding whitespace in *name* is ignored. Returns ``None`` when
    *name* is not a string or no provider is registered under it.
    """
    if isinstance(name, str):
        key = name.strip()
        with _lock:
            return _providers.get(key)
    return None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Active-provider resolution
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _read_config_key(*path: str) -> Optional[str]:
    """Look up a nested string setting in the loaded configuration.

    *path* is the sequence of mapping keys to descend through, e.g.
    ``("web", "backend")``. The value is returned with surrounding
    whitespace removed. ``None`` is returned when any level on the way is
    not a dict, when the final value is not a non-blank string, or when
    loading the configuration raises (the error is logged at debug level).
    """
    try:
        # Imported lazily, inside the try, so an import failure is handled
        # like any other config-read failure.
        from hermes_cli.config import load_config

        node = load_config()
        for segment in path:
            if not isinstance(node, dict):
                return None
            node = node.get(segment)

        if isinstance(node, str):
            value = node.strip()
            if value:
                return value
    except Exception as exc:
        logger.debug("Could not read config %s: %s", ".".join(path), exc)
    return None
|
|
|
|
|
|
# Fallback ordering used when the user has set neither ``web.backend`` nor
# any ``web.<capability>_backend`` key. It mirrors the historic candidate
# order of :func:`tools.web_tools._get_backend` — paid providers come first
# so an existing paid setup is not downgraded to a free tier on upgrade.
# Entries are filtered by ``is_available()`` when the order is walked, so a
# provider the user has no credentials for is never surfaced.
_LEGACY_PREFERENCE = (
    "firecrawl", "parallel", "tavily", "exa",
    "searxng", "brave-free", "ddgs",
)
|
|
|
|
|
|
def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearchProvider]:
    """Resolve the active provider for a capability ("search" | "extract" | "crawl").

    Resolution rules (in order):

    1. **Explicit config wins, ignoring availability.** If *configured*
       names a registered provider that supports *capability*, return it
       even if its :meth:`is_available` returns False — the dispatcher will
       surface a precise "X_API_KEY is not set" error to the user instead
       of silently routing somewhere else. Matches legacy
       :func:`tools.web_tools._get_backend` behavior for configured names.
       A configured name that is unregistered, or that does not support
       *capability*, is logged at debug level and resolution falls through.

    2. **Single-provider shortcut.** When only one registered provider
       supports *capability* AND ``is_available()`` reports True, return it.

    3. **Legacy preference walk, filtered by availability.** Walk the
       :data:`_LEGACY_PREFERENCE` order (firecrawl → parallel → tavily →
       exa → searxng → brave-free → ddgs) and return the first provider
       that supports *capability* and is available. Matches the historic
       ``tools.web_tools._get_backend()`` candidate order so users with
       credentials but no explicit config key keep landing on the same
       provider as pre-migration.

    Args:
        configured: Provider name taken from config, or ``None`` / empty
            when no key is set.
        capability: One of ``"search"``, ``"extract"`` or ``"crawl"``. Any
            other value matches no provider.

    Returns:
        The resolved provider, or ``None`` when no provider is configured
        AND no available provider matches the legacy preference; the
        dispatcher then returns a "set up a provider" error to the user.
    """
    # Work on a private copy so provider callbacks run without the lock held.
    with _lock:
        snapshot = dict(_providers)

    # Name of the ``supports_*`` method for this capability (None if unknown).
    probe_name = {
        "search": "supports_search",
        "extract": "supports_extract",
        "crawl": "supports_crawl",
    }.get(capability)

    def _capable(p: WebSearchProvider) -> bool:
        """True when *p* reports support for the requested capability."""
        if probe_name is None:
            return False
        return bool(getattr(p, probe_name)())

    def _is_available_safe(p: WebSearchProvider) -> bool:
        """Wrap ``is_available()`` so a buggy provider doesn't kill resolution."""
        try:
            return bool(p.is_available())
        except Exception as exc:  # noqa: BLE001
            logger.debug("provider %s.is_available() raised %s", p.name, exc)
            return False

    # 1. Explicit config wins — return regardless of is_available() so the
    #    user gets a precise downstream error message rather than a silent
    #    backend switch. Matches _get_backend() in web_tools.py.
    if configured:
        provider = snapshot.get(configured)
        if provider is not None and _capable(provider):
            return provider
        if provider is None:
            logger.debug(
                "web backend '%s' configured but not registered; falling back",
                configured,
            )
        else:
            logger.debug(
                "web backend '%s' configured but does not support '%s'; falling back",
                configured, capability,
            )

    # 2. + 3. Fallback path — filter by availability so we don't surface
    #    a provider the user has no credentials for. Without this filter,
    #    a registered-but-unconfigured provider could end up "active" on
    #    a fresh install with no API keys at all.
    #
    #    The eligible set is computed once, keyed by registry name, so each
    #    provider's supports_*() / is_available() is probed a single time
    #    and the legacy walk below is a plain dictionary lookup.
    eligible = {
        key: p
        for key, p in snapshot.items()
        if _capable(p) and _is_available_safe(p)
    }
    if len(eligible) == 1:
        return next(iter(eligible.values()))

    for legacy in _LEGACY_PREFERENCE:
        provider = eligible.get(legacy)
        if provider is not None:
            return provider

    return None
|
|
|
|
|
|
def get_active_search_provider() -> Optional[WebSearchProvider]:
    """Return the provider that should serve web *search* calls, or ``None``.

    The configured name is taken from ``web.search_backend``; if that key
    yields nothing, ``web.backend`` is consulted instead. The name (possibly
    ``None``) is then resolved with ``capability="search"``; the fallback
    rules are described in the module docstring.
    """
    configured = _read_config_key("web", "search_backend")
    if not configured:
        configured = _read_config_key("web", "backend")
    return _resolve(configured, capability="search")
|
|
|
|
|
|
def get_active_extract_provider() -> Optional[WebSearchProvider]:
    """Return the provider that should serve web *extract* calls, or ``None``.

    The configured name is taken from ``web.extract_backend``; if that key
    yields nothing, ``web.backend`` is consulted instead. The name (possibly
    ``None``) is then resolved with ``capability="extract"``; the fallback
    rules are described in the module docstring.
    """
    configured = _read_config_key("web", "extract_backend")
    if not configured:
        configured = _read_config_key("web", "backend")
    return _resolve(configured, capability="extract")
|
|
|
|
|
|
def get_active_crawl_provider() -> Optional[WebSearchProvider]:
    """Return the provider that should serve web *crawl* calls, or ``None``.

    The configured name is taken from ``web.crawl_backend``; if that key
    yields nothing, ``web.backend`` is consulted instead. The name (possibly
    ``None``) is then resolved with ``capability="crawl"``; the fallback
    rules are described in the module docstring.

    Crawl is a niche capability — among built-in providers only Tavily and
    Firecrawl implement it. Callers should expect ``None`` and fall back to
    a different strategy (e.g. summarize-via-LLM) when neither is
    configured.
    """
    configured = _read_config_key("web", "crawl_backend")
    if not configured:
        configured = _read_config_key("web", "backend")
    return _resolve(configured, capability="crawl")
|
|
|
|
|
|
def _reset_for_tests() -> None:
    """Empty the provider registry in place. **Test-only.**

    The registry dict is cleared rather than rebound, and the clear happens
    while holding the registry lock.
    """
    with _lock:
        _providers.clear()
|