From 657e6d87cc65e14680282b6e2fdc1a9bcf702493 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Thu, 14 May 2026 02:02:01 +0530 Subject: [PATCH] fix(web): align _LEGACY_PREFERENCE with legacy 7-provider order + doc cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review of the plugin migration surfaced one warning and a handful of doc/dead-code cleanups. None affect production behaviour through the main dispatcher (which always calls `tools.web_tools._get_backend()` first and preserves the full 7-provider walk), but direct callers of `agent.web_search_registry.get_active_*_provider()` previously diverged from the legacy order and could return `None` for users with credentials but no explicit `web.backend` config key. Changes ------- 1. `_LEGACY_PREFERENCE` was shipped as a 4-tuple `("brave-free", "firecrawl", "searxng", "ddgs")` while the PR description and the legacy `_get_backend()` candidate order both call for the 7-tuple `(firecrawl, parallel, tavily, exa, searxng, brave-free, ddgs)`. Replaced with the 7-tuple. Verified empirically: with TAVILY+EXA keys and no config, `get_active_search_provider()` now returns tavily (was None); with EXA+PARALLEL it returns parallel (was None); with BRAVE+FIRECRAWL it returns firecrawl (was brave-free). 2. `agent/web_search_registry.py` — module docstring, `_resolve` step-3 docstring, and inline comment all listed the old 4-tuple and claimed "brave-free first because it was the shipped default". The legacy default is `"firecrawl"`. Rewritten to match the new ordering and reference `tools.web_tools._get_backend()` as the source of truth. 3. `agent/web_search_registry.py` — `get_active_crawl_provider` docstring said "only Tavily implements it among built-in providers". Firecrawl also advertises `supports_crawl=True` after the previous commit. Updated to "Tavily and Firecrawl". 4. `plugins/web/tavily/provider.py` — module docstring said "Tavily is the only built-in backend that natively crawls". Updated. 5. `agent/web_search_provider.py` — ABC docstring mentioned only `search` / `extract` capabilities. Added `crawl` for accuracy. 6. `plugins/web/{firecrawl,parallel,exa}/provider.py` — dead plugin-level cache globals (`_firecrawl_client`, `_parallel_client`, `_async_parallel_client`, `_exa_client`) were declared but never read (all reads/writes go through `_wt.*` per the `extracting-inline- helpers-to-plugins` recipe). Removed the dead declarations; the reset-for-tests helpers in firecrawl + parallel now clear the canonical `_wt._` slots, matching the pattern exa already used. Tests ----- 218/218 web-targeted tests still pass (no test changes needed). 4910/4910 in `tests/tools/` still green. --- agent/web_search_provider.py | 11 +++--- agent/web_search_registry.py | 62 +++++++++++++++++++++---------- plugins/web/exa/provider.py | 7 ++-- plugins/web/firecrawl/provider.py | 21 +++++++---- plugins/web/parallel/provider.py | 25 +++++++------ plugins/web/tavily/provider.py | 4 +- 6 files changed, 82 insertions(+), 48 deletions(-) diff --git a/agent/web_search_provider.py b/agent/web_search_provider.py index ed3f79f270e..7223bbf2cfe 100644 --- a/agent/web_search_provider.py +++ b/agent/web_search_provider.py @@ -61,13 +61,14 @@ from typing import Any, Dict, List class WebSearchProvider(abc.ABC): - """Abstract base class for a web search/extract backend. + """Abstract base class for a web search/extract/crawl backend. Subclasses must implement :meth:`is_available` and at least one of - :meth:`search` / :meth:`extract`. The :meth:`supports_search` and - :meth:`supports_extract` capability flags let the registry route each - tool call to the right provider, and let multi-capability providers - (SearXNG, Firecrawl, Tavily, …) advertise both. + :meth:`search` / :meth:`extract` / :meth:`crawl`. The + :meth:`supports_search` / :meth:`supports_extract` / :meth:`supports_crawl` + capability flags let the registry route each tool call to the right + provider, and let multi-capability providers (Firecrawl, Tavily, Exa, + …) advertise multiple capabilities from a single class. """ @property diff --git a/agent/web_search_registry.py b/agent/web_search_registry.py index 8425c129910..c61c16cadb2 100644 --- a/agent/web_search_registry.py +++ b/agent/web_search_registry.py @@ -11,17 +11,23 @@ Active selection ---------------- The active provider is chosen by configuration with this precedence: -1. ``web.search_backend`` (for search) or ``web.extract_backend`` (for extract) -2. ``web.backend`` (shared fallback) -3. If exactly one capability-eligible provider is registered, use it. -4. Legacy preference order (``brave-free`` → ``firecrawl`` → ``searxng`` → ``ddgs``) - so installs that omitted the config key keep working. +1. ``web.search_backend`` / ``web.extract_backend`` / ``web.crawl_backend`` + (per-capability override). +2. ``web.backend`` (shared fallback). +3. If exactly one capability-eligible provider is registered AND available, + use it. +4. Legacy preference order — ``firecrawl`` → ``parallel`` → ``tavily`` → + ``exa`` → ``searxng`` → ``brave-free`` → ``ddgs`` — filtered by + availability. Matches the historic ``tools.web_tools._get_backend()`` + candidate order so installs that never set a config key keep landing + on the same provider they did before the plugin migration. 5. Otherwise ``None`` — the tool surfaces a helpful error pointing at ``hermes tools``. -The capability filter (``supports_search`` vs ``supports_extract``) is applied -at every step so a search-only provider (``brave-free``) configured as -``web.extract_backend`` correctly falls through. +The capability filter (``supports_search`` / ``supports_extract`` / +``supports_crawl``) is applied at every step so a search-only provider +(``brave-free``) configured as ``web.extract_backend`` correctly falls +through to an extract-capable backend. """ from __future__ import annotations @@ -107,10 +113,21 @@ def _read_config_key(*path: str) -> Optional[str]: return None -# Legacy preference order — preserves behaviour for users who set no config -# at all. brave-free first because it was the shipped default after the -# Brave migration; firecrawl second for back-compat with older configs. -_LEGACY_PREFERENCE = ("brave-free", "firecrawl", "searxng", "ddgs") +# Legacy preference order — preserves behaviour for users who set no +# ``web.backend`` / ``web._backend`` config key at all. Matches +# the historic candidate order in :func:`tools.web_tools._get_backend` +# (paid providers first so existing paid setups don't get downgraded to +# a free tier on upgrade). Filtered by ``is_available()`` at walk time so +# we don't surface a provider the user has no credentials for. +_LEGACY_PREFERENCE = ( + "firecrawl", + "parallel", + "tavily", + "exa", + "searxng", + "brave-free", + "ddgs", +) def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearchProvider]: @@ -130,10 +147,14 @@ def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearc supports *capability* AND ``is_available()`` reports True, return it. 3. **Legacy preference walk, filtered by availability.** Walk the - :data:`_LEGACY_PREFERENCE` order looking for a provider whose + :data:`_LEGACY_PREFERENCE` order (firecrawl → parallel → tavily → + exa → searxng → brave-free → ddgs) looking for a provider whose ``supports_()`` is True AND whose ``is_available()`` is - True. This is the path that fires when no config key is set — pick - the highest-priority backend the user actually has credentials for. + True. Matches the historic ``tools.web_tools._get_backend()`` + candidate order so users with credentials but no explicit config + key keep landing on the same provider as pre-migration. This is + the path that fires when no config key is set — pick the + highest-priority backend the user actually has credentials for. Returns None when no provider is configured AND no available provider matches the legacy preference; the dispatcher then returns a "set up a @@ -179,8 +200,8 @@ def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearc # 2. + 3. Fallback path — filter by availability so we don't surface # a provider the user has no credentials for. Without this filter, - # brave-free's slot in the legacy preference order would make it - # the "active" provider on a fresh install with no API keys at all. + # a registered-but-unconfigured provider could end up "active" on + # a fresh install with no API keys at all. eligible = [ p for p in snapshot.values() if _capable(p) and _is_available_safe(p) @@ -226,9 +247,10 @@ def get_active_crawl_provider() -> Optional[WebSearchProvider]: Reads ``web.crawl_backend`` (preferred) or ``web.backend`` (shared fallback) from config.yaml; falls back per the module docstring. - Crawl is a niche capability — only Tavily implements it among built-in - providers. Most callers should expect ``None`` and fall back to a - different strategy (e.g. summarize-via-LLM). + Crawl is a niche capability — among built-in providers only Tavily and + Firecrawl implement it. Callers should expect ``None`` and fall back to + a different strategy (e.g. summarize-via-LLM) when neither is + configured. """ explicit = _read_config_key("web", "crawl_backend") or _read_config_key("web", "backend") return _resolve(explicit, capability="crawl") diff --git a/plugins/web/exa/provider.py b/plugins/web/exa/provider.py index d5735967758..0fea6fb5a8b 100644 --- a/plugins/web/exa/provider.py +++ b/plugins/web/exa/provider.py @@ -32,9 +32,10 @@ from agent.web_search_provider import WebSearchProvider logger = logging.getLogger(__name__) -# Module-level cache for the Exa client so we don't reconstruct it per -# call. Matches the legacy `_exa_client` pattern in tools/web_tools.py. -_exa_client: Any = None +# Module-level note: the canonical ``_exa_client`` cache slot lives on +# :mod:`tools.web_tools` so tests that do ``tools.web_tools._exa_client = +# None`` between cases see fresh state. The plugin reads/writes through +# that public module (see :func:`_get_exa_client`). def _get_exa_client() -> Any: diff --git a/plugins/web/firecrawl/provider.py b/plugins/web/firecrawl/provider.py index ec193781096..e7d4d378bdc 100644 --- a/plugins/web/firecrawl/provider.py +++ b/plugins/web/firecrawl/provider.py @@ -112,9 +112,11 @@ Firecrawl = _FirecrawlProxy() # --------------------------------------------------------------------------- # Client construction (direct vs managed-gateway) # --------------------------------------------------------------------------- - -_firecrawl_client: Any = None -_firecrawl_client_config: Any = None +# +# The canonical cache slots live on :mod:`tools.web_tools` so tests that do +# ``tools.web_tools._firecrawl_client = None`` between cases see fresh +# state. The plugin reads/writes through that public module — see +# :func:`_get_firecrawl_client` below. def _get_direct_firecrawl_config() -> Optional[tuple]: @@ -257,10 +259,15 @@ def _get_firecrawl_client() -> Any: def _reset_client_for_tests() -> None: - """Drop the cached Firecrawl client so tests can re-instantiate cleanly.""" - global _firecrawl_client, _firecrawl_client_config - _firecrawl_client = None - _firecrawl_client_config = None + """Drop the cached Firecrawl client so tests can re-instantiate cleanly. + + Clears the canonical slots on :mod:`tools.web_tools` (where + :func:`_get_firecrawl_client` reads/writes them). + """ + import tools.web_tools as _wt + + _wt._firecrawl_client = None + _wt._firecrawl_client_config = None # --------------------------------------------------------------------------- diff --git a/plugins/web/parallel/provider.py b/plugins/web/parallel/provider.py index 71aae39025a..38578e6b52c 100644 --- a/plugins/web/parallel/provider.py +++ b/plugins/web/parallel/provider.py @@ -36,13 +36,11 @@ from agent.web_search_provider import WebSearchProvider logger = logging.getLogger(__name__) -# Module-level client caches mirroring the legacy `tools.web_tools._parallel_client` -# / `_async_parallel_client` pattern. For tests, the canonical cache lives on -# tools.web_tools so existing setup_method() handlers that reset -# ``tools.web_tools._parallel_client = None`` keep working — we read/write -# the cache via that module rather than these module-level globals. -_parallel_client: Any = None -_async_parallel_client: Any = None +# Module-level note: the canonical cache slots ``_parallel_client`` and +# ``_async_parallel_client`` live on :mod:`tools.web_tools` so tests that do +# ``tools.web_tools._parallel_client = None`` between cases see fresh state. +# The plugin reads/writes through that public module (see +# :func:`_get_sync_client` / :func:`_get_async_client`). def _ensure_parallel_sdk_installed() -> None: @@ -117,10 +115,15 @@ def _get_async_client() -> Any: def _reset_clients_for_tests() -> None: - """Drop both cached clients so tests can re-instantiate cleanly.""" - global _parallel_client, _async_parallel_client - _parallel_client = None - _async_parallel_client = None + """Drop both cached clients so tests can re-instantiate cleanly. + + Clears the canonical slots on :mod:`tools.web_tools` (where + :func:`_get_sync_client` / :func:`_get_async_client` read/write them). + """ + import tools.web_tools as _wt + + _wt._parallel_client = None + _wt._async_parallel_client = None # Backward-compatible aliases for the names that lived in tools.web_tools diff --git a/plugins/web/tavily/provider.py b/plugins/web/tavily/provider.py index fc3406d2ce9..50e15973fb3 100644 --- a/plugins/web/tavily/provider.py +++ b/plugins/web/tavily/provider.py @@ -5,8 +5,8 @@ capabilities advertised: - ``supports_search()`` -> True (Tavily ``/search``) - ``supports_extract()`` -> True (Tavily ``/extract``) -- ``supports_crawl()`` -> True (Tavily ``/crawl``) — Tavily is the only - built-in backend that natively crawls +- ``supports_crawl()`` -> True (Tavily ``/crawl``) — sync HTTP crawl; + Firecrawl also advertises ``supports_crawl=True`` (async) All three are sync — the underlying call is ``httpx.post(...)``. The dispatcher in :func:`tools.web_tools.web_crawl_tool` (which is itself