From e3f0a8889195d3936762b375c659bdbcc394236c Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Thu, 14 May 2026 00:08:03 +0530 Subject: [PATCH] feat(web): extend ABC with supports_crawl and async-extract semantics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two ABC additions to cover the surface area of the remaining four providers (exa, parallel, tavily, firecrawl) which were untouched by the initial spike: 1. supports_crawl() + crawl() — Tavily natively crawls a seed URL via its /crawl endpoint. Exposing supports_crawl=True lets the crawl tool's dispatcher route to Tavily when configured, falling back to the auxiliary-model summarization path otherwise. Firecrawl could add this in a follow-up (the SDK supports it; we just don't surface it as a tool today). 2. Async-or-sync extract() — Parallel's SDK is natively async (AsyncParallel.beta.extract); Exa and Tavily are sync; Firecrawl is sync but called inside asyncio.to_thread() with a 60s timeout. The ABC docstring now permits either shape: implementations declare their own sync/async signature and the dispatcher uses inspect.iscoroutinefunction to detect and await. Also adds get_active_crawl_provider() to web_search_registry mirroring the search/extract resolvers, with web.crawl_backend as the explicit override config key. No behavior change on its own — these are scaffolds for the four remaining provider migrations. --- agent/web_search_provider.py | 70 ++++++++++++++++++++++++++++++++++-- agent/web_search_registry.py | 18 +++++++++- 2 files changed, 84 insertions(+), 4 deletions(-) diff --git a/agent/web_search_provider.py b/agent/web_search_provider.py index 605af9d5cf5..0e8b31547fa 100644 --- a/agent/web_search_provider.py +++ b/agent/web_search_provider.py @@ -99,7 +99,32 @@ class WebSearchProvider(abc.ABC): return True def supports_extract(self) -> bool: - """Return True if this provider implements :meth:`extract`.""" + """Return True if this provider implements :meth:`extract`. + + Both sync and async :meth:`extract` implementations are valid — the + dispatcher detects coroutine functions via + :func:`inspect.iscoroutinefunction` and awaits as needed. Sync + implementations that perform blocking I/O (HTTP, SDK calls) should + ideally wrap in :func:`asyncio.to_thread` at the call site; small + providers can keep their sync shape and let the dispatcher handle + threading. + """ + return False + + def supports_crawl(self) -> bool: + """Return True if this provider implements :meth:`crawl`. + + Crawl differs from extract in that the agent provides a *seed URL* + and the provider walks linked pages on its own — useful for + documentation sites where the agent doesn't know all relevant + URLs upfront. Tavily is the only built-in backend that natively + crawls today; Firecrawl provides a similar capability that we + don't currently surface as a tool. + + Providers that don't crawl should leave this as False; the + dispatcher in :func:`tools.web_tools.web_crawl_tool` will fall + back to its auxiliary-model summarization path. + """ return False def search(self, query: str, limit: int = 5) -> Dict[str, Any]: @@ -113,20 +138,59 @@ class WebSearchProvider(abc.ABC): f"{self.name} does not support search (override supports_search)" ) - def extract(self, urls: List[str], **kwargs: Any) -> Dict[str, Any]: + def extract(self, urls: List[str], **kwargs: Any) -> Any: """Extract content from one or more URLs. Override when :meth:`supports_extract` returns True. The default raises NotImplementedError; callers should gate on :meth:`supports_extract` before calling. - ``kwargs`` may carry forward-compat fields (e.g. ``include_raw``, + Return shape: a list of result dicts matching what the legacy + :func:`tools.web_tools.web_extract_tool` post-processing pipeline + expects:: + + [ + { + "url": str, + "title": str, + "content": str, + "raw_content": str, + "metadata": dict, # optional + "error": str, # optional, only on per-URL failure + }, + ... + ] + + Implementations MAY be ``async def`` — the dispatcher detects + coroutines via :func:`inspect.iscoroutinefunction` and awaits. + + ``kwargs`` may carry forward-compat fields (``format``, ``include_raw``, ``max_chars``) — implementations should ignore unknown keys. """ raise NotImplementedError( f"{self.name} does not support extract (override supports_extract)" ) + def crawl(self, url: str, **kwargs: Any) -> Any: + """Crawl a seed URL and return results. + + Override when :meth:`supports_crawl` returns True. The default + raises NotImplementedError; callers should gate on + :meth:`supports_crawl` before calling. + + Return shape: ``{"results": [{"url": str, "title": str, + "content": str, ...}, ...]}`` matching what + :func:`tools.web_tools.web_crawl_tool` post-processing expects. + + Implementations MAY be ``async def``. + + ``kwargs`` may carry forward-compat fields (e.g. ``max_depth``, + ``include_domains``) — implementations should ignore unknown keys. + """ + raise NotImplementedError( + f"{self.name} does not support crawl (override supports_crawl)" + ) + def get_setup_schema(self) -> Dict[str, Any]: """Return provider metadata for the ``hermes tools`` picker. diff --git a/agent/web_search_registry.py b/agent/web_search_registry.py index 45f2a0f8883..8425c129910 100644 --- a/agent/web_search_registry.py +++ b/agent/web_search_registry.py @@ -114,7 +114,7 @@ _LEGACY_PREFERENCE = ("brave-free", "firecrawl", "searxng", "ddgs") def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearchProvider]: - """Resolve the active provider for a capability ("search" | "extract"). + """Resolve the active provider for a capability ("search" | "extract" | "crawl"). Resolution rules (in order): @@ -147,6 +147,8 @@ def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearc return bool(p.supports_search()) if capability == "extract": return bool(p.supports_extract()) + if capability == "crawl": + return bool(p.supports_crawl()) return False def _is_available_safe(p: WebSearchProvider) -> bool: @@ -218,6 +220,20 @@ def get_active_extract_provider() -> Optional[WebSearchProvider]: return _resolve(explicit, capability="extract") +def get_active_crawl_provider() -> Optional[WebSearchProvider]: + """Resolve the currently-active web crawl provider. + + Reads ``web.crawl_backend`` (preferred) or ``web.backend`` (shared + fallback) from config.yaml; falls back per the module docstring. + + Crawl is a niche capability — only Tavily implements it among built-in + providers. Most callers should expect ``None`` and fall back to a + different strategy (e.g. summarize-via-LLM). + """ + explicit = _read_config_key("web", "crawl_backend") or _read_config_key("web", "backend") + return _resolve(explicit, capability="crawl") + + def _reset_for_tests() -> None: """Clear the registry. **Test-only.**""" with _lock: