mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-25 05:52:34 +00:00
feat(web): extend ABC with supports_crawl and async-extract semantics
Two ABC additions to cover the surface area of the remaining four providers (exa, parallel, tavily, firecrawl) which were untouched by the initial spike:

1. supports_crawl() + crawl() — Tavily natively crawls a seed URL via its /crawl endpoint. Exposing supports_crawl=True lets the crawl tool's dispatcher route to Tavily when configured, falling back to the auxiliary-model summarization path otherwise. Firecrawl could add this in a follow-up (the SDK supports it; we just don't surface it as a tool today).

2. Async-or-sync extract() — Parallel's SDK is natively async (AsyncParallel.beta.extract); Exa and Tavily are sync; Firecrawl is sync but called inside asyncio.to_thread() with a 60s timeout. The ABC docstring now permits either shape: implementations declare their own sync/async signature and the dispatcher uses inspect.iscoroutinefunction to detect and await.

Also adds get_active_crawl_provider() to web_search_registry mirroring the search/extract resolvers, with web.crawl_backend as the explicit override config key.

No behavior change on its own — these are scaffolds for the four remaining provider migrations.
This commit is contained in:
parent
0a7cbd3342
commit
e3f0a88891
2 changed files with 84 additions and 4 deletions
|
|
@ -99,7 +99,32 @@ class WebSearchProvider(abc.ABC):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def supports_extract(self) -> bool:
|
def supports_extract(self) -> bool:
|
||||||
"""Return True if this provider implements :meth:`extract`."""
|
"""Return True if this provider implements :meth:`extract`.
|
||||||
|
|
||||||
|
Both sync and async :meth:`extract` implementations are valid — the
|
||||||
|
dispatcher detects coroutine functions via
|
||||||
|
:func:`inspect.iscoroutinefunction` and awaits as needed. Sync
|
||||||
|
implementations that perform blocking I/O (HTTP, SDK calls) should
|
||||||
|
ideally wrap in :func:`asyncio.to_thread` at the call site; small
|
||||||
|
providers can keep their sync shape and let the dispatcher handle
|
||||||
|
threading.
|
||||||
|
"""
|
||||||
|
return False
|
||||||
|
|
||||||
|
def supports_crawl(self) -> bool:
|
||||||
|
"""Return True if this provider implements :meth:`crawl`.
|
||||||
|
|
||||||
|
Crawl differs from extract in that the agent provides a *seed URL*
|
||||||
|
and the provider walks linked pages on its own — useful for
|
||||||
|
documentation sites where the agent doesn't know all relevant
|
||||||
|
URLs upfront. Tavily is the only built-in backend that natively
|
||||||
|
crawls today; Firecrawl provides a similar capability that we
|
||||||
|
don't currently surface as a tool.
|
||||||
|
|
||||||
|
Providers that don't crawl should leave this as False; the
|
||||||
|
dispatcher in :func:`tools.web_tools.web_crawl_tool` will fall
|
||||||
|
back to its auxiliary-model summarization path.
|
||||||
|
"""
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
|
||||||
|
|
@ -113,20 +138,59 @@ class WebSearchProvider(abc.ABC):
|
||||||
f"{self.name} does not support search (override supports_search)"
|
f"{self.name} does not support search (override supports_search)"
|
||||||
)
|
)
|
||||||
|
|
||||||
def extract(self, urls: List[str], **kwargs: Any) -> Dict[str, Any]:
|
def extract(self, urls: List[str], **kwargs: Any) -> Any:
|
||||||
"""Extract content from one or more URLs.
|
"""Extract content from one or more URLs.
|
||||||
|
|
||||||
Override when :meth:`supports_extract` returns True. The default
|
Override when :meth:`supports_extract` returns True. The default
|
||||||
raises NotImplementedError; callers should gate on
|
raises NotImplementedError; callers should gate on
|
||||||
:meth:`supports_extract` before calling.
|
:meth:`supports_extract` before calling.
|
||||||
|
|
||||||
``kwargs`` may carry forward-compat fields (e.g. ``include_raw``,
|
Return shape: a list of result dicts matching what the legacy
|
||||||
|
:func:`tools.web_tools.web_extract_tool` post-processing pipeline
|
||||||
|
expects::
|
||||||
|
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"url": str,
|
||||||
|
"title": str,
|
||||||
|
"content": str,
|
||||||
|
"raw_content": str,
|
||||||
|
"metadata": dict, # optional
|
||||||
|
"error": str, # optional, only on per-URL failure
|
||||||
|
},
|
||||||
|
...
|
||||||
|
]
|
||||||
|
|
||||||
|
Implementations MAY be ``async def`` — the dispatcher detects
|
||||||
|
coroutines via :func:`inspect.iscoroutinefunction` and awaits.
|
||||||
|
|
||||||
|
``kwargs`` may carry forward-compat fields (``format``, ``include_raw``,
|
||||||
``max_chars``) — implementations should ignore unknown keys.
|
``max_chars``) — implementations should ignore unknown keys.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
f"{self.name} does not support extract (override supports_extract)"
|
f"{self.name} does not support extract (override supports_extract)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def crawl(self, url: str, **kwargs: Any) -> Any:
|
||||||
|
"""Crawl a seed URL and return results.
|
||||||
|
|
||||||
|
Override when :meth:`supports_crawl` returns True. The default
|
||||||
|
raises NotImplementedError; callers should gate on
|
||||||
|
:meth:`supports_crawl` before calling.
|
||||||
|
|
||||||
|
Return shape: ``{"results": [{"url": str, "title": str,
|
||||||
|
"content": str, ...}, ...]}`` matching what
|
||||||
|
:func:`tools.web_tools.web_crawl_tool` post-processing expects.
|
||||||
|
|
||||||
|
Implementations MAY be ``async def``.
|
||||||
|
|
||||||
|
``kwargs`` may carry forward-compat fields (e.g. ``max_depth``,
|
||||||
|
``include_domains``) — implementations should ignore unknown keys.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"{self.name} does not support crawl (override supports_crawl)"
|
||||||
|
)
|
||||||
|
|
||||||
def get_setup_schema(self) -> Dict[str, Any]:
|
def get_setup_schema(self) -> Dict[str, Any]:
|
||||||
"""Return provider metadata for the ``hermes tools`` picker.
|
"""Return provider metadata for the ``hermes tools`` picker.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -114,7 +114,7 @@ _LEGACY_PREFERENCE = ("brave-free", "firecrawl", "searxng", "ddgs")
|
||||||
|
|
||||||
|
|
||||||
def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearchProvider]:
|
def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearchProvider]:
|
||||||
"""Resolve the active provider for a capability ("search" | "extract").
|
"""Resolve the active provider for a capability ("search" | "extract" | "crawl").
|
||||||
|
|
||||||
Resolution rules (in order):
|
Resolution rules (in order):
|
||||||
|
|
||||||
|
|
@ -147,6 +147,8 @@ def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearc
|
||||||
return bool(p.supports_search())
|
return bool(p.supports_search())
|
||||||
if capability == "extract":
|
if capability == "extract":
|
||||||
return bool(p.supports_extract())
|
return bool(p.supports_extract())
|
||||||
|
if capability == "crawl":
|
||||||
|
return bool(p.supports_crawl())
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _is_available_safe(p: WebSearchProvider) -> bool:
|
def _is_available_safe(p: WebSearchProvider) -> bool:
|
||||||
|
|
@ -218,6 +220,20 @@ def get_active_extract_provider() -> Optional[WebSearchProvider]:
|
||||||
return _resolve(explicit, capability="extract")
|
return _resolve(explicit, capability="extract")
|
||||||
|
|
||||||
|
|
||||||
|
def get_active_crawl_provider() -> Optional[WebSearchProvider]:
|
||||||
|
"""Resolve the currently-active web crawl provider.
|
||||||
|
|
||||||
|
Reads ``web.crawl_backend`` (preferred) or ``web.backend`` (shared
|
||||||
|
fallback) from config.yaml; falls back per the module docstring.
|
||||||
|
|
||||||
|
Crawl is a niche capability — only Tavily implements it among built-in
|
||||||
|
providers. Most callers should expect ``None`` and fall back to a
|
||||||
|
different strategy (e.g. summarize-via-LLM).
|
||||||
|
"""
|
||||||
|
explicit = _read_config_key("web", "crawl_backend") or _read_config_key("web", "backend")
|
||||||
|
return _resolve(explicit, capability="crawl")
|
||||||
|
|
||||||
|
|
||||||
def _reset_for_tests() -> None:
|
def _reset_for_tests() -> None:
|
||||||
"""Clear the registry. **Test-only.**"""
|
"""Clear the registry. **Test-only.**"""
|
||||||
with _lock:
|
with _lock:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue