mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-27 06:11:40 +00:00
feat(web): tavily plugin — first three-capability plugin (search + extract + crawl)
Migrates Tavily from inline _tavily_request() / _normalize_tavily_*
helpers in tools/web_tools.py to a bundled plugin at plugins/web/tavily/.
First plugin in the codebase to advertise supports_crawl=True. Tavily is
unique among built-in backends in offering a native /crawl endpoint that
walks linked pages from a seed URL with optional natural-language
instructions and depth ("basic" or "advanced").
Capabilities:
- supports_search() -> True (Tavily /search)
- supports_extract() -> True (Tavily /extract)
- supports_crawl() -> True (Tavily /crawl)
All sync (httpx.post under the hood).
The crawl method accepts forward-compat kwargs (instructions, depth,
limit) and is gated against unsafe URLs/policy by the dispatcher in
web_crawl_tool — exactly as before.
Behavior preserved:
- TAVILY_API_KEY required (ValueError → typed error response)
- TAVILY_BASE_URL env override honored
- /crawl requires both body auth AND Bearer header — preserved
- failed_results[] and failed_urls[] response keys mapped to per-URL
items with error fields rather than raising
- max_results clamped to 20 client-side (min(limit, 20)) before the request is sent
Adds "tavily" to _WEB_PLUGIN_SKIPLIST.
The legacy inline _tavily_request / _normalize_tavily_search_results /
_normalize_tavily_documents / _TAVILY_BASE_URL in tools/web_tools.py are
NOT deleted yet — search/extract dispatch and the entire web_crawl_tool
function still reference them. They go away when those dispatchers are
cut over to the registry.
E2E verified:
- Tavily registers with all 3 capabilities
- Provider list now: brave-free, ddgs, exa, parallel, searxng, tavily
This commit is contained in:
parent
4816646109
commit
31fcde876c
4 changed files with 308 additions and 1 deletions
|
|
@ -1586,7 +1586,7 @@ def _plugin_video_gen_providers() -> list[dict]:
|
||||||
# removed and this helper becomes the sole source of web-provider picker
|
# removed and this helper becomes the sole source of web-provider picker
|
||||||
# rows (matching how Spotify / Google Meet are surfaced today purely from
|
# rows (matching how Spotify / Google Meet are surfaced today purely from
|
||||||
# their plugins).
|
# their plugins).
|
||||||
_WEB_PLUGIN_SKIPLIST = frozenset({"brave-free", "ddgs", "searxng", "exa", "parallel"})
|
_WEB_PLUGIN_SKIPLIST = frozenset({"brave-free", "ddgs", "searxng", "exa", "parallel", "tavily"})
|
||||||
|
|
||||||
|
|
||||||
def _plugin_web_search_providers() -> list[dict]:
|
def _plugin_web_search_providers() -> list[dict]:
|
||||||
|
|
|
||||||
15
plugins/web/tavily/__init__.py
Normal file
15
plugins/web/tavily/__init__.py
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
"""Tavily web search + extract + crawl plugin — bundled, auto-loaded.
|
||||||
|
|
||||||
|
First plugin in this codebase to advertise ``supports_crawl=True``. The
|
||||||
|
crawl method maps to Tavily's ``/crawl`` endpoint, which accepts a seed
|
||||||
|
URL plus optional instructions and extract depth.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from plugins.web.tavily.provider import TavilyWebSearchProvider
|
||||||
|
|
||||||
|
|
||||||
|
def register(ctx) -> None:
    """Plugin entry point: hand a Tavily provider instance to ``ctx``.

    Builds one :class:`TavilyWebSearchProvider` and passes it to
    ``ctx.register_web_search_provider``.
    """
    provider = TavilyWebSearchProvider()
    ctx.register_web_search_provider(provider)
|
||||||
7
plugins/web/tavily/plugin.yaml
Normal file
7
plugins/web/tavily/plugin.yaml
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
name: web-tavily
|
||||||
|
version: 1.0.0
|
||||||
|
description: "Tavily web search + content extraction + crawl. Search + extract are mainstream; crawl is unique to Tavily among built-in providers. Requires TAVILY_API_KEY — sign up at https://app.tavily.com/home."
|
||||||
|
author: NousResearch
|
||||||
|
kind: backend
|
||||||
|
provides_web_providers:
|
||||||
|
- tavily
|
||||||
285
plugins/web/tavily/provider.py
Normal file
285
plugins/web/tavily/provider.py
Normal file
|
|
@ -0,0 +1,285 @@
|
||||||
|
"""Tavily web search + content extraction + crawl — plugin form.
|
||||||
|
|
||||||
|
Subclasses :class:`agent.web_search_provider.WebSearchProvider`. Three
|
||||||
|
capabilities advertised:
|
||||||
|
|
||||||
|
- ``supports_search()`` -> True (Tavily ``/search``)
|
||||||
|
- ``supports_extract()`` -> True (Tavily ``/extract``)
|
||||||
|
- ``supports_crawl()`` -> True (Tavily ``/crawl``) — Tavily is the only
|
||||||
|
built-in backend that natively crawls
|
||||||
|
|
||||||
|
All three are sync — the underlying call is ``httpx.post(...)``. The
|
||||||
|
dispatcher in :func:`tools.web_tools.web_crawl_tool` (which is itself
|
||||||
|
async) will run sync providers in a thread when appropriate.
|
||||||
|
|
||||||
|
Config keys this provider responds to::
|
||||||
|
|
||||||
|
web:
|
||||||
|
search_backend: "tavily" # explicit per-capability
|
||||||
|
extract_backend: "tavily" # explicit per-capability
|
||||||
|
crawl_backend: "tavily" # explicit per-capability
|
||||||
|
backend: "tavily" # shared fallback for all three
|
||||||
|
|
||||||
|
Env vars::
|
||||||
|
|
||||||
|
TAVILY_API_KEY=... # https://app.tavily.com/home (required)
|
||||||
|
TAVILY_BASE_URL=... # optional override of https://api.tavily.com
|
||||||
|
|
||||||
|
Auth note: Tavily uses ``api_key`` in the JSON body for /search and
|
||||||
|
/extract, but **also requires** ``Authorization: Bearer <key>`` for /crawl
|
||||||
|
(body-only auth returns 401 on /crawl). The plugin handles both.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
from agent.web_search_provider import WebSearchProvider
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _tavily_request(endpoint: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""POST to the Tavily API and return the parsed JSON response.
|
||||||
|
|
||||||
|
Mirrors :func:`tools.web_tools._tavily_request`. Raises ``ValueError``
|
||||||
|
when ``TAVILY_API_KEY`` is unset; the caller catches and surfaces as
|
||||||
|
a typed error response.
|
||||||
|
"""
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
api_key = os.getenv("TAVILY_API_KEY")
|
||||||
|
if not api_key:
|
||||||
|
raise ValueError(
|
||||||
|
"TAVILY_API_KEY environment variable not set. "
|
||||||
|
"Get your API key at https://app.tavily.com/home"
|
||||||
|
)
|
||||||
|
|
||||||
|
base_url = os.getenv("TAVILY_BASE_URL", "https://api.tavily.com")
|
||||||
|
payload = dict(payload) # don't mutate caller's dict
|
||||||
|
payload["api_key"] = api_key
|
||||||
|
url = f"{base_url}/{endpoint.lstrip('/')}"
|
||||||
|
logger.info("Tavily %s request to %s", endpoint, url)
|
||||||
|
|
||||||
|
# Tavily /crawl requires Bearer header auth in addition to body auth;
|
||||||
|
# /search and /extract are body-only.
|
||||||
|
headers = {"Authorization": f"Bearer {api_key}"} if endpoint.strip("/") == "crawl" else {}
|
||||||
|
|
||||||
|
response = httpx.post(url, json=payload, headers=headers, timeout=60)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_tavily_search_results(response: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
|
"""Map Tavily ``/search`` response to ``{success, data: {web: [...]}}``."""
|
||||||
|
web_results = []
|
||||||
|
for i, result in enumerate(response.get("results", [])):
|
||||||
|
web_results.append(
|
||||||
|
{
|
||||||
|
"title": result.get("title", ""),
|
||||||
|
"url": result.get("url", ""),
|
||||||
|
"description": result.get("content", ""),
|
||||||
|
"position": i + 1,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return {"success": True, "data": {"web": web_results}}
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_tavily_documents(
|
||||||
|
response: Dict[str, Any], fallback_url: str = ""
|
||||||
|
) -> List[Dict[str, Any]]:
|
||||||
|
"""Map Tavily ``/extract`` or ``/crawl`` response to standard documents.
|
||||||
|
|
||||||
|
Documents follow the legacy LLM post-processing shape::
|
||||||
|
|
||||||
|
{"url", "title", "content", "raw_content", "metadata"}
|
||||||
|
|
||||||
|
Failures (``failed_results``, ``failed_urls``) become result entries
|
||||||
|
with an ``error`` field rather than raising.
|
||||||
|
"""
|
||||||
|
documents: List[Dict[str, Any]] = []
|
||||||
|
for result in response.get("results", []):
|
||||||
|
url = result.get("url", fallback_url)
|
||||||
|
raw = result.get("raw_content", "") or result.get("content", "")
|
||||||
|
documents.append(
|
||||||
|
{
|
||||||
|
"url": url,
|
||||||
|
"title": result.get("title", ""),
|
||||||
|
"content": raw,
|
||||||
|
"raw_content": raw,
|
||||||
|
"metadata": {"sourceURL": url, "title": result.get("title", "")},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
for fail in response.get("failed_results", []):
|
||||||
|
documents.append(
|
||||||
|
{
|
||||||
|
"url": fail.get("url", fallback_url),
|
||||||
|
"title": "",
|
||||||
|
"content": "",
|
||||||
|
"raw_content": "",
|
||||||
|
"error": fail.get("error", "extraction failed"),
|
||||||
|
"metadata": {"sourceURL": fail.get("url", fallback_url)},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
for fail_url in response.get("failed_urls", []):
|
||||||
|
url_str = fail_url if isinstance(fail_url, str) else str(fail_url)
|
||||||
|
documents.append(
|
||||||
|
{
|
||||||
|
"url": url_str,
|
||||||
|
"title": "",
|
||||||
|
"content": "",
|
||||||
|
"raw_content": "",
|
||||||
|
"error": "extraction failed",
|
||||||
|
"metadata": {"sourceURL": url_str},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return documents
|
||||||
|
|
||||||
|
|
||||||
|
class TavilyWebSearchProvider(WebSearchProvider):
    """Tavily search + extract + crawl provider.

    All three capability methods are synchronous. Exceptions raised while
    talking to Tavily are caught and returned as error payloads in the shape
    each method documents, so callers do not need their own try/except.
    """

    @property
    def name(self) -> str:
        """Registry identifier for this provider."""
        return "tavily"

    @property
    def display_name(self) -> str:
        """Human-readable provider name."""
        return "Tavily"

    def is_available(self) -> bool:
        """Return True when ``TAVILY_API_KEY`` is set to a non-empty value."""
        return bool(os.getenv("TAVILY_API_KEY", "").strip())

    def supports_search(self) -> bool:
        """Tavily ``/search`` is supported."""
        return True

    def supports_extract(self) -> bool:
        """Tavily ``/extract`` is supported."""
        return True

    def supports_crawl(self) -> bool:
        """Tavily ``/crawl`` is supported."""
        return True

    @staticmethod
    def _error_item(url: str, message: str) -> Dict[str, Any]:
        """Build the per-URL failure entry shared by extract and crawl."""
        return {"url": url, "title": "", "content": "", "error": message}

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a Tavily search.

        Args:
            query: Search query text.
            limit: Maximum number of results; values above 20 are clamped
                to 20 before the request is sent.

        Returns:
            ``{"success": True, "data": {"web": [...]}}`` on success, or
            ``{"success": False, "error": ...}`` when interrupted, when the
            API key is missing, or when the request fails.
        """
        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                return {"success": False, "error": "Interrupted"}

            logger.info("Tavily search: '%s' (limit=%d)", query, limit)
            raw = _tavily_request(
                "search",
                {
                    "query": query,
                    "max_results": min(limit, 20),
                    "include_raw_content": False,
                    "include_images": False,
                },
            )
            return _normalize_tavily_search_results(raw)
        except ValueError as exc:
            # Missing-key errors carry their own user-facing message.
            return {"success": False, "error": str(exc)}
        except Exception as exc:  # noqa: BLE001 — including httpx errors
            logger.warning("Tavily search error: %s", exc)
            return {"success": False, "error": f"Tavily search failed: {exc}"}

    def extract(self, urls: List[str], **kwargs: Any) -> List[Dict[str, Any]]:
        """Extract content from one or more URLs via Tavily.

        Sync — the underlying call is httpx.post(...). Returns the legacy
        list-of-results shape; per-URL failures become items with ``error``.

        Args:
            urls: URLs to extract. An empty list returns ``[]`` without
                contacting Tavily.
            **kwargs: Accepted for forward compatibility; currently unused.

        Returns:
            One document dict per result or failure reported by Tavily. When
            the whole call fails (interrupted, missing key, request error),
            one error item per input URL, each with ``url``, ``title``,
            ``content`` and ``error`` keys.
        """
        if not urls:
            # Nothing to fetch: every failure path would also yield [] here,
            # so skip the pointless network round trip.
            return []

        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                # Same keys as the other error items so consumers can rely
                # on "content" always being present.
                return [self._error_item(u, "Interrupted") for u in urls]

            logger.info("Tavily extract: %d URL(s)", len(urls))
            raw = _tavily_request(
                "extract",
                {
                    "urls": urls,
                    "include_images": False,
                },
            )
            return _normalize_tavily_documents(raw, fallback_url=urls[0])
        except ValueError as exc:
            return [self._error_item(u, str(exc)) for u in urls]
        except Exception as exc:  # noqa: BLE001
            logger.warning("Tavily extract error: %s", exc)
            return [
                self._error_item(u, f"Tavily extract failed: {exc}") for u in urls
            ]

    def crawl(self, url: str, **kwargs: Any) -> Dict[str, Any]:
        """Crawl a seed URL via Tavily's ``/crawl`` endpoint.

        Accepted kwargs (others ignored for forward compat):
          - ``instructions``: str — natural-language guidance for the crawl
          - ``depth``: str — ``"basic"`` (default) or ``"advanced"``
          - ``limit``: int — max pages to crawl (default 20)

        A kwarg passed explicitly as ``None`` is treated as absent, so the
        documented default applies instead of ``null`` being sent to Tavily.

        Returns ``{"results": [...]}`` shaped to match what
        :func:`tools.web_tools.web_crawl_tool` post-processes. On
        interruption, missing key, or request failure the list holds a
        single error item for ``url``.
        """
        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                return {"results": [self._error_item(url, "Interrupted")]}

            instructions = kwargs.get("instructions")
            # ``kwargs.get(key, default)`` only covers a missing key; callers
            # that forward ``depth=None`` / ``limit=None`` need the default too.
            depth = kwargs.get("depth") or "basic"
            limit = kwargs.get("limit")
            if limit is None:
                limit = 20

            # %s (not %d) so an unexpected limit type cannot break logging.
            logger.info("Tavily crawl: %s (depth=%s, limit=%s)", url, depth, limit)
            payload: Dict[str, Any] = {
                "url": url,
                "limit": limit,
                "extract_depth": depth,
            }
            if instructions:
                payload["instructions"] = instructions

            raw = _tavily_request("crawl", payload)
            return {"results": _normalize_tavily_documents(raw, fallback_url=url)}
        except ValueError as exc:
            return {"results": [self._error_item(url, str(exc))]}
        except Exception as exc:  # noqa: BLE001
            logger.warning("Tavily crawl error: %s", exc)
            return {
                "results": [self._error_item(url, f"Tavily crawl failed: {exc}")]
            }

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return the setup metadata for this provider.

        Lists the display name, badge, tagline and the single required
        environment variable (``TAVILY_API_KEY``) with its sign-up URL.
        """
        return {
            "name": "Tavily",
            "badge": "paid",
            "tag": "Search + extract + crawl in one provider.",
            "env_vars": [
                {
                    "key": "TAVILY_API_KEY",
                    "prompt": "Tavily API key",
                    "url": "https://app.tavily.com/home",
                },
            ],
        }
|
||||||
Loading…
Add table
Add a link
Reference in a new issue