mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Self-review of the plugin migration surfaced one warning and a handful of
doc/dead-code cleanups. None affect production behaviour through the main
dispatcher (which always calls `tools.web_tools._get_backend()` first and
preserves the full 7-provider walk), but direct callers of
`agent.web_search_registry.get_active_*_provider()` previously diverged
from the legacy order and could return `None` for users with credentials
but no explicit `web.backend` config key.
Changes
-------
1. `_LEGACY_PREFERENCE` was shipped as a 4-tuple
`("brave-free", "firecrawl", "searxng", "ddgs")` while the PR
description and the legacy `_get_backend()` candidate order both
call for the 7-tuple
`(firecrawl, parallel, tavily, exa, searxng, brave-free, ddgs)`.
Replaced with the 7-tuple. Verified empirically: with TAVILY+EXA keys
and no config, `get_active_search_provider()` now returns tavily
(was None); with EXA+PARALLEL it returns parallel (was None); with
BRAVE+FIRECRAWL it returns firecrawl (was brave-free).
2. `agent/web_search_registry.py` — module docstring, `_resolve` step-3
docstring, and inline comment all listed the old 4-tuple and claimed
"brave-free first because it was the shipped default". The legacy
default is `"firecrawl"`. Rewritten to match the new ordering and
reference `tools.web_tools._get_backend()` as the source of truth.
3. `agent/web_search_registry.py` — `get_active_crawl_provider`
docstring said "only Tavily implements it among built-in providers".
Firecrawl also advertises `supports_crawl=True` after the previous
commit. Updated to "Tavily and Firecrawl".
4. `plugins/web/tavily/provider.py` — module docstring said "Tavily is
the only built-in backend that natively crawls". Updated.
5. `agent/web_search_provider.py` — ABC docstring mentioned only
`search` / `extract` capabilities. Added `crawl` for accuracy.
6. `plugins/web/{firecrawl,parallel,exa}/provider.py` — dead plugin-level
cache globals (`_firecrawl_client`, `_parallel_client`,
`_async_parallel_client`, `_exa_client`) were declared but never read
(all reads/writes go through `_wt.*` per the `extracting-inline-
helpers-to-plugins` recipe). Removed the dead declarations; the
reset-for-tests helpers in firecrawl + parallel now clear the
canonical `_wt._<name>` slots, matching the pattern exa already used.
Tests
-----
218/218 web-targeted tests still pass (no test changes needed). 4910/4910
in `tests/tools/` still green.
212 lines
7.1 KiB
Python
212 lines
7.1 KiB
Python
"""Exa web search + content extraction — plugin form.
|
|
|
|
Subclasses :class:`agent.web_search_provider.WebSearchProvider`. Uses the
official Exa SDK (``exa-py``) which is lazy-loaded via
:func:`tools.lazy_deps.ensure` so that cold-start CLI users don't pay the
SDK import cost when Exa isn't configured.

Config keys this provider responds to::

    web:
      search_backend: "exa"     # explicit per-capability
      extract_backend: "exa"    # explicit per-capability
      backend: "exa"            # shared fallback for both

Env var::

    EXA_API_KEY=...             # https://exa.ai (paid tier; free trial available)

The previous in-tree implementation lived at
``tools.web_tools._exa_search`` / ``_exa_extract``; this file is the
canonical replacement. Behavior is bit-for-bit identical aside from the
ABC method-name change.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
from typing import Any, Dict, List
|
|
|
|
from agent.web_search_provider import WebSearchProvider
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Module-level note: the canonical ``_exa_client`` cache slot lives on
|
|
# :mod:`tools.web_tools` so tests that do ``tools.web_tools._exa_client =
|
|
# None`` between cases see fresh state. The plugin reads/writes through
|
|
# that public module (see :func:`_get_exa_client`).
|
|
|
|
|
|
def _get_exa_client() -> Any:
    """Return the shared Exa SDK client, building and caching it on first use.

    The cache slot is ``tools.web_tools._exa_client`` rather than a global in
    this module, so unit tests that reset that attribute between cases get a
    fresh client on the next call.

    Returns:
        The cached client when one is already stored, otherwise a newly
        constructed ``exa_py.Exa`` instance carrying the
        ``x-exa-integration: hermes-agent`` header.

    Raises:
        ValueError: ``EXA_API_KEY`` is unset, empty, or whitespace-only.
        ImportError: ``exa_py`` cannot be imported, or
            ``tools.lazy_deps.ensure`` failed for a non-import reason (that
            original exception is chained as ``__cause__``).
    """
    import tools.web_tools as _wt

    cached = getattr(_wt, "_exa_client", None)
    if cached is not None:
        return cached

    # Strip before testing so a whitespace-only value is rejected here with
    # the "not set" message, the same way
    # ``ExaWebSearchProvider.is_available`` treats it, instead of being
    # handed to the SDK as a blank key.
    api_key = (os.getenv("EXA_API_KEY") or "").strip()
    if not api_key:
        raise ValueError(
            "EXA_API_KEY environment variable not set. "
            "Get your API key at https://exa.ai"
        )

    try:
        from tools.lazy_deps import ensure as _lazy_ensure

        _lazy_ensure("search.exa", prompt=False)
    except ImportError:
        # Deliberate best effort: the direct SDK import below is what
        # ultimately decides whether Exa is usable.
        pass
    except Exception as exc:  # noqa: BLE001 — lazy_deps surfaces install hints
        # Callers only handle ImportError for "SDK unavailable"; keep the
        # original failure attached as the explicit cause.
        raise ImportError(str(exc)) from exc

    from exa_py import Exa  # noqa: WPS433 — deliberately lazy

    client = Exa(api_key=api_key)
    client.headers["x-exa-integration"] = "hermes-agent"
    _wt._exa_client = client
    return client
|
|
|
|
|
|
def _reset_client_for_tests() -> None:
    """Clear the shared Exa client cache slot.

    Sets ``tools.web_tools._exa_client`` to ``None`` so that the next
    ``_get_exa_client`` call constructs a new client instead of reusing a
    previously cached one.
    """
    import tools.web_tools as cache_module

    cache_module._exa_client = None
|
|
|
|
|
|
class ExaWebSearchProvider(WebSearchProvider):
    """Web search provider backed by Exa, covering search and extract.

    Exa's SDK is sync-only, so both operations are plain synchronous
    methods. The ``web_extract_tool`` dispatcher wraps sync extracts via
    ``asyncio.to_thread`` when it needs to keep the event loop responsive.
    """

    @property
    def name(self) -> str:
        """Provider identifier; the ``"exa"`` value used in ``web`` config keys."""
        return "exa"

    @property
    def display_name(self) -> str:
        """Human-readable provider name."""
        return "Exa"

    def is_available(self) -> bool:
        """Report whether ``EXA_API_KEY`` holds a non-blank value."""
        api_key = os.getenv("EXA_API_KEY", "")
        return api_key.strip() != ""

    def supports_search(self) -> bool:
        """This provider implements :meth:`search`."""
        return True

    def supports_extract(self) -> bool:
        """This provider implements :meth:`extract`."""
        return True

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Run an Exa search and normalise the hits.

        Args:
            query: Search text passed to the Exa client.
            limit: Forwarded to the SDK as ``num_results``.

        Returns:
            ``{"success": True, "data": {"web": [...]}}`` where each entry
            has ``url``, ``title``, ``description`` (the hit's highlights
            joined with spaces) and a 1-based ``position``. On any failure
            (interrupt, missing API key, SDK import error, or any other
            exception) returns ``{"success": False, "error": str}`` instead
            of raising.
        """
        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                return {"success": False, "error": "Interrupted"}

            logger.info("Exa search: '%s' (limit=%d)", query, limit)
            client = _get_exa_client()
            response = client.search(
                query,
                num_results=limit,
                contents={"highlights": True},
            )

            hits = [
                {
                    "url": hit.url or "",
                    "title": hit.title or "",
                    # Joining an empty list yields "", covering the
                    # no-highlights case.
                    "description": " ".join(hit.highlights or []),
                    "position": rank,
                }
                for rank, hit in enumerate(response.results or [], start=1)
            ]
            return {"success": True, "data": {"web": hits}}
        except ValueError as exc:
            # Raised by _get_exa_client when EXA_API_KEY missing
            return {"success": False, "error": str(exc)}
        except ImportError as exc:
            return {"success": False, "error": f"Exa SDK not installed: {exc}"}
        except Exception as exc:  # noqa: BLE001 — surface as failure
            logger.warning("Exa search error: %s", exc)
            return {"success": False, "error": f"Exa search failed: {exc}"}

    def extract(self, urls: List[str], **kwargs: Any) -> List[Dict[str, Any]]:
        """Fetch page content for ``urls`` through Exa.

        Args:
            urls: URLs handed to the SDK's ``get_contents`` in one call.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            One dict per result returned by Exa, with ``url``, ``title``,
            ``content``, ``raw_content`` (same text as ``content``) and
            ``metadata``. If the call is interrupted or fails, returns one
            dict per requested URL carrying an ``error`` field instead of
            raising.
        """

        def failure_rows(message: str) -> List[Dict[str, Any]]:
            # Whole-batch failure: report the same message for every URL.
            rows: List[Dict[str, Any]] = []
            for target in urls:
                rows.append(
                    {"url": target, "title": "", "content": "", "error": message}
                )
            return rows

        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                return [
                    {"url": target, "error": "Interrupted", "title": ""}
                    for target in urls
                ]

            logger.info("Exa extract: %d URL(s)", len(urls))
            response = _get_exa_client().get_contents(urls, text=True)

            documents: List[Dict[str, Any]] = []
            for doc in response.results or []:
                body = doc.text or ""
                source_url = doc.url or ""
                heading = doc.title or ""
                row: Dict[str, Any] = {
                    "url": source_url,
                    "title": heading,
                    "content": body,
                    "raw_content": body,
                    "metadata": {"sourceURL": source_url, "title": heading},
                }
                documents.append(row)
            return documents
        except ValueError as exc:
            return failure_rows(str(exc))
        except ImportError as exc:
            return failure_rows(f"Exa SDK not installed: {exc}")
        except Exception as exc:  # noqa: BLE001
            logger.warning("Exa extract error: %s", exc)
            return failure_rows(f"Exa extract failed: {exc}")

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return the static setup descriptor for this provider.

        The descriptor holds the provider name, a ``badge`` and ``tag``
        string, and a single ``env_vars`` entry for ``EXA_API_KEY`` with
        its prompt label and URL.
        """
        api_key_field = {
            "key": "EXA_API_KEY",
            "prompt": "Exa API key",
            "url": "https://exa.ai",
        }
        return {
            "name": "Exa",
            "badge": "paid",
            "tag": "Semantic + neural web search with content extraction.",
            "env_vars": [api_key_field],
        }
|