mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Self-review of the plugin migration surfaced one warning and a handful of
doc/dead-code cleanups. None affect production behaviour through the main
dispatcher (which always calls `tools.web_tools._get_backend()` first and
preserves the full 7-provider walk), but direct callers of
`agent.web_search_registry.get_active_*_provider()` previously diverged
from the legacy order and could return `None` for users with credentials
but no explicit `web.backend` config key.
Changes
-------
1. `_LEGACY_PREFERENCE` was shipped as a 4-tuple
`("brave-free", "firecrawl", "searxng", "ddgs")` while the PR
description and the legacy `_get_backend()` candidate order both
call for the 7-tuple
`("firecrawl", "parallel", "tavily", "exa", "searxng", "brave-free", "ddgs")`.
Replaced with the 7-tuple. Verified empirically: with TAVILY+EXA keys
and no config, `get_active_search_provider()` now returns tavily
(was None); with EXA+PARALLEL it returns parallel (was None); with
BRAVE+FIRECRAWL it returns firecrawl (was brave-free).
2. `agent/web_search_registry.py` — module docstring, `_resolve` step-3
docstring, and inline comment all listed the old 4-tuple and claimed
"brave-free first because it was the shipped default". The legacy
default is `"firecrawl"`. Rewritten to match the new ordering and
reference `tools.web_tools._get_backend()` as the source of truth.
3. `agent/web_search_registry.py` — `get_active_crawl_provider`
docstring said "only Tavily implements it among built-in providers".
Firecrawl also advertises `supports_crawl=True` after the previous
commit. Updated to "Tavily and Firecrawl".
4. `plugins/web/tavily/provider.py` — module docstring said "Tavily is
the only built-in backend that natively crawls". Updated.
5. `agent/web_search_provider.py` — ABC docstring mentioned only
`search` / `extract` capabilities. Added `crawl` for accuracy.
6. `plugins/web/{firecrawl,parallel,exa}/provider.py` — dead plugin-level
cache globals (`_firecrawl_client`, `_parallel_client`,
`_async_parallel_client`, `_exa_client`) were declared but never read
(all reads/writes go through `_wt.*` per the
`extracting-inline-helpers-to-plugins` recipe). Removed the dead declarations; the
reset-for-tests helpers in firecrawl + parallel now clear the
canonical `_wt._<name>` slots, matching the pattern exa already used.
Tests
-----
218/218 web-targeted tests still pass (no test changes needed). 4910/4910
in `tests/tools/` still green.
767 lines
29 KiB
Python
767 lines
29 KiB
Python
"""Firecrawl web search + extract — plugin form.
|
|
|
|
Subclasses :class:`agent.web_search_provider.WebSearchProvider`. This is
|
|
the largest provider migrated in this PR; it captures the full inline
|
|
firecrawl implementation that previously lived in tools/web_tools.py:
|
|
|
|
- :data:`Firecrawl` lazy proxy that defers the ~200ms SDK import to
|
|
first use (re-exported by tools.web_tools for backward compat with
|
|
existing tests that mock that name).
|
|
- :func:`_get_firecrawl_client` with direct + managed-gateway dual
|
|
mode, controlled by ``web.use_gateway`` config when both are
|
|
configured.
|
|
- :func:`check_firecrawl_api_key` re-exported (tests + tools_config
|
|
setup hint depend on this name living in tools.web_tools).
|
|
- :func:`_extract_web_search_results` / :func:`_extract_scrape_payload`
|
|
response-shape normalizers that handle SDK / direct API / gateway
|
|
response variants.
|
|
- Per-URL extract loop with 60s timeout, redirect-aware SSRF re-check,
|
|
website-policy gating, and format-aware content selection.
|
|
|
|
Async note: the underlying SDK is sync. ``extract()`` is declared
|
|
``async def`` because it performs per-URL I/O that benefits from
|
|
running in an executor; the implementation wraps each scrape in
|
|
:func:`asyncio.to_thread` with :func:`asyncio.wait_for` (``timeout=60``) to
|
|
guard against hung fetches.
|
|
|
|
Config keys this provider responds to::
|
|
|
|
web:
|
|
search_backend: "firecrawl" # explicit per-capability
|
|
extract_backend: "firecrawl" # explicit per-capability
|
|
backend: "firecrawl" # shared fallback (default)
|
|
use_gateway: false # prefer managed gateway when both
|
|
# direct + gateway credentials exist
|
|
|
|
Env vars::
|
|
|
|
FIRECRAWL_API_KEY=... # direct cloud auth
|
|
FIRECRAWL_API_URL=... # self-hosted Firecrawl
|
|
FIRECRAWL_GATEWAY_URL=... # Nous tool-gateway (subscribers)
|
|
TOOL_GATEWAY_DOMAIN=... # alternate gateway env
|
|
TOOL_GATEWAY_SCHEME=...
|
|
TOOL_GATEWAY_USER_TOKEN=...
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
from typing import Any, Dict, List, Optional, TYPE_CHECKING
|
|
|
|
from agent.web_search_provider import WebSearchProvider
|
|
from tools.website_policy import check_website_access
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Lazy Firecrawl SDK proxy
|
|
# ---------------------------------------------------------------------------
|
|
# The firecrawl SDK pulls ~200ms of imports (httpcore, firecrawl.v1/v2 type
|
|
# trees) on a cold CLI. We only need it when the backend is actually
|
|
# "firecrawl", so defer the import to first use via a callable proxy.
|
|
#
|
|
# Tests that do ``patch("tools.web_tools.Firecrawl", ...)`` continue to
|
|
# work because tools/web_tools.py re-exports ``Firecrawl`` from this
|
|
# module — so the patched name still references the same proxy instance.
|
|
|
|
if TYPE_CHECKING:
|
|
from firecrawl import Firecrawl as FirecrawlSDK # noqa: F401 — type hints only
|
|
|
|
_FIRECRAWL_CLS_CACHE: Optional[type] = None
|
|
|
|
|
|
def _load_firecrawl_cls() -> type:
|
|
"""Import and cache ``firecrawl.Firecrawl``."""
|
|
global _FIRECRAWL_CLS_CACHE
|
|
if _FIRECRAWL_CLS_CACHE is None:
|
|
try:
|
|
from tools.lazy_deps import ensure as _lazy_ensure
|
|
|
|
_lazy_ensure("search.firecrawl", prompt=False)
|
|
except ImportError:
|
|
pass
|
|
except Exception as exc: # noqa: BLE001 — surface install hint
|
|
raise ImportError(str(exc))
|
|
from firecrawl import Firecrawl as _cls # noqa: WPS433 — deliberately lazy
|
|
|
|
_FIRECRAWL_CLS_CACHE = _cls
|
|
return _FIRECRAWL_CLS_CACHE
|
|
|
|
|
|
class _FirecrawlProxy:
    """Stand-in for ``firecrawl.Firecrawl`` that resolves the real class on demand.

    Both calling the proxy and its ``__instancecheck__`` hook go through
    :func:`_load_firecrawl_cls`; constructing the proxy imports nothing.
    """

    # The proxy carries no per-instance state.
    __slots__ = ()

    def __call__(self, *ctor_args: Any, **ctor_kwargs: Any) -> Any:
        """Instantiate the real SDK class with the given arguments."""
        sdk_cls = _load_firecrawl_cls()
        return sdk_cls(*ctor_args, **ctor_kwargs)

    def __instancecheck__(self, obj: Any) -> bool:
        """Report whether *obj* is an instance of the real SDK class."""
        sdk_cls = _load_firecrawl_cls()
        return isinstance(obj, sdk_cls)

    def __repr__(self) -> str:
        return "<lazy firecrawl.Firecrawl proxy>"


# Module-level singleton used wherever the SDK class would be used.
Firecrawl = _FirecrawlProxy()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Client construction (direct vs managed-gateway)
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# The canonical cache slots live on :mod:`tools.web_tools` so tests that do
|
|
# ``tools.web_tools._firecrawl_client = None`` between cases see fresh
|
|
# state. The plugin reads/writes through that public module — see
|
|
# :func:`_get_firecrawl_client` below.
|
|
|
|
|
|
def _get_direct_firecrawl_config() -> Optional[tuple]:
|
|
"""Return explicit direct Firecrawl kwargs + cache key, or None when unset."""
|
|
api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
|
|
api_url = os.getenv("FIRECRAWL_API_URL", "").strip().rstrip("/")
|
|
|
|
if not api_key and not api_url:
|
|
return None
|
|
|
|
kwargs: Dict[str, str] = {}
|
|
if api_key:
|
|
kwargs["api_key"] = api_key
|
|
if api_url:
|
|
kwargs["api_url"] = api_url
|
|
|
|
return kwargs, ("direct", api_url or None, api_key or None)
|
|
|
|
|
|
def _get_firecrawl_gateway_url() -> str:
|
|
"""Return the configured Firecrawl gateway URL."""
|
|
import tools.web_tools as _wt
|
|
|
|
return _wt.build_vendor_gateway_url("firecrawl")
|
|
|
|
|
|
def _is_tool_gateway_ready() -> bool:
|
|
"""Return True when gateway URL + Nous Subscriber token are available.
|
|
|
|
Reads ``read_nous_access_token`` and ``resolve_managed_tool_gateway``
|
|
via :mod:`tools.web_tools` rather than direct imports, so unit tests
|
|
that ``patch("tools.web_tools._read_nous_access_token", ...)`` see
|
|
their patches honored. The names are re-exported on
|
|
:mod:`tools.web_tools` for exactly this reason.
|
|
"""
|
|
import tools.web_tools as _wt
|
|
|
|
return _wt.resolve_managed_tool_gateway(
|
|
"firecrawl", token_reader=_wt._read_nous_access_token
|
|
) is not None
|
|
|
|
|
|
def _has_direct_firecrawl_config() -> bool:
    """Tell whether :func:`_get_direct_firecrawl_config` finds direct settings."""
    direct_config = _get_direct_firecrawl_config()
    return direct_config is not None
|
|
|
|
|
|
def check_firecrawl_api_key() -> bool:
    """Tell whether the Firecrawl backend is usable, directly or via the gateway.

    Direct configuration is checked first; the gateway check only runs when
    no direct configuration is present.

    Re-exported by :mod:`tools.web_tools` for backward compatibility with
    existing tests and the ``hermes tools`` setup flow.
    """
    if _has_direct_firecrawl_config():
        return True
    return _is_tool_gateway_ready()
|
|
|
|
|
|
def _firecrawl_backend_help_suffix() -> str:
|
|
"""Return optional managed-gateway guidance for Firecrawl help text."""
|
|
import tools.web_tools as _wt
|
|
|
|
if not _wt.managed_nous_tools_enabled():
|
|
return ""
|
|
return (
|
|
", or use the Nous Tool Gateway via your subscription "
|
|
"(FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN)"
|
|
)
|
|
|
|
|
|
def _raise_web_backend_configuration_error() -> None:
|
|
"""Raise a clear error for unsupported web backend configuration."""
|
|
import tools.web_tools as _wt
|
|
|
|
message = (
|
|
"Web tools are not configured. "
|
|
"Set FIRECRAWL_API_KEY for cloud Firecrawl or set FIRECRAWL_API_URL "
|
|
"for a self-hosted Firecrawl instance."
|
|
)
|
|
if _wt.managed_nous_tools_enabled():
|
|
message += (
|
|
" With your Nous subscription you can also use the Tool Gateway — "
|
|
"run `hermes tools` and select Nous Subscription as the web provider."
|
|
)
|
|
raise ValueError(message)
|
|
|
|
|
|
def _get_firecrawl_client() -> Any:
    """Return the cached Firecrawl client, building it when needed.

    Path selection:

    - Direct configuration is used when it exists and
      ``tools.web_tools.prefers_gateway("web")`` is falsy.
    - Otherwise the managed Tool Gateway is resolved; per the config
      contract this is the ``web.use_gateway`` case, or the case where no
      direct credentials are set.

    The client and the key describing how it was built live on
    :mod:`tools.web_tools` (``_firecrawl_client`` and
    ``_firecrawl_client_config``), not on this module, so tests that reset
    ``tools.web_tools._firecrawl_client = None`` keep working. For the same
    reason ``prefers_gateway``, ``resolve_managed_tool_gateway``,
    ``_read_nous_access_token`` and ``Firecrawl`` are all looked up on
    :mod:`tools.web_tools`.

    Raises:
        ValueError: when neither direct config nor the gateway is usable.
    """
    import tools.web_tools as _wt

    direct_config = _get_direct_firecrawl_config()
    # ``and`` keeps prefers_gateway() from running when there is no direct config.
    use_direct = direct_config is not None and not _wt.prefers_gateway("web")

    if use_direct:
        client_kwargs, config_key = direct_config
    else:
        gateway = _wt.resolve_managed_tool_gateway(
            "firecrawl", token_reader=_wt._read_nous_access_token
        )
        if gateway is None:
            logger.error(
                "Firecrawl client initialization failed: "
                "missing direct config and tool-gateway auth."
            )
            # Always raises ValueError; nothing below runs in that case.
            _raise_web_backend_configuration_error()

        user_token = gateway.nous_user_token
        origin = gateway.gateway_origin
        client_kwargs = {"api_key": user_token, "api_url": origin}
        config_key = ("tool-gateway", origin, user_token)

    # Reuse the cached client only when it was built from the same settings.
    existing = getattr(_wt, "_firecrawl_client", None)
    existing_key = getattr(_wt, "_firecrawl_client_config", None)
    if existing is not None and existing_key == config_key:
        return existing

    # Construct via the Firecrawl name re-exported on tools.web_tools so
    # tests patching ``tools.web_tools.Firecrawl`` see their replacement.
    client = _wt.Firecrawl(**client_kwargs)
    _wt._firecrawl_client = client
    _wt._firecrawl_client_config = config_key
    return client
|
|
|
|
|
|
def _reset_client_for_tests() -> None:
|
|
"""Drop the cached Firecrawl client so tests can re-instantiate cleanly.
|
|
|
|
Clears the canonical slots on :mod:`tools.web_tools` (where
|
|
:func:`_get_firecrawl_client` reads/writes them).
|
|
"""
|
|
import tools.web_tools as _wt
|
|
|
|
_wt._firecrawl_client = None
|
|
_wt._firecrawl_client_config = None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Response shape normalization (SDK / direct / gateway differ)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _to_plain_object(value: Any) -> Any:
|
|
"""Convert SDK objects to plain python data structures when possible."""
|
|
if value is None:
|
|
return None
|
|
|
|
if isinstance(value, (dict, list, str, int, float, bool)):
|
|
return value
|
|
|
|
if hasattr(value, "model_dump"):
|
|
try:
|
|
return value.model_dump()
|
|
except Exception: # noqa: BLE001
|
|
pass
|
|
|
|
if hasattr(value, "__dict__"):
|
|
try:
|
|
return {k: v for k, v in value.__dict__.items() if not k.startswith("_")}
|
|
except Exception: # noqa: BLE001
|
|
pass
|
|
|
|
return value
|
|
|
|
|
|
def _normalize_result_list(values: Any) -> List[Dict[str, Any]]:
|
|
"""Normalize mixed SDK/list payloads into a list of dicts."""
|
|
if not isinstance(values, list):
|
|
return []
|
|
|
|
normalized: List[Dict[str, Any]] = []
|
|
for item in values:
|
|
plain = _to_plain_object(item)
|
|
if isinstance(plain, dict):
|
|
normalized.append(plain)
|
|
return normalized
|
|
|
|
|
|
def _extract_web_search_results(response: Any) -> List[Dict[str, Any]]:
    """Pull the result list out of a Firecrawl search response.

    Lookup order once the response is converted to plain data:

    1. ``data`` when it is a list.
    2. ``data["web"]``, then ``data["results"]`` when ``data`` is a dict.
    3. Top-level ``web``, then top-level ``results``.
    4. The ``web`` attribute of the original response object.

    The first non-empty normalized list wins for steps 2 and 3; steps 1 and
    4 return whatever they normalize to. ``[]`` is returned when nothing
    matches.
    """
    plain = _to_plain_object(response)

    if isinstance(plain, dict):
        data = plain.get("data")
        if isinstance(data, list):
            return _normalize_result_list(data)

        # Nested payload first (when present), then the top level.
        containers = [data, plain] if isinstance(data, dict) else [plain]
        for container in containers:
            for key in ("web", "results"):
                hits = _normalize_result_list(container.get(key))
                if hits:
                    return hits

    if hasattr(response, "web"):
        return _normalize_result_list(getattr(response, "web", []))

    return []
|
|
|
|
|
|
def _extract_scrape_payload(scrape_result: Any) -> Dict[str, Any]:
    """Return the dict carrying the scrape fields of a Firecrawl response.

    The result is converted with :func:`_to_plain_object`. When the converted
    value has a dict under ``data`` that nested dict is returned; otherwise
    the converted dict itself. Anything that does not convert to a dict
    yields ``{}``.
    """
    plain = _to_plain_object(scrape_result)
    if isinstance(plain, dict):
        inner = plain.get("data")
        return inner if isinstance(inner, dict) else plain
    return {}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Provider class
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _metadata_as_dict(metadata: Any) -> Dict[str, Any]:
    """Coerce page metadata (dict, SDK model, or plain object) into a dict.

    Delegates to :func:`_to_plain_object`; anything that does not come back
    as a dict (``None``, strings, ...) becomes ``{}``.
    """
    plain = _to_plain_object(metadata)
    return plain if isinstance(plain, dict) else {}


def _policy_block_info(blocked: Any) -> Dict[str, Any]:
    """Build the ``blocked_by_policy`` payload from a policy-gate hit."""
    return {
        "host": blocked["host"],
        "rule": blocked["rule"],
        "source": blocked["source"],
    }


def _crawl_failure(url: str, message: str) -> Dict[str, Any]:
    """Build the single-item ``{"results": [...]}`` error response for crawl."""
    return {"results": [{"url": url, "title": "", "content": "", "error": message}]}


def _crawl_item_fields(item: Any) -> tuple:
    """Return ``(markdown, html, raw_metadata)`` for one crawled page item.

    Handles objects exposing ``model_dump``, plain dicts, and attribute
    objects. Dicts are checked before ``__dict__`` so dict subclasses are
    read as mappings rather than as attribute bags.
    """
    if hasattr(item, "model_dump"):
        fields = item.model_dump()
        return fields.get("markdown"), fields.get("html"), fields.get("metadata", {})
    if isinstance(item, dict):
        return item.get("markdown"), item.get("html"), item.get("metadata", {})
    if hasattr(item, "__dict__"):
        return (
            getattr(item, "markdown", None),
            getattr(item, "html", None),
            getattr(item, "metadata", {}),
        )
    return None, None, {}


class FirecrawlWebSearchProvider(WebSearchProvider):
    """Firecrawl search + extract + crawl provider with dual auth paths."""

    @property
    def name(self) -> str:
        return "firecrawl"

    @property
    def display_name(self) -> str:
        return "Firecrawl"

    def is_available(self) -> bool:
        """Return True when direct Firecrawl OR managed-gateway path is configured."""
        return check_firecrawl_api_key()

    def supports_search(self) -> bool:
        return True

    def supports_extract(self) -> bool:
        return True

    def supports_crawl(self) -> bool:
        return True

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a Firecrawl search.

        Sync; calls ``_get_firecrawl_client().search(...)`` directly and
        normalizes the response across SDK/direct/gateway shapes via
        :func:`_extract_web_search_results`.

        Returns:
            ``{"success": True, "data": {"web": [...]}}`` on success, or
            ``{"success": False, "error": "..."}`` when interrupted, when
            the backend is not configured (``ValueError``), when the SDK is
            missing (``ImportError``), or on any other failure.
        """
        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                return {"success": False, "error": "Interrupted"}

            logger.info("Firecrawl search: '%s' (limit=%d)", query, limit)
            response = _get_firecrawl_client().search(query=query, limit=limit)
            web_results = _extract_web_search_results(response)
            logger.info("Firecrawl: found %d search results", len(web_results))
            return {"success": True, "data": {"web": web_results}}
        except ValueError as exc:
            return {"success": False, "error": str(exc)}
        except ImportError as exc:
            return {"success": False, "error": f"Firecrawl SDK not installed: {exc}"}
        except Exception as exc:  # noqa: BLE001
            logger.warning("Firecrawl search error: %s", exc)
            return {"success": False, "error": f"Firecrawl search failed: {exc}"}

    async def extract(self, urls: List[str], **kwargs: Any) -> List[Dict[str, Any]]:
        """Extract content from one or more URLs via Firecrawl.

        Async; each URL is scraped in a background thread with a 60s
        timeout. After scraping, the final URL (post-redirect) is
        re-checked against website-access policy.

        Accepted kwargs (others ignored for forward compat):
          - ``format``: ``"markdown"`` or ``"html"``; default is both
            (request both, return markdown when available).

        Returns the legacy per-URL list-of-results shape. Per-URL failures
        (timeout, scrape error, policy block) become items with an
        ``error`` field rather than raising.
        """
        from tools.interrupt import is_interrupted as _is_interrupted

        if _is_interrupted():
            return [{"url": u, "error": "Interrupted", "title": ""} for u in urls]

        # Named ``requested_format`` so the ``format`` builtin is not shadowed.
        requested_format = kwargs.get("format")
        if requested_format in ("markdown", "html"):
            formats: List[str] = [requested_format]
        else:
            formats = ["markdown", "html"]

        # check_website_access is the legacy policy gate; it is imported at
        # module level so monkeypatching it in tests works as expected.

        results: List[Dict[str, Any]] = []

        for url in urls:
            if _is_interrupted():
                results.append({"url": url, "error": "Interrupted", "title": ""})
                continue

            # Pre-scrape website policy gate
            blocked = check_website_access(url)
            if blocked:
                logger.info(
                    "Blocked web_extract for %s by rule %s",
                    blocked["host"],
                    blocked["rule"],
                )
                results.append(
                    {
                        "url": url,
                        "title": "",
                        "content": "",
                        "error": blocked["message"],
                        "blocked_by_policy": _policy_block_info(blocked),
                    }
                )
                continue

            try:
                logger.info("Firecrawl scraping: %s", url)
                try:
                    scrape_result = await asyncio.wait_for(
                        asyncio.to_thread(
                            _get_firecrawl_client().scrape,
                            url=url,
                            formats=formats,
                        ),
                        timeout=60,
                    )
                except asyncio.TimeoutError:
                    logger.warning("Firecrawl scrape timed out for %s", url)
                    results.append(
                        {
                            "url": url,
                            "title": "",
                            "content": "",
                            "error": (
                                "Scrape timed out after 60s — page may be too large "
                                "or unresponsive. Try browser_navigate instead."
                            ),
                        }
                    )
                    continue

                scrape_payload = _extract_scrape_payload(scrape_result)
                # The SDK may return a typed metadata object; make it a dict.
                metadata = _metadata_as_dict(scrape_payload.get("metadata", {}))
                content_markdown = scrape_payload.get("markdown")
                content_html = scrape_payload.get("html")

                # ``or`` (not a .get default) so keys present with a None or
                # empty value still fall back.
                title = metadata.get("title") or ""
                final_url = metadata.get("sourceURL") or url

                # Re-check website-access policy after any redirect
                final_blocked = check_website_access(final_url)
                if final_blocked:
                    logger.info(
                        "Blocked redirected web_extract for %s by rule %s",
                        final_blocked["host"],
                        final_blocked["rule"],
                    )
                    results.append(
                        {
                            "url": final_url,
                            "title": title,
                            "content": "",
                            "raw_content": "",
                            "error": final_blocked["message"],
                            "blocked_by_policy": _policy_block_info(final_blocked),
                        }
                    )
                    continue

                # Choose markdown vs html according to the requested format;
                # always end up with a string, never None.
                if requested_format == "markdown" or (
                    requested_format is None and content_markdown
                ):
                    chosen_content = content_markdown or ""
                else:
                    chosen_content = content_html or content_markdown or ""

                results.append(
                    {
                        "url": final_url,
                        "title": title,
                        "content": chosen_content,
                        "raw_content": chosen_content,
                        "metadata": metadata,
                    }
                )
            except Exception as scrape_err:  # noqa: BLE001
                logger.debug("Firecrawl scrape failed for %s: %s", url, scrape_err)
                results.append(
                    {
                        "url": url,
                        "title": "",
                        "content": "",
                        "raw_content": "",
                        "error": str(scrape_err),
                    }
                )

        return results

    async def crawl(self, url: str, **kwargs: Any) -> Dict[str, Any]:
        """Crawl a seed URL via Firecrawl's ``/crawl`` endpoint.

        The sync SDK call is wrapped in ``asyncio.to_thread``. Per the
        dispatcher contract, :func:`tools.web_tools.web_crawl_tool` gates the
        seed URL against SSRF + website-access policy before calling us;
        this method re-checks every crawled page's URL against the policy
        after the crawl returns to catch pages that map to a blocked host.

        Accepted kwargs (others ignored for forward compat):
          - ``instructions``: str — logged then dropped. Firecrawl's /crawl
            endpoint does NOT accept natural-language instructions (that's
            an /extract feature).
          - ``limit``: int — max pages to crawl (default 20; an explicit
            ``None`` also means the default).
          - ``depth``: str — accepted for API parity with Tavily; ignored
            by Firecrawl's crawl endpoint.

        Returns ``{"results": [...]}``. Pages blocked by policy are included
        as items with an ``error`` field. Failures of the crawl as a whole
        (interrupt, missing configuration, missing SDK, any other error)
        return a single error item for the seed URL rather than raising.
        """
        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                return _crawl_failure(url, "Interrupted")

            instructions = kwargs.get("instructions")
            limit = kwargs.get("limit")
            if limit is None:
                # Covers both "not passed" and an explicit ``limit=None``.
                limit = 20

            # Firecrawl's /crawl endpoint does not accept natural-language
            # instructions (that's an /extract feature). Log + drop.
            if instructions:
                logger.info(
                    "Firecrawl crawl: 'instructions' parameter ignored "
                    "(not supported by Firecrawl /crawl)"
                )

            logger.info("Firecrawl crawl: %s (limit=%d)", url, limit)

            crawl_params = {
                "limit": limit,
                "scrape_options": {"formats": ["markdown"]},
            }

            # The SDK call is sync; run in a thread so we don't block the
            # event loop on a multi-page crawl.
            crawl_result = await asyncio.to_thread(
                _get_firecrawl_client().crawl,
                url=url,
                **crawl_params,
            )

            # CrawlJob normalization across SDK + direct + gateway shapes.
            data_list: List[Any] = []
            if hasattr(crawl_result, "data"):
                data_list = crawl_result.data or []
                logger.info(
                    "Firecrawl crawl status: %s, %d pages",
                    getattr(crawl_result, "status", "unknown"),
                    len(data_list),
                )
            elif isinstance(crawl_result, dict) and "data" in crawl_result:
                data_list = crawl_result.get("data", []) or []
            else:
                logger.warning(
                    "Firecrawl crawl: unexpected result type %r",
                    type(crawl_result).__name__,
                )

            pages: List[Dict[str, Any]] = []
            for item in data_list:
                content_markdown, content_html, raw_metadata = _crawl_item_fields(item)
                metadata = _metadata_as_dict(raw_metadata)

                # ``or`` chains so keys present with a None value (e.g. unset
                # optional fields after model_dump()) still fall back.
                page_url = (
                    metadata.get("sourceURL") or metadata.get("url") or "Unknown URL"
                )
                title = metadata.get("title") or ""

                # Per-page policy re-check (catches blocked redirects).
                page_blocked = check_website_access(page_url)
                if page_blocked:
                    logger.info(
                        "Blocked crawled page %s by rule %s",
                        page_blocked["host"],
                        page_blocked["rule"],
                    )
                    pages.append(
                        {
                            "url": page_url,
                            "title": title,
                            "content": "",
                            "raw_content": "",
                            "error": page_blocked["message"],
                            "blocked_by_policy": _policy_block_info(page_blocked),
                        }
                    )
                    continue

                content = content_markdown or content_html or ""
                pages.append(
                    {
                        "url": page_url,
                        "title": title,
                        "content": content,
                        "raw_content": content,
                        "metadata": metadata,
                    }
                )

            return {"results": pages}
        except ValueError as exc:
            return _crawl_failure(url, str(exc))
        except ImportError as exc:
            return _crawl_failure(url, f"Firecrawl SDK not installed: {exc}")
        except Exception as exc:  # noqa: BLE001
            logger.warning("Firecrawl crawl error: %s", exc)
            return _crawl_failure(url, f"Firecrawl crawl failed: {exc}")

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return the static setup descriptor (name, badge, tag, env vars)."""
        return {
            "name": "Firecrawl",
            "badge": "paid · optional gateway",
            "tag": (
                "Full search + extract + crawl; supports direct API and "
                "Nous tool-gateway routing."
            ),
            "env_vars": [
                {
                    "key": "FIRECRAWL_API_KEY",
                    "prompt": "Firecrawl API key (or leave blank for self-hosted)",
                    "url": "https://docs.firecrawl.dev/introduction",
                },
            ],
        }
|