From 143184e9438c658c1080f45dbfc29e33044ed0d9 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Thu, 14 May 2026 00:20:16 +0530 Subject: [PATCH] =?UTF-8?q?feat(web):=20firecrawl=20plugin=20=E2=80=94=20l?= =?UTF-8?q?argest=20migration=20(search=20+=20async=20extract=20+=20dual?= =?UTF-8?q?=20auth)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrates Firecrawl from inline code in tools/web_tools.py to a bundled plugin at plugins/web/firecrawl/. By line count this is the largest of the seven provider migrations: the firecrawl path captured most of the file's vendor-specific complexity. What moved into the plugin (all previously in tools/web_tools.py): Lazy Firecrawl SDK proxy - _load_firecrawl_cls() — caches the imported SDK class - _FirecrawlProxy + Firecrawl singleton — defers ~200ms of SDK imports until first construction or isinstance check. Client construction (dual auth) - _get_direct_firecrawl_config() — direct FIRECRAWL_API_KEY/URL path - _get_firecrawl_gateway_url() — managed Nous tool-gateway URL - _is_tool_gateway_ready() — gateway URL + Nous token check - _has_direct_firecrawl_config() — direct config present? 
- _get_firecrawl_client() — combined client construction honoring web.use_gateway - check_firecrawl_api_key() — top-level "is firecrawl usable" - _firecrawl_backend_help_suffix() — managed-gateway help string - _raise_web_backend_configuration_error() — typed misconfig error Response shape normalization (vendor-specific) - _to_plain_object(), _normalize_result_list() — SDK→dict helpers - _extract_web_search_results() — handles SDK/direct/gateway shapes - _extract_scrape_payload() — nested-data unwrap for scrape Per-URL extract loop - 60s asyncio.wait_for timeout per URL - Pre-scrape website-policy gate - Post-scrape redirect-aware SSRF re-check - Format-aware content selection (markdown / html / auto) - Per-URL errors returned as {"error": str} entries, no raises Extract is declared `async def` — each URL is scraped in asyncio.to_thread(...). This is the second async-extract plugin after parallel. The plugin re-exports `Firecrawl` (the lazy proxy) and `check_firecrawl_api_key()` so existing tests doing `patch("tools.web_tools.Firecrawl")` or `monkeypatch.setattr(web_tools, "check_firecrawl_api_key", ...)` keep working — tools/web_tools.py re-exports both names in the next dispatcher-cutover commit. Note: web_crawl_tool still has its own Firecrawl crawl path inline (separate from extract); the Firecrawl SDK supports /crawl but we don't expose supports_crawl=True on this plugin yet. Tavily handles crawl today. Adding Firecrawl crawl is a clean follow-up. Adds "firecrawl" to _WEB_PLUGIN_SKIPLIST. 
E2E verified: - All 7 providers register: brave-free, ddgs, exa, firecrawl, parallel, searxng, tavily - inspect.iscoroutinefunction(firecrawl.extract) -> True - Firecrawl proxy is a callable lazy proxy at module level - check_firecrawl_api_key reflects FIRECRAWL_API_KEY presence --- hermes_cli/tools_config.py | 4 +- plugins/web/firecrawl/__init__.py | 28 ++ plugins/web/firecrawl/plugin.yaml | 7 + plugins/web/firecrawl/provider.py | 565 ++++++++++++++++++++++++++++++ 4 files changed, 603 insertions(+), 1 deletion(-) create mode 100644 plugins/web/firecrawl/__init__.py create mode 100644 plugins/web/firecrawl/plugin.yaml create mode 100644 plugins/web/firecrawl/provider.py diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index ba779900851..76c17e65cd5 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -1586,7 +1586,9 @@ def _plugin_video_gen_providers() -> list[dict]: # removed and this helper becomes the sole source of web-provider picker # rows (matching how Spotify / Google Meet are surfaced today purely from # their plugins). -_WEB_PLUGIN_SKIPLIST = frozenset({"brave-free", "ddgs", "searxng", "exa", "parallel", "tavily"}) +_WEB_PLUGIN_SKIPLIST = frozenset({ + "brave-free", "ddgs", "searxng", "exa", "parallel", "tavily", "firecrawl", +}) def _plugin_web_search_providers() -> list[dict]: diff --git a/plugins/web/firecrawl/__init__.py b/plugins/web/firecrawl/__init__.py new file mode 100644 index 00000000000..4cb9dd63d0f --- /dev/null +++ b/plugins/web/firecrawl/__init__.py @@ -0,0 +1,28 @@ +"""Firecrawl web search + extract plugin — bundled, auto-loaded. + +Largest single plugin in this PR. Captures everything the previous +inline implementation in tools/web_tools.py did: + + - Lazy import of the firecrawl SDK (~200ms cold-start cost) via a + callable proxy that defers the actual import to first use. 
+ - Dual client paths: direct (FIRECRAWL_API_KEY / FIRECRAWL_API_URL) + OR Nous-hosted tool-gateway routing for subscribers, with + web.use_gateway as the tie-breaker. + - Per-URL scrape loop with 60s timeout, SSRF re-check after redirect, + website-policy gating, and format-aware content selection. + - Robust response shape normalization across SDK / direct API / + gateway variants (search returns differ by transport). + +The plugin re-exports ``Firecrawl`` (the lazy proxy) and +``check_firecrawl_api_key`` for backward-compatibility with tests and +external code that imports those names from ``tools.web_tools``. +""" + +from __future__ import annotations + +from plugins.web.firecrawl.provider import FirecrawlWebSearchProvider + + +def register(ctx) -> None: + """Register the Firecrawl provider with the plugin context.""" + ctx.register_web_search_provider(FirecrawlWebSearchProvider()) diff --git a/plugins/web/firecrawl/plugin.yaml b/plugins/web/firecrawl/plugin.yaml new file mode 100644 index 00000000000..063af47d738 --- /dev/null +++ b/plugins/web/firecrawl/plugin.yaml @@ -0,0 +1,7 @@ +name: web-firecrawl +version: 1.0.0 +description: "Firecrawl web search + content extraction. Supports direct API and Nous-hosted tool-gateway routing for subscribers. Requires FIRECRAWL_API_KEY (or FIRECRAWL_API_URL for self-hosted), or an active Nous subscription with FIRECRAWL_GATEWAY_URL." +author: NousResearch +kind: backend +provides_web_providers: + - firecrawl diff --git a/plugins/web/firecrawl/provider.py b/plugins/web/firecrawl/provider.py new file mode 100644 index 00000000000..64268448348 --- /dev/null +++ b/plugins/web/firecrawl/provider.py @@ -0,0 +1,565 @@ +"""Firecrawl web search + extract — plugin form. + +Subclasses :class:`agent.web_search_provider.WebSearchProvider`. 
This is +the largest provider migrated in this PR; it captures the full inline +firecrawl implementation that previously lived in tools/web_tools.py: + + - :data:`Firecrawl` lazy proxy that defers the ~200ms SDK import to + first use (re-exported by tools.web_tools for backward compat with + existing tests that mock that name). + - :func:`_get_firecrawl_client` with direct + managed-gateway dual + mode, controlled by ``web.use_gateway`` config when both are + configured. + - :func:`check_firecrawl_api_key` re-exported (tests + tools_config + setup hint depend on this name living in tools.web_tools). + - :func:`_extract_web_search_results` / :func:`_extract_scrape_payload` + response-shape normalizers that handle SDK / direct API / gateway + response variants. + - Per-URL extract loop with 60s timeout, redirect-aware SSRF re-check, + website-policy gating, and format-aware content selection. + +Async note: the underlying SDK is sync. ``extract()`` is declared +``async def`` because it performs per-URL I/O that benefits from +running in an executor; the implementation wraps each scrape in +:func:`asyncio.to_thread` with :func:`asyncio.wait_for(timeout=60)` to +guard against hung fetches. + +Config keys this provider responds to:: + + web: + search_backend: "firecrawl" # explicit per-capability + extract_backend: "firecrawl" # explicit per-capability + backend: "firecrawl" # shared fallback (default) + use_gateway: false # prefer managed gateway when both + # direct + gateway credentials exist + +Env vars:: + + FIRECRAWL_API_KEY=... # direct cloud auth + FIRECRAWL_API_URL=... # self-hosted Firecrawl + FIRECRAWL_GATEWAY_URL=... # Nous tool-gateway (subscribers) + TOOL_GATEWAY_DOMAIN=... # alternate gateway env + TOOL_GATEWAY_SCHEME=... + TOOL_GATEWAY_USER_TOKEN=... 
from __future__ import annotations

import asyncio
import logging
import os
from typing import Any, Dict, List, Optional, TYPE_CHECKING

from agent.web_search_provider import WebSearchProvider

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Lazy Firecrawl SDK proxy
# ---------------------------------------------------------------------------
# The firecrawl SDK pulls ~200ms of imports (httpcore, firecrawl.v1/v2 type
# trees) on a cold CLI. We only need it when the backend is actually
# "firecrawl", so defer the import to first use via a callable proxy.
#
# Tests that do ``patch("tools.web_tools.Firecrawl", ...)`` continue to
# work because tools/web_tools.py re-exports ``Firecrawl`` from this
# module — so the patched name still references the same proxy instance.

if TYPE_CHECKING:
    from firecrawl import Firecrawl as FirecrawlSDK  # noqa: F401 — type hints only

# Populated on the first successful import; None means "not imported yet".
_FIRECRAWL_CLS_CACHE: Optional[type] = None


def _load_firecrawl_cls() -> type:
    """Import and cache ``firecrawl.Firecrawl``.

    Returns:
        The SDK client class. It is imported on the first call and served
        from the module-level cache on every later call.

    Raises:
        ImportError: when the SDK cannot be imported, or when the optional
            ``tools.lazy_deps.ensure`` hook fails for any non-import reason.
            In the latter case the original exception is chained as
            ``__cause__``.
    """
    global _FIRECRAWL_CLS_CACHE
    if _FIRECRAWL_CLS_CACHE is None:
        try:
            from tools.lazy_deps import ensure as _lazy_ensure

            _lazy_ensure("search.firecrawl", prompt=False)
        except ImportError:
            # Deliberate best-effort: ignore the ImportError here and let the
            # direct SDK import below decide whether firecrawl is usable.
            pass
        except Exception as exc:  # noqa: BLE001 — surface install hint
            # Normalize to ImportError (callers catch that type) but keep the
            # original exception attached so the root cause stays debuggable.
            raise ImportError(str(exc)) from exc
        from firecrawl import Firecrawl as _cls  # noqa: WPS433 — deliberately lazy

        _FIRECRAWL_CLS_CACHE = _cls
    return _FIRECRAWL_CLS_CACHE


class _FirecrawlProxy:
    """Callable proxy that looks like ``firecrawl.Firecrawl`` but imports lazily.

    Calling the proxy constructs a real SDK client. ``isinstance(obj, proxy)``
    is answered against the real SDK class, because ``isinstance`` looks up
    ``__instancecheck__`` on the type of its second argument, which for the
    module-level singleton is this class.
    """

    __slots__ = ()

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        """Instantiate the real SDK class, importing it on first use."""
        return _load_firecrawl_cls()(*args, **kwargs)

    def __instancecheck__(self, obj: Any) -> bool:
        """Return True when *obj* is an instance of the real SDK class."""
        return isinstance(obj, _load_firecrawl_cls())

    def __repr__(self) -> str:
        # Must not trigger the SDK import: repr() is called by debuggers and
        # log formatting, and the whole point of the proxy is to stay lazy.
        return "<lazy firecrawl.Firecrawl proxy>"


Firecrawl = _FirecrawlProxy()
# ---------------------------------------------------------------------------
# Client construction (direct vs managed-gateway)
# ---------------------------------------------------------------------------

# Cached SDK client and the config tuple it was built from. The client is
# rebuilt whenever the resolved config tuple changes between calls.
_firecrawl_client: Any = None
_firecrawl_client_config: Any = None


def _get_direct_firecrawl_config() -> Optional[tuple]:
    """Return explicit direct Firecrawl kwargs + cache key, or None when unset.

    Reads ``FIRECRAWL_API_KEY`` and ``FIRECRAWL_API_URL`` from the
    environment (whitespace-stripped; trailing slashes removed from the URL).

    Returns:
        ``(kwargs, cache_key)`` where ``kwargs`` holds only the values that
        are set and ``cache_key`` is ``("direct", api_url, api_key)`` with
        missing values as ``None``; or ``None`` when neither variable is set.
    """
    api_key = os.getenv("FIRECRAWL_API_KEY", "").strip()
    api_url = os.getenv("FIRECRAWL_API_URL", "").strip().rstrip("/")

    if not api_key and not api_url:
        return None

    kwargs: Dict[str, str] = {}
    if api_key:
        kwargs["api_key"] = api_key
    if api_url:
        kwargs["api_url"] = api_url

    return kwargs, ("direct", api_url or None, api_key or None)


def _get_firecrawl_gateway_url() -> str:
    """Return the configured Firecrawl gateway URL."""
    from tools.tool_backend_helpers import build_vendor_gateway_url

    return build_vendor_gateway_url("firecrawl")


def _is_tool_gateway_ready() -> bool:
    """Return True when the managed tool gateway resolves for Firecrawl."""
    from tools.managed_tool_gateway import (
        read_nous_access_token,
        resolve_managed_tool_gateway,
    )

    gateway = resolve_managed_tool_gateway(
        "firecrawl", token_reader=read_nous_access_token
    )
    return gateway is not None


def _has_direct_firecrawl_config() -> bool:
    """Return True when direct Firecrawl config is explicitly configured."""
    return _get_direct_firecrawl_config() is not None


def check_firecrawl_api_key() -> bool:
    """Return True when Firecrawl backend (direct or gateway) is usable.

    Re-exported by :mod:`tools.web_tools` for backward compatibility with
    existing tests and the ``hermes tools`` setup flow.
    """
    return _has_direct_firecrawl_config() or _is_tool_gateway_ready()


def _firecrawl_backend_help_suffix() -> str:
    """Return optional managed-gateway guidance for Firecrawl help text.

    Returns an empty string when managed Nous tools are disabled.
    """
    from tools.tool_backend_helpers import managed_nous_tools_enabled

    if not managed_nous_tools_enabled():
        return ""
    return (
        ", or use the Nous Tool Gateway via your subscription "
        "(FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN)"
    )


def _raise_web_backend_configuration_error() -> None:
    """Raise a clear error for unsupported web backend configuration.

    Raises:
        ValueError: always. The message gains a Tool Gateway hint when
            managed Nous tools are enabled.
    """
    from tools.tool_backend_helpers import managed_nous_tools_enabled

    message = (
        "Web tools are not configured. "
        "Set FIRECRAWL_API_KEY for cloud Firecrawl or set FIRECRAWL_API_URL "
        "for a self-hosted Firecrawl instance."
    )
    if managed_nous_tools_enabled():
        message += (
            " With your Nous subscription you can also use the Tool Gateway — "
            "run `hermes tools` and select Nous Subscription as the web provider."
        )
    raise ValueError(message)


def _get_firecrawl_client() -> Any:
    """Get or create the cached Firecrawl client.

    Resolution order:

    1. Direct credentials are used when present and ``web.use_gateway`` is
       not set.
    2. Otherwise the managed Tool Gateway is used when it resolves.
    3. If the gateway was preferred but does not resolve and direct
       credentials exist, the direct credentials are used as a fallback.
       This keeps the function consistent with
       :func:`check_firecrawl_api_key`, which reports the backend as usable
       in that situation.

    Returns:
        The cached SDK client, rebuilt when the resolved config changed.

    Raises:
        ValueError: when neither path is usable.
    """
    global _firecrawl_client, _firecrawl_client_config

    from tools.managed_tool_gateway import (
        read_nous_access_token,
        resolve_managed_tool_gateway,
    )
    from tools.tool_backend_helpers import prefers_gateway

    direct_config = _get_direct_firecrawl_config()

    # The gateway is only consulted when it is the sole option or explicitly
    # preferred; ``prefers_gateway`` is not evaluated without direct config.
    wants_gateway = direct_config is None or prefers_gateway("web")
    managed_gateway = None
    if wants_gateway:
        managed_gateway = resolve_managed_tool_gateway(
            "firecrawl", token_reader=read_nous_access_token
        )

    if managed_gateway is None and direct_config is None:
        logger.error(
            "Firecrawl client initialization failed: "
            "missing direct config and tool-gateway auth."
        )
        _raise_web_backend_configuration_error()

    if managed_gateway is not None:
        kwargs = {
            "api_key": managed_gateway.nous_user_token,
            "api_url": managed_gateway.gateway_origin,
        }
        client_config = (
            "tool-gateway",
            kwargs["api_url"],
            managed_gateway.nous_user_token,
        )
    else:
        if wants_gateway:
            # Gateway was preferred but is not available: fall back rather
            # than reporting "not configured" while direct credentials exist.
            logger.warning(
                "Firecrawl tool-gateway preferred but unavailable; "
                "falling back to direct Firecrawl configuration."
            )
        kwargs, client_config = direct_config

    if _firecrawl_client is not None and _firecrawl_client_config == client_config:
        return _firecrawl_client

    _firecrawl_client = Firecrawl(**kwargs)
    _firecrawl_client_config = client_config
    return _firecrawl_client


def _reset_client_for_tests() -> None:
    """Drop the cached Firecrawl client so tests can re-instantiate cleanly."""
    global _firecrawl_client, _firecrawl_client_config
    _firecrawl_client = None
    _firecrawl_client_config = None


# ---------------------------------------------------------------------------
# Response shape normalization (SDK / direct / gateway differ)
# ---------------------------------------------------------------------------


def _to_plain_object(value: Any) -> Any:
    """Convert SDK objects to plain python data structures when possible.

    Plain builtins and ``None`` pass through unchanged. Objects exposing
    ``model_dump`` are dumped; otherwise public attributes from ``__dict__``
    are copied into a new dict. Anything else is returned as-is.
    """
    if value is None:
        return None

    if isinstance(value, (dict, list, str, int, float, bool)):
        return value

    if hasattr(value, "model_dump"):
        try:
            return value.model_dump()
        except Exception:  # noqa: BLE001
            # Best-effort: fall through to the ``__dict__`` strategy.
            pass

    if hasattr(value, "__dict__"):
        try:
            return {k: v for k, v in value.__dict__.items() if not k.startswith("_")}
        except Exception:  # noqa: BLE001
            # Best-effort: hand the raw value back to the caller.
            pass

    return value


def _normalize_result_list(values: Any) -> List[Dict[str, Any]]:
    """Normalize mixed SDK/list payloads into a list of dicts.

    Non-list input yields ``[]``; items that cannot be converted to a dict
    are dropped.
    """
    if not isinstance(values, list):
        return []

    normalized: List[Dict[str, Any]] = []
    for item in values:
        plain = _to_plain_object(item)
        if isinstance(plain, dict):
            normalized.append(plain)
    return normalized


def _extract_web_search_results(response: Any) -> List[Dict[str, Any]]:
    """Extract Firecrawl search results across SDK/direct/gateway response shapes.

    Probes, in order: ``data`` as a list, ``data.web``, ``data.results``,
    top-level ``web``, top-level ``results``, and finally a ``web``
    attribute on the raw response object. Returns ``[]`` when none match.
    """
    response_plain = _to_plain_object(response)

    if isinstance(response_plain, dict):
        data = response_plain.get("data")
        if isinstance(data, list):
            return _normalize_result_list(data)

        if isinstance(data, dict):
            data_web = _normalize_result_list(data.get("web"))
            if data_web:
                return data_web
            data_results = _normalize_result_list(data.get("results"))
            if data_results:
                return data_results

        top_web = _normalize_result_list(response_plain.get("web"))
        if top_web:
            return top_web

        top_results = _normalize_result_list(response_plain.get("results"))
        if top_results:
            return top_results

    if hasattr(response, "web"):
        return _normalize_result_list(getattr(response, "web", []))

    return []


def _extract_scrape_payload(scrape_result: Any) -> Dict[str, Any]:
    """Normalize Firecrawl scrape payload shape across SDK and gateway variants.

    Returns the nested ``data`` dict when present, otherwise the plain result
    itself; ``{}`` when the result cannot be converted to a dict.
    """
    result_plain = _to_plain_object(scrape_result)
    if not isinstance(result_plain, dict):
        return {}

    nested = result_plain.get("data")
    if isinstance(nested, dict):
        return nested

    return result_plain


def _coerce_scrape_metadata(metadata: Any) -> Dict[str, Any]:
    """Return scrape metadata as a plain dict (``{}`` when not convertible)."""
    plain = _to_plain_object(metadata)
    return plain if isinstance(plain, dict) else {}


def _select_scrape_content(
    requested_format: Optional[str],
    content_markdown: Optional[str],
    content_html: Optional[str],
) -> str:
    """Pick the content string for the requested format; never returns None."""
    if requested_format == "markdown" or (requested_format is None and content_markdown):
        return content_markdown or ""
    return content_html or content_markdown or ""


# ---------------------------------------------------------------------------
# Provider class
# ---------------------------------------------------------------------------


class FirecrawlWebSearchProvider(WebSearchProvider):
    """Firecrawl search + extract provider with dual auth paths."""

    @property
    def name(self) -> str:
        """Provider identifier used in config (``"firecrawl"``)."""
        return "firecrawl"

    @property
    def display_name(self) -> str:
        """Human-readable provider name."""
        return "Firecrawl"

    def is_available(self) -> bool:
        """Return True when direct Firecrawl OR managed-gateway path is configured."""
        return check_firecrawl_api_key()

    def supports_search(self) -> bool:
        """Return True; this provider implements :meth:`search`."""
        return True

    def supports_extract(self) -> bool:
        """Return True; this provider implements :meth:`extract`."""
        return True

    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a Firecrawl search.

        Sync; matches the legacy ``_get_firecrawl_client().search(...)``
        call directly. Normalizes the response across SDK/direct/gateway
        shapes via :func:`_extract_web_search_results`.

        Args:
            query: Search query string.
            limit: Maximum number of results requested from Firecrawl.

        Returns:
            ``{"success": True, "data": {"web": [...]}}`` on success, or
            ``{"success": False, "error": str}`` on interrupt,
            misconfiguration, missing SDK, or any other failure. Never raises.
        """
        try:
            from tools.interrupt import is_interrupted

            if is_interrupted():
                return {"success": False, "error": "Interrupted"}

            logger.info("Firecrawl search: '%s' (limit=%d)", query, limit)
            response = _get_firecrawl_client().search(query=query, limit=limit)
            web_results = _extract_web_search_results(response)
            logger.info("Firecrawl: found %d search results", len(web_results))
            return {"success": True, "data": {"web": web_results}}
        except ValueError as exc:
            # Raised by client construction when no backend is configured.
            return {"success": False, "error": str(exc)}
        except ImportError as exc:
            return {"success": False, "error": f"Firecrawl SDK not installed: {exc}"}
        except Exception as exc:  # noqa: BLE001
            logger.warning("Firecrawl search error: %s", exc)
            return {"success": False, "error": f"Firecrawl search failed: {exc}"}

    async def extract(self, urls: List[str], **kwargs: Any) -> List[Dict[str, Any]]:
        """Extract content from one or more URLs via Firecrawl.

        Async; URLs are processed one after another, each scraped in a
        background thread with a 60s timeout. The website-access policy is
        checked before scraping and again against the final (post-redirect)
        URL reported in the scrape metadata.

        Accepted kwargs (others ignored for forward compat):
          - ``format``: ``"markdown"`` or ``"html"``; default is both
            (request both, return markdown when available).

        Returns:
            The legacy per-URL list-of-results shape. Per-URL failures
            (interrupt, timeout, scrape error, policy block) become items
            with an ``error`` field rather than raising.
        """
        from tools.interrupt import is_interrupted as _is_interrupted

        if _is_interrupted():
            return [{"url": u, "error": "Interrupted", "title": ""} for u in urls]

        requested_format = kwargs.get("format")
        if requested_format in ("markdown", "html"):
            formats: List[str] = [requested_format]
        else:
            formats = ["markdown", "html"]

        # check_website_access is the legacy policy gate; import inside
        # the function so the plugin doesn't pay the cost when never used.
        from tools.website_policy import check_website_access

        results: List[Dict[str, Any]] = []

        for url in urls:
            if _is_interrupted():
                results.append({"url": url, "error": "Interrupted", "title": ""})
                continue

            # Pre-scrape website policy gate
            blocked = check_website_access(url)
            if blocked:
                logger.info(
                    "Blocked web_extract for %s by rule %s",
                    blocked["host"],
                    blocked["rule"],
                )
                results.append(
                    {
                        "url": url,
                        "title": "",
                        "content": "",
                        "error": blocked["message"],
                        "blocked_by_policy": {
                            "host": blocked["host"],
                            "rule": blocked["rule"],
                            "source": blocked["source"],
                        },
                    }
                )
                continue

            try:
                logger.info("Firecrawl scraping: %s", url)
                try:
                    # The SDK is synchronous: run it off the event loop and
                    # bound the wait so one hung fetch cannot stall the batch.
                    scrape_result = await asyncio.wait_for(
                        asyncio.to_thread(
                            _get_firecrawl_client().scrape,
                            url=url,
                            formats=formats,
                        ),
                        timeout=60,
                    )
                except asyncio.TimeoutError:
                    logger.warning("Firecrawl scrape timed out for %s", url)
                    results.append(
                        {
                            "url": url,
                            "title": "",
                            "content": "",
                            "error": (
                                "Scrape timed out after 60s — page may be too large "
                                "or unresponsive. Try browser_navigate instead."
                            ),
                        }
                    )
                    continue

                scrape_payload = _extract_scrape_payload(scrape_result)
                content_markdown = scrape_payload.get("markdown")
                content_html = scrape_payload.get("html")

                # Ensure metadata is a dict (SDK may return a typed object)
                metadata = _coerce_scrape_metadata(scrape_payload.get("metadata"))

                title = metadata.get("title") or ""
                # ``sourceURL`` is the key the legacy code read. The
                # snake_case variant is presumably what a dumped typed SDK
                # object exposes — TODO confirm against the installed SDK.
                # Falling back to the requested URL also covers a null value.
                final_url = (
                    metadata.get("sourceURL") or metadata.get("source_url") or url
                )

                # Re-check website-access policy after any redirect
                final_blocked = check_website_access(final_url)
                if final_blocked:
                    logger.info(
                        "Blocked redirected web_extract for %s by rule %s",
                        final_blocked["host"],
                        final_blocked["rule"],
                    )
                    results.append(
                        {
                            "url": final_url,
                            "title": title,
                            "content": "",
                            "raw_content": "",
                            "error": final_blocked["message"],
                            "blocked_by_policy": {
                                "host": final_blocked["host"],
                                "rule": final_blocked["rule"],
                                "source": final_blocked["source"],
                            },
                        }
                    )
                    continue

                # Choose markdown vs html according to the requested format
                chosen_content = _select_scrape_content(
                    requested_format, content_markdown, content_html
                )

                results.append(
                    {
                        "url": final_url,
                        "title": title,
                        "content": chosen_content,
                        "raw_content": chosen_content,
                        "metadata": metadata,
                    }
                )
            except Exception as scrape_err:  # noqa: BLE001
                # Per-URL isolation: one failing URL must not abort the rest.
                logger.debug("Firecrawl scrape failed for %s: %s", url, scrape_err)
                results.append(
                    {
                        "url": url,
                        "title": "",
                        "content": "",
                        "raw_content": "",
                        "error": str(scrape_err),
                    }
                )

        return results

    def get_setup_schema(self) -> Dict[str, Any]:
        """Return the static setup metadata for this provider.

        The dict carries the display name, badge, tag line, and the
        environment variables to prompt for.
        """
        return {
            "name": "Firecrawl",
            "badge": "paid · optional gateway",
            "tag": (
                "Mainstream search + extract; supports direct API and Nous "
                "tool-gateway routing."
            ),
            "env_vars": [
                {
                    "key": "FIRECRAWL_API_KEY",
                    "prompt": "Firecrawl API key (or leave blank for self-hosted)",
                    "url": "https://docs.firecrawl.dev/introduction",
                },
            ],
        }