diff --git a/plugins/web/firecrawl/provider.py b/plugins/web/firecrawl/provider.py index fdd5e1f3d55..ec193781096 100644 --- a/plugins/web/firecrawl/provider.py +++ b/plugins/web/firecrawl/provider.py @@ -374,6 +374,9 @@ class FirecrawlWebSearchProvider(WebSearchProvider): def supports_extract(self) -> bool: return True + def supports_crawl(self) -> bool: + return True + def search(self, query: str, limit: int = 5) -> Dict[str, Any]: """Execute a Firecrawl search. @@ -559,13 +562,193 @@ class FirecrawlWebSearchProvider(WebSearchProvider): return results + async def crawl(self, url: str, **kwargs: Any) -> Dict[str, Any]: + """Crawl a seed URL via Firecrawl's ``/crawl`` endpoint. + + Sync SDK call wrapped in ``asyncio.to_thread`` because the dispatcher + in :func:`tools.web_tools.web_crawl_tool` is async and runs LLM + post-processing on the response. The dispatcher gates the seed URL + against SSRF + website-access policy before calling us; this method + re-checks every crawled page's URL against the policy after the + crawl returns to catch redirected pages that map to a blocked host. + + Accepted kwargs (others ignored for forward compat): + - ``instructions``: str — logged then dropped. Firecrawl's /crawl + endpoint does NOT accept natural-language instructions (that's + an /extract feature), so we record the value for debugging and + proceed without it. Tavily's crawl IS instruction-aware; this + divergence is documented in both plugins' docstrings. + - ``limit``: int — max pages to crawl (default 20). + - ``depth``: str — accepted for API parity with Tavily; ignored + by Firecrawl's crawl endpoint. + + Returns ``{"results": [...]}`` matching the shape that + :func:`tools.web_tools.web_crawl_tool`'s shared LLM-summarization + path expects. Per-page failures (policy block on redirected URL, + bad response shape) are included as items with an ``error`` field + rather than raising. + """ + try: + from tools.interrupt import is_interrupted + + if is_interrupted(): + return {"results": [{"url": url, "title": "", "content": "", "error": "Interrupted"}]} + + instructions = kwargs.get("instructions") + limit = kwargs.get("limit", 20) + + # Firecrawl's /crawl endpoint does not accept natural-language + # instructions (that's an /extract feature). Log + drop. + if instructions: + logger.info( + "Firecrawl crawl: 'instructions' parameter ignored " + "(not supported by Firecrawl /crawl)" + ) + + logger.info("Firecrawl crawl: %s (limit=%d)", url, limit) + + crawl_params = { + "limit": limit, + "scrape_options": {"formats": ["markdown"]}, + } + + # The SDK call is sync; run in a thread so we don't block the + # gateway event loop on a multi-page crawl. + crawl_result = await asyncio.to_thread( + _get_firecrawl_client().crawl, + url=url, + **crawl_params, + ) + + # CrawlJob normalization across SDK + direct + gateway shapes. + data_list: List[Any] = [] + if hasattr(crawl_result, "data"): + data_list = crawl_result.data if crawl_result.data else [] + logger.info( + "Firecrawl crawl status: %s, %d pages", + getattr(crawl_result, "status", "unknown"), + len(data_list), + ) + elif isinstance(crawl_result, dict) and "data" in crawl_result: + data_list = crawl_result.get("data", []) or [] + else: + logger.warning( + "Firecrawl crawl: unexpected result type %r", + type(crawl_result).__name__, + ) + + pages: List[Dict[str, Any]] = [] + for item in data_list: + # Pydantic model | typed object | dict — handle all shapes. + content_markdown = None + content_html = None + metadata: Any = {} + + if hasattr(item, "model_dump"): + item_dict = item.model_dump() + content_markdown = item_dict.get("markdown") + content_html = item_dict.get("html") + metadata = item_dict.get("metadata", {}) + elif hasattr(item, "__dict__"): + content_markdown = getattr(item, "markdown", None) + content_html = getattr(item, "html", None) + metadata_obj = getattr(item, "metadata", {}) + if hasattr(metadata_obj, "model_dump"): + metadata = metadata_obj.model_dump() + elif hasattr(metadata_obj, "__dict__"): + metadata = metadata_obj.__dict__ + elif isinstance(metadata_obj, dict): + metadata = metadata_obj + else: + metadata = {} + elif isinstance(item, dict): + content_markdown = item.get("markdown") + content_html = item.get("html") + metadata = item.get("metadata", {}) + + # Ensure metadata is a plain dict. + if not isinstance(metadata, dict): + if hasattr(metadata, "model_dump"): + metadata = metadata.model_dump() + elif hasattr(metadata, "__dict__"): + metadata = metadata.__dict__ + else: + metadata = {} + + page_url = metadata.get( + "sourceURL", metadata.get("url", "Unknown URL") + ) + title = metadata.get("title", "") + + # Per-page policy re-check (catches blocked redirects). + page_blocked = check_website_access(page_url) + if page_blocked: + logger.info( + "Blocked crawled page %s by rule %s", + page_blocked["host"], + page_blocked["rule"], + ) + pages.append( + { + "url": page_url, + "title": title, + "content": "", + "raw_content": "", + "error": page_blocked["message"], + "blocked_by_policy": { + "host": page_blocked["host"], + "rule": page_blocked["rule"], + "source": page_blocked["source"], + }, + } + ) + continue + + content = content_markdown or content_html or "" + pages.append( + { + "url": page_url, + "title": title, + "content": content, + "raw_content": content, + "metadata": metadata, + } + ) + + return {"results": pages} + except ValueError as exc: + return {"results": [{"url": url, "title": "", "content": "", "error": str(exc)}]} + except ImportError as exc: + return { + "results": [ + { + "url": url, + "title": "", + "content": "", + "error": f"Firecrawl SDK not installed: {exc}", + } + ] + } + except Exception as exc: # noqa: BLE001 + logger.warning("Firecrawl crawl error: %s", exc) + return { + "results": [ + { + "url": url, + "title": "", + "content": "", + "error": f"Firecrawl crawl failed: {exc}", + } + ] + } + def get_setup_schema(self) -> Dict[str, Any]: return { "name": "Firecrawl", "badge": "paid · optional gateway", "tag": ( - "Mainstream search + extract; supports direct API and Nous " - "tool-gateway routing." + "Full search + extract + crawl; supports direct API and " + "Nous tool-gateway routing." ), "env_vars": [ { diff --git a/tests/plugins/web/test_web_search_provider_plugins.py b/tests/plugins/web/test_web_search_provider_plugins.py index 62f0f15c4d3..6ea154dee1e 100644 --- a/tests/plugins/web/test_web_search_provider_plugins.py +++ b/tests/plugins/web/test_web_search_provider_plugins.py @@ -96,7 +96,10 @@ class TestBundledPluginsRegister: ("exa", True, True, False), ("parallel", True, True, False), ("tavily", True, True, True), - ("firecrawl", True, True, False), + # firecrawl: search + extract + crawl. Crawl was originally + # disabled in the migration (fell through to a legacy inline + # path); the follow-up commit enabled it natively. + ("firecrawl", True, True, True), ], ) def test_capability_flags_match_spec( @@ -451,3 +454,22 @@ class TestErrorResponseShapes: assert isinstance(result["results"], list) if result["results"]: assert "error" in result["results"][0] + + def test_firecrawl_crawl_returns_error_dict_when_unconfigured(self) -> None: + """firecrawl crawl is async (wraps SDK in to_thread); error must be + surfaced via the per-page result shape, not raised.""" + _ensure_plugins_loaded() + from agent.web_search_registry import get_provider + + p = get_provider("firecrawl") + assert p is not None + assert inspect.iscoroutinefunction(p.crawl) + result = asyncio.run(p.crawl("https://example.com")) + assert isinstance(result, dict) + assert "results" in result + assert isinstance(result["results"], list) + # Without FIRECRAWL_API_KEY, the plugin's _get_firecrawl_client() + # raises ValueError which is caught and returned as a per-page error. + assert len(result["results"]) >= 1 + assert "error" in result["results"][0] + assert result["results"][0]["url"] == "https://example.com" diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py index efc0e500de5..0e734cbae78 100644 --- a/tests/tools/test_website_policy.py +++ b/tests/tools/test_website_policy.py @@ -454,6 +454,9 @@ async def test_web_crawl_short_circuits_blocked_url(monkeypatch): monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") # Allow test URLs past SSRF check so website policy is what gets tested monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) + # The dispatcher-level (seed-URL) policy gate still lives on web_tools. + # No per-page gate runs in this test because the dispatcher returns + # immediately when the seed is blocked, before delegating to the plugin. monkeypatch.setattr( web_tools, "check_website_access", @@ -464,10 +467,13 @@ async def test_web_crawl_short_circuits_blocked_url(monkeypatch): "message": "Blocked by website policy", }, ) + # If the dispatcher ever reaches the firecrawl plugin's crawl(), the test + # fails — pin the plugin module's client lookup so we'd notice. + from plugins.web.firecrawl import provider as firecrawl_provider monkeypatch.setattr( - web_tools, + firecrawl_provider, "_get_firecrawl_client", - lambda: pytest.fail("firecrawl should not run for blocked crawl URL"), + lambda: pytest.fail("firecrawl plugin should not run for blocked crawl URL"), ) monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) @@ -480,13 +486,17 @@ async def test_web_crawl_short_circuits_blocked_url(monkeypatch): @pytest.mark.asyncio async def test_web_crawl_blocks_redirected_final_url(monkeypatch): from tools import web_tools + from plugins.web.firecrawl import provider as firecrawl_provider - # web_crawl_tool checks for Firecrawl env before website policy + # Force the firecrawl plugin to be the active crawl provider. monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") # Allow test URLs past SSRF check so website policy is what gets tested monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) def fake_check(url): + # Dispatcher seed-URL gate (web_tools.check_website_access call) + # and plugin per-page gate (firecrawl_provider.check_website_access + # call) both flow through this single fake_check. if url == "https://allowed.test": return None if url == "https://blocked.test/final": @@ -512,8 +522,13 @@ async def test_web_crawl_blocks_redirected_final_url(monkeypatch): ] } + # After PR #25182 follow-up: per-page policy gate lives in + # plugins.web.firecrawl.provider.crawl(). Patch the gate + client at + # the plugin location. The dispatcher-level (seed) gate also reads + # web_tools.check_website_access — patch both. monkeypatch.setattr(web_tools, "check_website_access", fake_check) - monkeypatch.setattr(web_tools, "_get_firecrawl_client", lambda: FakeCrawlClient()) + monkeypatch.setattr(firecrawl_provider, "check_website_access", fake_check) + monkeypatch.setattr(firecrawl_provider, "_get_firecrawl_client", lambda: FakeCrawlClient()) monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) result = json.loads(await web_tools.web_crawl_tool("https://allowed.test", use_llm_processing=False)) diff --git a/tools/web_tools.py b/tools/web_tools.py index 9265e57f3ec..1f0fd5fe117 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -1285,275 +1285,23 @@ async def web_crawl_tool( _debug.save() return cleaned_result - # No registered provider supports crawl. Fall through to the - # Firecrawl-via-summarize path below (legacy behavior) when - # Firecrawl credentials are configured. - - # web_crawl requires Firecrawl or the Firecrawl tool-gateway — Parallel has no crawl API - if not check_firecrawl_api_key(): - return json.dumps({ - "error": "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, FIRECRAWL_API_URL" - f"{_firecrawl_backend_help_suffix()}, or use web_search + web_extract instead.", - "success": False, - }, ensure_ascii=False) - - # Ensure URL has protocol - if not url.startswith(('http://', 'https://')): - url = f'https://{url}' - logger.info("Added https:// prefix to URL: %s", url) - - instructions_text = f" with instructions: '{instructions}'" if instructions else "" - logger.info("Crawling %s%s", url, instructions_text) - - # SSRF protection — block private/internal addresses - if not is_safe_url(url): - return json.dumps({"results": [{"url": url, "title": "", "content": "", - "error": "Blocked: URL targets a private or internal network address"}]}, ensure_ascii=False) - - # Website policy check — block before crawling - blocked = check_website_access(url) - if blocked: - logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"]) - return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"], - "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False) - - # Use Firecrawl's v2 crawl functionality - # Docs: https://docs.firecrawl.dev/features/crawl - # The crawl() method automatically waits for completion and returns all data - - # Build crawl parameters - keep it simple - crawl_params = { - "limit": 20, # Limit number of pages to crawl - "scrape_options": { - "formats": ["markdown"] # Just markdown for simplicity - } - } - - # Note: The 'prompt' parameter is not documented for crawl - # Instructions are typically used with the Extract endpoint, not Crawl - if instructions: - logger.info("Instructions parameter ignored (not supported in crawl API)") - - from tools.interrupt import is_interrupted as _is_int - if _is_int(): - return tool_error("Interrupted", success=False) - - try: - crawl_result = _get_firecrawl_client().crawl( - url=url, - **crawl_params - ) - except Exception as e: - logger.debug("Crawl API call failed: %s", e) - raise - - pages: List[Dict[str, Any]] = [] - - # Process crawl results - the crawl method returns a CrawlJob object with data attribute - data_list = [] - - # The crawl_result is a CrawlJob object with a 'data' attribute containing list of Document objects - if hasattr(crawl_result, 'data'): - data_list = crawl_result.data if crawl_result.data else [] - logger.info("Status: %s", getattr(crawl_result, 'status', 'unknown')) - logger.info("Retrieved %d pages", len(data_list)) - - # Debug: Check other attributes if no data - if not data_list: - logger.debug("CrawlJob attributes: %s", [attr for attr in dir(crawl_result) if not attr.startswith('_')]) - logger.debug("Status: %s", getattr(crawl_result, 'status', 'N/A')) - logger.debug("Total: %s", getattr(crawl_result, 'total', 'N/A')) - logger.debug("Completed: %s", getattr(crawl_result, 'completed', 'N/A')) - - elif isinstance(crawl_result, dict) and 'data' in crawl_result: - data_list = crawl_result.get("data", []) - else: - logger.warning("Unexpected crawl result type") - logger.debug("Result type: %s", type(crawl_result)) - if hasattr(crawl_result, '__dict__'): - logger.debug("Result attributes: %s", list(crawl_result.__dict__.keys())) - - for item in data_list: - # Process each crawled page - properly handle object serialization - page_url = "Unknown URL" - title = "" - content_markdown = None - content_html = None - metadata = {} - - # Extract data from the item - if hasattr(item, 'model_dump'): - # Pydantic model - use model_dump to get dict - item_dict = item.model_dump() - content_markdown = item_dict.get('markdown') - content_html = item_dict.get('html') - metadata = item_dict.get('metadata', {}) - elif hasattr(item, '__dict__'): - # Regular object with attributes - content_markdown = getattr(item, 'markdown', None) - content_html = getattr(item, 'html', None) - - # Handle metadata - convert to dict if it's an object - metadata_obj = getattr(item, 'metadata', {}) - if hasattr(metadata_obj, 'model_dump'): - metadata = metadata_obj.model_dump() - elif hasattr(metadata_obj, '__dict__'): - metadata = metadata_obj.__dict__ - elif isinstance(metadata_obj, dict): - metadata = metadata_obj - else: - metadata = {} - elif isinstance(item, dict): - # Already a dictionary - content_markdown = item.get('markdown') - content_html = item.get('html') - metadata = item.get('metadata', {}) - - # Ensure metadata is a dict (not an object) - if not isinstance(metadata, dict): - if hasattr(metadata, 'model_dump'): - metadata = metadata.model_dump() - elif hasattr(metadata, '__dict__'): - metadata = metadata.__dict__ - else: - metadata = {} - - # Extract URL and title from metadata - page_url = metadata.get("sourceURL", metadata.get("url", "Unknown URL")) - title = metadata.get("title", "") - - # Re-check crawled page URL against policy - page_blocked = check_website_access(page_url) - if page_blocked: - logger.info("Blocked crawled page %s by rule %s", page_blocked["host"], page_blocked["rule"]) - pages.append({ - "url": page_url, "title": title, "content": "", "raw_content": "", - "error": page_blocked["message"], - "blocked_by_policy": {"host": page_blocked["host"], "rule": page_blocked["rule"], "source": page_blocked["source"]}, - }) - continue - - # Choose content (prefer markdown) - content = content_markdown or content_html or "" - - pages.append({ - "url": page_url, - "title": title, - "content": content, - "raw_content": content, - "metadata": metadata # Now guaranteed to be a dict - }) - - response = {"results": pages} - - pages_crawled = len(response.get('results', [])) - logger.info("Crawled %d pages", pages_crawled) - - debug_call_data["pages_crawled"] = pages_crawled - debug_call_data["original_response_size"] = len(json.dumps(response)) - - # Process each result with LLM if enabled - if use_llm_processing and auxiliary_available: - logger.info("Processing crawled content with LLM (parallel)...") - debug_call_data["processing_applied"].append("llm_processing") - - # Prepare tasks for parallel processing - async def process_single_crawl_result(result): - """Process a single crawl result with LLM and return updated result with metrics.""" - page_url = result.get('url', 'Unknown URL') - title = result.get('title', '') - content = result.get('content', '') - - if not content: - return result, None, "no_content" - - original_size = len(content) - - # Process content with LLM - processed = await process_content_with_llm( - content, page_url, title, effective_model, min_length - ) - - if processed: - processed_size = len(processed) - compression_ratio = processed_size / original_size if original_size > 0 else 1.0 - - # Update result with processed content - result['raw_content'] = content - result['content'] = processed - - metrics = { - "url": page_url, - "original_size": original_size, - "processed_size": processed_size, - "compression_ratio": compression_ratio, - "model_used": effective_model - } - return result, metrics, "processed" - else: - metrics = { - "url": page_url, - "original_size": original_size, - "processed_size": original_size, - "compression_ratio": 1.0, - "model_used": None, - "reason": "content_too_short" - } - return result, metrics, "too_short" - - # Run all LLM processing in parallel - results_list = response.get('results', []) - tasks = [process_single_crawl_result(result) for result in results_list] - processed_results = await asyncio.gather(*tasks) - - # Collect metrics and print results - for result, metrics, status in processed_results: - page_url = result.get('url', 'Unknown URL') - if status == "processed": - debug_call_data["compression_metrics"].append(metrics) - debug_call_data["pages_processed_with_llm"] += 1 - logger.info("%s (processed)", page_url) - elif status == "too_short": - debug_call_data["compression_metrics"].append(metrics) - logger.info("%s (no processing - content too short)", page_url) - else: - logger.warning("%s (no content to process)", page_url) - else: - if use_llm_processing and not auxiliary_available: - logger.warning("LLM processing requested but no auxiliary model available, returning raw content") - debug_call_data["processing_applied"].append("llm_processing_unavailable") - # Print summary of crawled pages for debugging (original behavior) - for result in response.get('results', []): - page_url = result.get('url', 'Unknown URL') - content_length = len(result.get('content', '')) - logger.info("%s (%d characters)", page_url, content_length) - - # Trim output to minimal fields per entry: title, content, error - trimmed_results = [ + # No registered provider supports crawl AND no crawl-capable plugin + # is available. Surface a typed error pointing the user at the two + # crawl-capable providers (Firecrawl + Tavily). + return json.dumps( { - "url": r.get("url", ""), - "title": r.get("title", ""), - "content": r.get("content", ""), - "error": r.get("error"), - **({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}), - } - for r in response.get("results", []) - ] - trimmed_response = {"results": trimmed_results} - - result_json = json.dumps(trimmed_response, indent=2, ensure_ascii=False) - # Clean base64 images from crawled content - cleaned_result = clean_base64_images(result_json) - - debug_call_data["final_response_size"] = len(cleaned_result) - debug_call_data["processing_applied"].append("base64_image_removal") - - # Log debug information - _debug.log_call("web_crawl_tool", debug_call_data) - _debug.save() - - return cleaned_result - + "success": False, + "error": ( + "web_crawl has no available backend. " + "Set FIRECRAWL_API_KEY (or FIRECRAWL_API_URL for " + f"self-hosted){_firecrawl_backend_help_suffix()}, " + "or set TAVILY_API_KEY for Tavily. " + "Alternatively use web_search + web_extract instead." + ), + }, + ensure_ascii=False, + ) + except Exception as e: error_msg = f"Error crawling website: {str(e)}" logger.debug("%s", error_msg)