From 5e54330e27d670dbf922c6d16498ca0c7d6ad08e Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Thu, 14 May 2026 00:34:28 +0530 Subject: [PATCH] fix(web): preserve firecrawl crawl + website-policy gate after migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two regressions discovered by running the full tests/tools/ suite after the dispatcher cutover, both fixed in this commit: 1. web_crawl_tool incorrectly errored "search-only" for firecrawl --------------------------------------------------------------------- The cutover treated any provider with supports_crawl()==False as a search-only backend and returned the typed search-only error. But firecrawl can crawl via the legacy multi-page-extract path inside web_crawl_tool — it just doesn't expose supports_crawl on the plugin (adding native firecrawl crawl is a clean follow-up). Fix: only emit the search-only error when the provider supports NEITHER crawl NOR extract (brave-free / ddgs / searxng). When the provider supports extract but not crawl (firecrawl), fall through to the legacy firecrawl-via-extract path below. 2. firecrawl plugin's check_website_access wasn't patchable --------------------------------------------------------------------- The plugin imported `from tools.website_policy import check_website_access` INSIDE the extract() function body, so monkeypatching the name on plugins.web.firecrawl.provider had no effect — the inner import re-bound the name on every call. Fix: hoist the import to module level. Cheap (website_policy itself has no heavy deps) and makes the standard monkeypatch.setattr(firecrawl_provider, "check_website_access", ...) pattern work. Test updates (tests/tools/test_website_policy.py — 4 tests): - test_web_extract_short_circuits_blocked_url - test_web_extract_blocks_redirected_final_url Both: patch the gate at plugins.web.firecrawl.provider (where it runs after migration) and force the firecrawl plugin to be the active extract provider via FIRECRAWL_API_KEY. - test_web_crawl_short_circuits_blocked_url - test_web_crawl_blocks_redirected_final_url Both: unchanged — the dispatcher-level gate at tools.web_tools.py line 1651 still uses the imported `check_website_access` name and the firecrawl-fallthrough path is exercised as before. Verified: 22/22 tests/tools/test_website_policy.py pass. --- plugins/web/firecrawl/provider.py | 7 +++--- tests/tools/test_website_policy.py | 19 ++++++++++++---- tools/web_tools.py | 36 +++++++++++++++++------------- 3 files changed, 39 insertions(+), 23 deletions(-) diff --git a/plugins/web/firecrawl/provider.py b/plugins/web/firecrawl/provider.py index 64268448348..5c092f4e4e3 100644 --- a/plugins/web/firecrawl/provider.py +++ b/plugins/web/firecrawl/provider.py @@ -51,6 +51,7 @@ import os from typing import Any, Dict, List, Optional, TYPE_CHECKING from agent.web_search_provider import WebSearchProvider +from tools.website_policy import check_website_access logger = logging.getLogger(__name__) @@ -417,9 +418,9 @@ class FirecrawlWebSearchProvider(WebSearchProvider): else: formats = ["markdown", "html"] - # check_website_access is the legacy policy gate; import inside - # the function so the plugin doesn't pay the cost when never used. - from tools.website_policy import check_website_access + # check_website_access is the legacy policy gate; imported at + # module level (lazy-friendly because the website_policy import is + # cheap) so monkeypatching it in tests works as expected. results: List[Dict[str, Any]] = [] diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py index 4573e027650..efc0e500de5 100644 --- a/tests/tools/test_website_policy.py +++ b/tests/tools/test_website_policy.py @@ -350,11 +350,16 @@ def test_browser_navigate_allows_when_shared_file_missing(monkeypatch, tmp_path) @pytest.mark.asyncio async def test_web_extract_short_circuits_blocked_url(monkeypatch): from tools import web_tools + from plugins.web.firecrawl import provider as firecrawl_provider # Allow test URLs past SSRF check so website policy is what gets tested monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) + # The per-URL website-policy gate moved into the firecrawl plugin's + # extract() during the web-provider migration. Patch it at the new + # location; the dispatcher-level gate (used by web_crawl_tool's + # pre-flight) still lives on tools.web_tools. monkeypatch.setattr( - web_tools, + firecrawl_provider, "check_website_access", lambda url: { "host": "blocked.test", @@ -364,11 +369,13 @@ async def test_web_extract_short_circuits_blocked_url(monkeypatch): }, ) monkeypatch.setattr( - web_tools, + firecrawl_provider, "_get_firecrawl_client", lambda: pytest.fail("firecrawl should not run for blocked URL"), ) monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) + # Force the firecrawl plugin to be the active extract provider. + monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"], use_llm_processing=False)) @@ -398,6 +405,7 @@ def test_check_website_access_fails_open_on_malformed_config(tmp_path, monkeypat @pytest.mark.asyncio async def test_web_extract_blocks_redirected_final_url(monkeypatch): from tools import web_tools + from plugins.web.firecrawl import provider as firecrawl_provider # Allow test URLs past SSRF check so website policy is what gets tested monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) @@ -424,9 +432,12 @@ async def test_web_extract_blocks_redirected_final_url(monkeypatch): }, } - monkeypatch.setattr(web_tools, "check_website_access", fake_check) - monkeypatch.setattr(web_tools, "_get_firecrawl_client", lambda: FakeFirecrawlClient()) + # After the web-provider migration, the per-URL gate + firecrawl client + # live in the plugin. Patch both at the plugin location. + monkeypatch.setattr(firecrawl_provider, "check_website_access", fake_check) + monkeypatch.setattr(firecrawl_provider, "_get_firecrawl_client", lambda: FakeFirecrawlClient()) monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) + monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"], use_llm_processing=False)) diff --git a/tools/web_tools.py b/tools/web_tools.py index d0d9919d25b..2fc9ebdf99f 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -1618,22 +1618,26 @@ async def web_crawl_tool( crawl_provider = _wsp_get_provider(backend) if backend else None if crawl_provider is not None and not crawl_provider.supports_crawl(): - # Configured name IS registered but doesn't support crawl - # (search-only providers like brave-free / ddgs / searxng). - # Surface a typed error rather than silently switching to a - # different crawl backend. - return json.dumps( - { - "success": False, - "error": ( - f"{crawl_provider.display_name} is a search-only " - "backend and cannot crawl URLs. " - "Set FIRECRAWL_API_KEY for crawling, or use " - "web_search instead." - ), - }, - ensure_ascii=False, - ) + # When the configured provider is search-only AND cannot + # extract URLs either (brave-free / ddgs / searxng), surface a + # typed "search-only" error rather than silently switching to + # a different crawl backend. When the provider supports extract + # but not crawl (e.g. firecrawl), fall through to the legacy + # firecrawl-via-extract path below. + if not crawl_provider.supports_extract(): + return json.dumps( + { + "success": False, + "error": ( + f"{crawl_provider.display_name} is a search-only " + "backend and cannot crawl URLs. " + "Set FIRECRAWL_API_KEY for crawling, or use " + "web_search instead." + ), + }, + ensure_ascii=False, + ) + crawl_provider = None # let legacy firecrawl path handle it if crawl_provider is None: crawl_provider = get_active_crawl_provider()