From 5e54330e27d670dbf922c6d16498ca0c7d6ad08e Mon Sep 17 00:00:00 2001
From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
Date: Thu, 14 May 2026 00:34:28 +0530
Subject: [PATCH] fix(web): preserve firecrawl crawl + website-policy gate
 after migration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two regressions discovered by running the full tests/tools/ suite after
the dispatcher cutover, both fixed in this commit:

1. web_crawl_tool incorrectly errored "search-only" for firecrawl
---------------------------------------------------------------------
The cutover treated any provider with supports_crawl()==False as a
search-only backend and returned the typed search-only error. But
firecrawl can crawl via the legacy multi-page-extract path inside
web_crawl_tool — it just doesn't expose supports_crawl on the plugin
(adding native firecrawl crawl is a clean follow-up).

Fix: only emit the search-only error when the provider supports
NEITHER crawl NOR extract (brave-free / ddgs / searxng). When the
provider supports extract but not crawl (firecrawl), fall through to
the legacy firecrawl-via-extract path below.

2. firecrawl plugin's check_website_access wasn't patchable
---------------------------------------------------------------------
The plugin imported `from tools.website_policy import check_website_access`
INSIDE the extract() function body, so monkeypatching the name on
plugins.web.firecrawl.provider had no effect — the inner import re-bound
the name on every call.

Fix: hoist the import to module level. Cheap (website_policy itself
has no heavy deps) and makes the standard
monkeypatch.setattr(firecrawl_provider, "check_website_access", ...)
pattern work.

Test updates (tests/tools/test_website_policy.py — 4 tests):
  - test_web_extract_short_circuits_blocked_url
  - test_web_extract_blocks_redirected_final_url
    Both: patch the gate at plugins.web.firecrawl.provider (where it
    runs after migration) and force the firecrawl plugin to be the
    active extract provider via FIRECRAWL_API_KEY.
  - test_web_crawl_short_circuits_blocked_url
  - test_web_crawl_blocks_redirected_final_url
    Both: unchanged — the dispatcher-level gate at tools.web_tools.py
    line 1651 still uses the imported `check_website_access` name and
    the firecrawl-fallthrough path is exercised as before.

Verified: 22/22 tests/tools/test_website_policy.py pass.
---
 plugins/web/firecrawl/provider.py  |  7 +++---
 tests/tools/test_website_policy.py | 19 ++++++++++++----
 tools/web_tools.py                 | 36 +++++++++++++++++-------------
 3 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/plugins/web/firecrawl/provider.py b/plugins/web/firecrawl/provider.py
index 64268448348..5c092f4e4e3 100644
--- a/plugins/web/firecrawl/provider.py
+++ b/plugins/web/firecrawl/provider.py
@@ -51,6 +51,7 @@ import os
 from typing import Any, Dict, List, Optional, TYPE_CHECKING
 
 from agent.web_search_provider import WebSearchProvider
+from tools.website_policy import check_website_access
 
 logger = logging.getLogger(__name__)
 
@@ -417,9 +418,9 @@ class FirecrawlWebSearchProvider(WebSearchProvider):
         else:
             formats = ["markdown", "html"]
 
-        # check_website_access is the legacy policy gate; import inside
-        # the function so the plugin doesn't pay the cost when never used.
-        from tools.website_policy import check_website_access
+        # check_website_access is the legacy policy gate; imported at
+        # module level (lazy-friendly because the website_policy import is
+        # cheap) so monkeypatching it in tests works as expected.
 
         results: List[Dict[str, Any]] = []
 
diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py
index 4573e027650..efc0e500de5 100644
--- a/tests/tools/test_website_policy.py
+++ b/tests/tools/test_website_policy.py
@@ -350,11 +350,16 @@ def test_browser_navigate_allows_when_shared_file_missing(monkeypatch, tmp_path)
 @pytest.mark.asyncio
 async def test_web_extract_short_circuits_blocked_url(monkeypatch):
     from tools import web_tools
+    from plugins.web.firecrawl import provider as firecrawl_provider
 
     # Allow test URLs past SSRF check so website policy is what gets tested
     monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True)
+    # The per-URL website-policy gate moved into the firecrawl plugin's
+    # extract() during the web-provider migration. Patch it at the new
+    # location; the dispatcher-level gate (used by web_crawl_tool's
+    # pre-flight) still lives on tools.web_tools.
     monkeypatch.setattr(
-        web_tools,
+        firecrawl_provider,
         "check_website_access",
         lambda url: {
             "host": "blocked.test",
@@ -364,11 +369,13 @@ async def test_web_extract_short_circuits_blocked_url(monkeypatch):
         },
     )
     monkeypatch.setattr(
-        web_tools,
+        firecrawl_provider,
         "_get_firecrawl_client",
         lambda: pytest.fail("firecrawl should not run for blocked URL"),
     )
     monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False)
+    # Force the firecrawl plugin to be the active extract provider.
+    monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
 
     result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"], use_llm_processing=False))
 
@@ -398,6 +405,7 @@ def test_check_website_access_fails_open_on_malformed_config(tmp_path, monkeypat
 @pytest.mark.asyncio
 async def test_web_extract_blocks_redirected_final_url(monkeypatch):
     from tools import web_tools
+    from plugins.web.firecrawl import provider as firecrawl_provider
 
     # Allow test URLs past SSRF check so website policy is what gets tested
     monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True)
@@ -424,9 +432,12 @@ async def test_web_extract_blocks_redirected_final_url(monkeypatch):
                 },
             }
 
-    monkeypatch.setattr(web_tools, "check_website_access", fake_check)
-    monkeypatch.setattr(web_tools, "_get_firecrawl_client", lambda: FakeFirecrawlClient())
+    # After the web-provider migration, the per-URL gate + firecrawl client
+    # live in the plugin. Patch both at the plugin location.
+    monkeypatch.setattr(firecrawl_provider, "check_website_access", fake_check)
+    monkeypatch.setattr(firecrawl_provider, "_get_firecrawl_client", lambda: FakeFirecrawlClient())
     monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False)
+    monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
 
     result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"], use_llm_processing=False))
 
diff --git a/tools/web_tools.py b/tools/web_tools.py
index d0d9919d25b..2fc9ebdf99f 100644
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -1618,22 +1618,26 @@ async def web_crawl_tool(
 
         crawl_provider = _wsp_get_provider(backend) if backend else None
         if crawl_provider is not None and not crawl_provider.supports_crawl():
-            # Configured name IS registered but doesn't support crawl
-            # (search-only providers like brave-free / ddgs / searxng).
-            # Surface a typed error rather than silently switching to a
-            # different crawl backend.
-            return json.dumps(
-                {
-                    "success": False,
-                    "error": (
-                        f"{crawl_provider.display_name} is a search-only "
-                        "backend and cannot crawl URLs. "
-                        "Set FIRECRAWL_API_KEY for crawling, or use "
-                        "web_search instead."
-                    ),
-                },
-                ensure_ascii=False,
-            )
+            # When the configured provider is search-only AND cannot
+            # extract URLs either (brave-free / ddgs / searxng), surface a
+            # typed "search-only" error rather than silently switching to
+            # a different crawl backend. When the provider supports extract
+            # but not crawl (e.g. firecrawl), fall through to the legacy
+            # firecrawl-via-extract path below.
+            if not crawl_provider.supports_extract():
+                return json.dumps(
+                    {
+                        "success": False,
+                        "error": (
+                            f"{crawl_provider.display_name} is a search-only "
+                            "backend and cannot crawl URLs. "
+                            "Set FIRECRAWL_API_KEY for crawling, or use "
+                            "web_search instead."
+                        ),
+                    },
+                    ensure_ascii=False,
+                )
+            crawl_provider = None  # let legacy firecrawl path handle it
         if crawl_provider is None:
             crawl_provider = get_active_crawl_provider()