diff --git a/plugins/web/firecrawl/provider.py b/plugins/web/firecrawl/provider.py index 64268448348..5c092f4e4e3 100644 --- a/plugins/web/firecrawl/provider.py +++ b/plugins/web/firecrawl/provider.py @@ -51,6 +51,7 @@ import os from typing import Any, Dict, List, Optional, TYPE_CHECKING from agent.web_search_provider import WebSearchProvider +from tools.website_policy import check_website_access logger = logging.getLogger(__name__) @@ -417,9 +418,9 @@ class FirecrawlWebSearchProvider(WebSearchProvider): else: formats = ["markdown", "html"] - # check_website_access is the legacy policy gate; import inside - # the function so the plugin doesn't pay the cost when never used. - from tools.website_policy import check_website_access + # check_website_access is the legacy policy gate; imported at + # module level (lazy-friendly because the website_policy import is + # cheap) so monkeypatching it in tests works as expected. results: List[Dict[str, Any]] = [] diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py index 4573e027650..efc0e500de5 100644 --- a/tests/tools/test_website_policy.py +++ b/tests/tools/test_website_policy.py @@ -350,11 +350,16 @@ def test_browser_navigate_allows_when_shared_file_missing(monkeypatch, tmp_path) @pytest.mark.asyncio async def test_web_extract_short_circuits_blocked_url(monkeypatch): from tools import web_tools + from plugins.web.firecrawl import provider as firecrawl_provider # Allow test URLs past SSRF check so website policy is what gets tested monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) + # The per-URL website-policy gate moved into the firecrawl plugin's + # extract() during the web-provider migration. Patch it at the new + # location; the dispatcher-level gate (used by web_crawl_tool's + # pre-flight) still lives on tools.web_tools. monkeypatch.setattr( - web_tools, + firecrawl_provider, "check_website_access", lambda url: { "host": "blocked.test", @@ -364,11 +369,13 @@ async def test_web_extract_short_circuits_blocked_url(monkeypatch): }, ) monkeypatch.setattr( - web_tools, + firecrawl_provider, "_get_firecrawl_client", lambda: pytest.fail("firecrawl should not run for blocked URL"), ) monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) + # Force the firecrawl plugin to be the active extract provider. + monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"], use_llm_processing=False)) @@ -398,6 +405,7 @@ def test_check_website_access_fails_open_on_malformed_config(tmp_path, monkeypat @pytest.mark.asyncio async def test_web_extract_blocks_redirected_final_url(monkeypatch): from tools import web_tools + from plugins.web.firecrawl import provider as firecrawl_provider # Allow test URLs past SSRF check so website policy is what gets tested monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) @@ -424,9 +432,12 @@ async def test_web_extract_blocks_redirected_final_url(monkeypatch): }, } - monkeypatch.setattr(web_tools, "check_website_access", fake_check) - monkeypatch.setattr(web_tools, "_get_firecrawl_client", lambda: FakeFirecrawlClient()) + # After the web-provider migration, the per-URL gate + firecrawl client + # live in the plugin. Patch both at the plugin location. + monkeypatch.setattr(firecrawl_provider, "check_website_access", fake_check) + monkeypatch.setattr(firecrawl_provider, "_get_firecrawl_client", lambda: FakeFirecrawlClient()) monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) + monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"], use_llm_processing=False)) diff --git a/tools/web_tools.py b/tools/web_tools.py index d0d9919d25b..2fc9ebdf99f 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -1618,22 +1618,26 @@ async def web_crawl_tool( crawl_provider = _wsp_get_provider(backend) if backend else None if crawl_provider is not None and not crawl_provider.supports_crawl(): - # Configured name IS registered but doesn't support crawl - # (search-only providers like brave-free / ddgs / searxng). - # Surface a typed error rather than silently switching to a - # different crawl backend. - return json.dumps( - { - "success": False, - "error": ( - f"{crawl_provider.display_name} is a search-only " - "backend and cannot crawl URLs. " - "Set FIRECRAWL_API_KEY for crawling, or use " - "web_search instead." - ), - }, - ensure_ascii=False, - ) + # When the configured provider is search-only AND cannot + # extract URLs either (brave-free / ddgs / searxng), surface a + # typed "search-only" error rather than silently switching to + # a different crawl backend. When the provider supports extract + # but not crawl (e.g. firecrawl), fall through to the legacy + # firecrawl-via-extract path below. + if not crawl_provider.supports_extract(): + return json.dumps( + { + "success": False, + "error": ( + f"{crawl_provider.display_name} is a search-only " + "backend and cannot crawl URLs. " + "Set FIRECRAWL_API_KEY for crawling, or use " + "web_search instead." + ), + }, + ensure_ascii=False, + ) + crawl_provider = None # let legacy firecrawl path handle it if crawl_provider is None: crawl_provider = get_active_crawl_provider()