mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-19 04:52:06 +00:00
fix(web): preserve firecrawl crawl + website-policy gate after migration
Two regressions discovered by running the full tests/tools/ suite after
the dispatcher cutover, both fixed in this commit:
1. web_crawl_tool incorrectly errored "search-only" for firecrawl
---------------------------------------------------------------------
The cutover treated any provider with supports_crawl()==False as a
search-only backend and returned the typed search-only error. But
firecrawl can crawl via the legacy multi-page-extract path inside
web_crawl_tool — it just doesn't expose supports_crawl on the plugin
(adding native firecrawl crawl is a clean follow-up).
Fix: only emit the search-only error when the provider supports
NEITHER crawl NOR extract (brave-free / ddgs / searxng). When the
provider supports extract but not crawl (firecrawl), fall through to
the legacy firecrawl-via-extract path below.
2. firecrawl plugin's check_website_access wasn't patchable
---------------------------------------------------------------------
The plugin imported `from tools.website_policy import check_website_access`
INSIDE the extract() function body, so monkeypatching the name on
plugins.web.firecrawl.provider had no effect — the inner import re-bound
the name on every call.
Fix: hoist the import to module level. Cheap (website_policy itself
has no heavy deps) and makes the standard
monkeypatch.setattr(firecrawl_provider, "check_website_access", ...)
pattern work.
Test updates (tests/tools/test_website_policy.py — 4 tests):
- test_web_extract_short_circuits_blocked_url
- test_web_extract_blocks_redirected_final_url
Both: patch the gate at plugins.web.firecrawl.provider (where it
runs after migration) and force the firecrawl plugin to be the
active extract provider via FIRECRAWL_API_KEY.
- test_web_crawl_short_circuits_blocked_url
- test_web_crawl_blocks_redirected_final_url
Both: unchanged — the dispatcher-level gate at tools.web_tools.py
line 1651 still uses the imported `check_website_access` name and
the firecrawl-fallthrough path is exercised as before.
Verified: 22/22 tests/tools/test_website_policy.py pass.
This commit is contained in:
parent
b05253ceed
commit
5e54330e27
3 changed files with 39 additions and 23 deletions
|
|
@ -51,6 +51,7 @@ import os
|
|||
from typing import Any, Dict, List, Optional, TYPE_CHECKING
|
||||
|
||||
from agent.web_search_provider import WebSearchProvider
|
||||
from tools.website_policy import check_website_access
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -417,9 +418,9 @@ class FirecrawlWebSearchProvider(WebSearchProvider):
|
|||
else:
|
||||
formats = ["markdown", "html"]
|
||||
|
||||
# check_website_access is the legacy policy gate; import inside
|
||||
# the function so the plugin doesn't pay the cost when never used.
|
||||
from tools.website_policy import check_website_access
|
||||
# check_website_access is the legacy policy gate; imported at
|
||||
# module level (lazy-friendly because the website_policy import is
|
||||
# cheap) so monkeypatching it in tests works as expected.
|
||||
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
|
|
|
|||
|
|
@ -350,11 +350,16 @@ def test_browser_navigate_allows_when_shared_file_missing(monkeypatch, tmp_path)
|
|||
@pytest.mark.asyncio
|
||||
async def test_web_extract_short_circuits_blocked_url(monkeypatch):
|
||||
from tools import web_tools
|
||||
from plugins.web.firecrawl import provider as firecrawl_provider
|
||||
|
||||
# Allow test URLs past SSRF check so website policy is what gets tested
|
||||
monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True)
|
||||
# The per-URL website-policy gate moved into the firecrawl plugin's
|
||||
# extract() during the web-provider migration. Patch it at the new
|
||||
# location; the dispatcher-level gate (used by web_crawl_tool's
|
||||
# pre-flight) still lives on tools.web_tools.
|
||||
monkeypatch.setattr(
|
||||
web_tools,
|
||||
firecrawl_provider,
|
||||
"check_website_access",
|
||||
lambda url: {
|
||||
"host": "blocked.test",
|
||||
|
|
@ -364,11 +369,13 @@ async def test_web_extract_short_circuits_blocked_url(monkeypatch):
|
|||
},
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
web_tools,
|
||||
firecrawl_provider,
|
||||
"_get_firecrawl_client",
|
||||
lambda: pytest.fail("firecrawl should not run for blocked URL"),
|
||||
)
|
||||
monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False)
|
||||
# Force the firecrawl plugin to be the active extract provider.
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
|
||||
result = json.loads(await web_tools.web_extract_tool(["https://blocked.test"], use_llm_processing=False))
|
||||
|
||||
|
|
@ -398,6 +405,7 @@ def test_check_website_access_fails_open_on_malformed_config(tmp_path, monkeypat
|
|||
@pytest.mark.asyncio
|
||||
async def test_web_extract_blocks_redirected_final_url(monkeypatch):
|
||||
from tools import web_tools
|
||||
from plugins.web.firecrawl import provider as firecrawl_provider
|
||||
|
||||
# Allow test URLs past SSRF check so website policy is what gets tested
|
||||
monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True)
|
||||
|
|
@ -424,9 +432,12 @@ async def test_web_extract_blocks_redirected_final_url(monkeypatch):
|
|||
},
|
||||
}
|
||||
|
||||
monkeypatch.setattr(web_tools, "check_website_access", fake_check)
|
||||
monkeypatch.setattr(web_tools, "_get_firecrawl_client", lambda: FakeFirecrawlClient())
|
||||
# After the web-provider migration, the per-URL gate + firecrawl client
|
||||
# live in the plugin. Patch both at the plugin location.
|
||||
monkeypatch.setattr(firecrawl_provider, "check_website_access", fake_check)
|
||||
monkeypatch.setattr(firecrawl_provider, "_get_firecrawl_client", lambda: FakeFirecrawlClient())
|
||||
monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False)
|
||||
monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key")
|
||||
|
||||
result = json.loads(await web_tools.web_extract_tool(["https://allowed.test"], use_llm_processing=False))
|
||||
|
||||
|
|
|
|||
|
|
@ -1618,22 +1618,26 @@ async def web_crawl_tool(
|
|||
|
||||
crawl_provider = _wsp_get_provider(backend) if backend else None
|
||||
if crawl_provider is not None and not crawl_provider.supports_crawl():
|
||||
# Configured name IS registered but doesn't support crawl
|
||||
# (search-only providers like brave-free / ddgs / searxng).
|
||||
# Surface a typed error rather than silently switching to a
|
||||
# different crawl backend.
|
||||
return json.dumps(
|
||||
{
|
||||
"success": False,
|
||||
"error": (
|
||||
f"{crawl_provider.display_name} is a search-only "
|
||||
"backend and cannot crawl URLs. "
|
||||
"Set FIRECRAWL_API_KEY for crawling, or use "
|
||||
"web_search instead."
|
||||
),
|
||||
},
|
||||
ensure_ascii=False,
|
||||
)
|
||||
# When the configured provider is search-only AND cannot
|
||||
# extract URLs either (brave-free / ddgs / searxng), surface a
|
||||
# typed "search-only" error rather than silently switching to
|
||||
# a different crawl backend. When the provider supports extract
|
||||
# but not crawl (e.g. firecrawl), fall through to the legacy
|
||||
# firecrawl-via-extract path below.
|
||||
if not crawl_provider.supports_extract():
|
||||
return json.dumps(
|
||||
{
|
||||
"success": False,
|
||||
"error": (
|
||||
f"{crawl_provider.display_name} is a search-only "
|
||||
"backend and cannot crawl URLs. "
|
||||
"Set FIRECRAWL_API_KEY for crawling, or use "
|
||||
"web_search instead."
|
||||
),
|
||||
},
|
||||
ensure_ascii=False,
|
||||
)
|
||||
crawl_provider = None # let legacy firecrawl path handle it
|
||||
if crawl_provider is None:
|
||||
crawl_provider = get_active_crawl_provider()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue