chore(web): remove web_crawl tool + provider crawl plumbing (#33824)

The web_crawl_tool() function was an orphan — no model schema registered
it, no skill or CLI command called it, and the agent had no way to invoke
it. PR #32608 proposed wiring it up as a model-callable tool; we've
decided not to expose crawl as a separate capability since web_search +
web_extract cover the use cases we want models to have.

Removed:
- tools/web_tools.py: web_crawl_tool() (~230 LOC)
- plugins/web/firecrawl/provider.py: supports_crawl() + crawl()
- plugins/web/tavily/provider.py: supports_crawl() + crawl()
- plugins/web/xai/provider.py: supports_crawl() override
- agent/web_search_provider.py: supports_crawl() + crawl() ABC methods
- agent/web_search_registry.py: get_active_crawl_provider() +
  the 'crawl' branch in _resolve()
- agent/display.py: web_crawl tool-progress rendering
- hermes_cli/config.py: 'web_crawl' from TAVILY_API_KEY.tools
- tools/website_policy.py: stale comment reference
- Tests: removed TestWebCrawlTavily class, the two website-policy
  web_crawl tests, the searxng/ddgs/brave-free crawl-error tests,
  the integration test_web_crawl method, and the
  test_unconfigured_crawl_emits_top_level_error test. Trimmed the
  capability-flag parametrize list and the WebSearchProvider ABC
  conformance tests.
- Docs: trimmed the Crawl column from capability tables in both EN
  and zh-Hans, updated the developer-guide ABC table.

Net: 25 files, +115/-1067.

Closes #33762 (the schema-text bug would only have existed if #32608 had landed).
Supersedes #32608.
This commit is contained in:
Teknium 2026-05-28 04:52:42 -07:00 committed by GitHub
parent b243afb68b
commit 5e1f793430
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 115 additions and 1067 deletions

View file

@@ -90,20 +90,17 @@ class TestBundledPluginsRegister:
]
@pytest.mark.parametrize(
"plugin_name,expected_search,expected_extract,expected_crawl",
"plugin_name,expected_search,expected_extract",
[
("brave-free", True, False, False),
("ddgs", True, False, False),
("searxng", True, False, False),
("exa", True, True, False),
("parallel", True, True, False),
("tavily", True, True, True),
# firecrawl: search + extract + crawl. Crawl was originally
# disabled in the migration (fell through to a legacy inline
# path); the follow-up commit enabled it natively.
("firecrawl", True, True, True),
("brave-free", True, False),
("ddgs", True, False),
("searxng", True, False),
("exa", True, True),
("parallel", True, True),
("tavily", True, True),
("firecrawl", True, True),
# xai: search-only via Grok's agentic web_search tool.
("xai", True, False, False),
("xai", True, False),
],
)
def test_capability_flags_match_spec(
@@ -111,7 +108,6 @@ class TestBundledPluginsRegister:
plugin_name: str,
expected_search: bool,
expected_extract: bool,
expected_crawl: bool,
) -> None:
_ensure_plugins_loaded()
from agent.web_search_registry import get_provider
@@ -120,7 +116,6 @@ class TestBundledPluginsRegister:
assert provider is not None, f"plugin {plugin_name!r} not registered"
assert provider.supports_search() is expected_search
assert provider.supports_extract() is expected_extract
assert provider.supports_crawl() is expected_crawl
@pytest.mark.parametrize(
"plugin_name",
@@ -457,38 +452,6 @@ class TestErrorResponseShapes:
if result: # if anything came back, it should be an error entry
assert "error" in result[0]
def test_tavily_crawl_returns_error_dict_when_unconfigured(self) -> None:
_ensure_plugins_loaded()
from agent.web_search_registry import get_provider
p = get_provider("tavily")
assert p is not None
result = p.crawl("https://example.com")
assert isinstance(result, dict)
assert "results" in result
assert isinstance(result["results"], list)
if result["results"]:
assert "error" in result["results"][0]
def test_firecrawl_crawl_returns_error_dict_when_unconfigured(self):
"""firecrawl crawl is async (wraps SDK in to_thread); error must be
surfaced via the per-page result shape, not raised."""
_ensure_plugins_loaded()
from agent.web_search_registry import get_provider
p = get_provider("firecrawl")
assert p is not None
assert inspect.iscoroutinefunction(p.crawl)
result = asyncio.run(p.crawl("https://example.com"))
assert isinstance(result, dict)
assert "results" in result
assert isinstance(result["results"], list)
# Without FIRECRAWL_API_KEY, the plugin's _get_firecrawl_client()
# raises ValueError which is caught and returned as a per-page error.
assert len(result["results"]) >= 1
assert "error" in result["results"][0]
assert result["results"][0]["url"] == "https://example.com"
def test_firecrawl_config_error_points_paid_users_to_nous_subscription(self, monkeypatch):
from plugins.web.firecrawl import provider as firecrawl_provider