mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-24 05:41:40 +00:00
refactor(web): dispatch all three tools through web_search_registry
Cuts over web_search_tool, web_extract_tool, and web_crawl_tool in
tools/web_tools.py to dispatch through agent.web_search_registry
instead of the legacy hardcoded if-elif backend chains.
Per-tool changes:
web_search_tool (sync)
Replace 5 backend branches (parallel, exa, registry-3-providers,
tavily, firecrawl-fallthrough) with a single registry path:
1. _get_search_backend() resolves the configured name
2. _wsp_get_provider(name) for explicit-config-wins semantics
3. get_active_search_provider() fallback for typo / unknown name
4. provider.search(query, limit) — sync for all 7 providers
web_extract_tool (async)
Replace 4 backend branches (parallel-async, exa-sync, tavily-sync,
search-only-error, firecrawl-perurl-loop) with:
1. Same provider resolution as search.
2. When configured backend IS registered but doesn't support
extract (search-only providers like brave-free), surface a
typed "search-only" error matching the legacy text — tests
assert that wording.
3. inspect.iscoroutinefunction(provider.extract) detects sync vs
async: parallel + firecrawl are async; exa + tavily are sync.
Sync extracts run in asyncio.to_thread() so we don't block.
web_crawl_tool (async)
Replace tavily-specific branch + search-only-error block with:
1. _wsp_get_provider(backend) — explicit config first
2. Search-only typed error when the configured name doesn't
support crawl (matches legacy phrasing)
3. get_active_crawl_provider() fallback otherwise
4. provider.crawl(url, **kwargs) — async-or-sync dispatch as above
5. Response post-processing (LLM summarization, trimming) stays
unchanged — it's not provider-specific.
When no plugin advertises supports_crawl, falls through to the
existing Firecrawl-via-web-summarize path below (unchanged).
Test updates (2 tests in tests/tools/test_web_tools_config.py):
- test_web_search_clamps_limit_before_backend_call:
patch("tools.web_tools._parallel_search") -> patch the registry
provider returned by agent.web_search_registry.get_provider
- test_search_error_response_does_not_expose_diagnostics:
patch("tools.web_tools._get_firecrawl_client") -> same pattern
Tests unchanged (still pass):
- All TestXBackendWiring classes (test _get_backend / _is_backend_available
config-resolution, independent of dispatch)
- All TestXSearchOnlyErrors classes (test the search-only error path
via web_extract_tool / web_crawl_tool — error text preserved)
- 141 passing web tests total, 0 regressions.
Dead-code cleanup deferred to a follow-up commit so this diff stays
focused on the cutover. After this commit:
- tools.web_tools._exa_search / _exa_extract / _parallel_search /
_parallel_extract / _tavily_request / _normalize_tavily_* /
_get_firecrawl_client / _extract_web_search_results /
_extract_scrape_payload / _to_plain_object / _normalize_result_list
are no longer called by the dispatchers, but still exist.
- The config-resolution layer (_get_backend, _is_backend_available,
_is_tool_gateway_ready, _has_direct_firecrawl_config) IS still in
use and must stay.
- The Firecrawl proxy and check_firecrawl_api_key are still imported
by integration tests and patched by unit tests — must stay (or be
re-exported from the plugin).
This commit is contained in:
parent
143184e943
commit
b05253ceed
2 changed files with 175 additions and 238 deletions
|
|
@ -485,15 +485,28 @@ class TestWebSearchSchema:
|
||||||
def test_web_search_clamps_limit_before_backend_call(self):
|
def test_web_search_clamps_limit_before_backend_call(self):
|
||||||
import tools.web_tools
|
import tools.web_tools
|
||||||
|
|
||||||
with patch("tools.web_tools._get_backend", return_value="parallel"), \
|
# After the web-provider plugin migration, _parallel_search lives in
|
||||||
patch("tools.web_tools._parallel_search", return_value={"success": True, "data": {"web": []}}) as mock_search, \
|
# plugins.web.parallel.provider.ParallelWebSearchProvider.search; the
|
||||||
|
# tool dispatcher resolves a provider from the registry and calls
|
||||||
|
# provider.search(query, limit). Mock the provider lookup so we can
|
||||||
|
# assert the limit is clamped before reaching the backend.
|
||||||
|
fake_search = MagicMock(return_value={"success": True, "data": {"web": []}})
|
||||||
|
fake_provider = MagicMock(
|
||||||
|
name="ParallelWebSearchProvider",
|
||||||
|
supports_search=MagicMock(return_value=True),
|
||||||
|
)
|
||||||
|
fake_provider.search = fake_search
|
||||||
|
fake_provider.name = "parallel"
|
||||||
|
|
||||||
|
with patch("tools.web_tools._get_search_backend", return_value="parallel"), \
|
||||||
|
patch("agent.web_search_registry.get_provider", return_value=fake_provider), \
|
||||||
patch("tools.interrupt.is_interrupted", return_value=False), \
|
patch("tools.interrupt.is_interrupted", return_value=False), \
|
||||||
patch.object(tools.web_tools._debug, "log_call"), \
|
patch.object(tools.web_tools._debug, "log_call"), \
|
||||||
patch.object(tools.web_tools._debug, "save"):
|
patch.object(tools.web_tools._debug, "save"):
|
||||||
result = json.loads(tools.web_tools.web_search_tool("docs", limit=500))
|
result = json.loads(tools.web_tools.web_search_tool("docs", limit=500))
|
||||||
|
|
||||||
assert result == {"success": True, "data": {"web": []}}
|
assert result == {"success": True, "data": {"web": []}}
|
||||||
mock_search.assert_called_once_with("docs", 100)
|
fake_search.assert_called_once_with("docs", 100)
|
||||||
|
|
||||||
|
|
||||||
class TestWebSearchErrorHandling:
|
class TestWebSearchErrorHandling:
|
||||||
|
|
@ -502,11 +515,19 @@ class TestWebSearchErrorHandling:
|
||||||
def test_search_error_response_does_not_expose_diagnostics(self):
|
def test_search_error_response_does_not_expose_diagnostics(self):
|
||||||
import tools.web_tools
|
import tools.web_tools
|
||||||
|
|
||||||
firecrawl_client = MagicMock()
|
# After the web-provider plugin migration, the firecrawl client lives
|
||||||
firecrawl_client.search.side_effect = RuntimeError("boom")
|
# at plugins.web.firecrawl.provider._get_firecrawl_client. We mock the
|
||||||
|
# registry's get_provider to return a fake provider whose .search()
|
||||||
|
# raises so we can verify error sanitization.
|
||||||
|
fake_provider = MagicMock(
|
||||||
|
name="FirecrawlWebSearchProvider",
|
||||||
|
supports_search=MagicMock(return_value=True),
|
||||||
|
)
|
||||||
|
fake_provider.search.side_effect = RuntimeError("boom")
|
||||||
|
fake_provider.name = "firecrawl"
|
||||||
|
|
||||||
with patch("tools.web_tools._get_backend", return_value="firecrawl"), \
|
with patch("tools.web_tools._get_search_backend", return_value="firecrawl"), \
|
||||||
patch("tools.web_tools._get_firecrawl_client", return_value=firecrawl_client), \
|
patch("agent.web_search_registry.get_provider", return_value=fake_provider), \
|
||||||
patch("tools.interrupt.is_interrupted", return_value=False), \
|
patch("tools.interrupt.is_interrupted", return_value=False), \
|
||||||
patch.object(tools.web_tools._debug, "log_call") as mock_log_call, \
|
patch.object(tools.web_tools._debug, "log_call") as mock_log_call, \
|
||||||
patch.object(tools.web_tools._debug, "save"):
|
patch.object(tools.web_tools._debug, "save"):
|
||||||
|
|
|
||||||
|
|
@ -1229,100 +1229,43 @@ def web_search_tool(query: str, limit: int = 5) -> str:
|
||||||
if is_interrupted():
|
if is_interrupted():
|
||||||
return tool_error("Interrupted", success=False)
|
return tool_error("Interrupted", success=False)
|
||||||
|
|
||||||
# Dispatch to the configured search backend
|
# Dispatch through the web search registry. All 7 providers
|
||||||
backend = _get_search_backend()
|
# (brave-free, ddgs, searxng, exa, parallel, tavily, firecrawl)
|
||||||
if backend == "parallel":
|
# now live as plugins; the dispatcher is just a registry lookup +
|
||||||
response_data = _parallel_search(query, limit)
|
# delegation. Sync only — every provider's search() is sync.
|
||||||
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
|
from agent.web_search_registry import (
|
||||||
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
get_active_search_provider,
|
||||||
debug_call_data["final_response_size"] = len(result_json)
|
get_provider as _wsp_get_provider,
|
||||||
_debug.log_call("web_search_tool", debug_call_data)
|
|
||||||
_debug.save()
|
|
||||||
return result_json
|
|
||||||
|
|
||||||
if backend == "exa":
|
|
||||||
response_data = _exa_search(query, limit)
|
|
||||||
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
|
|
||||||
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
|
||||||
debug_call_data["final_response_size"] = len(result_json)
|
|
||||||
_debug.log_call("web_search_tool", debug_call_data)
|
|
||||||
_debug.save()
|
|
||||||
return result_json
|
|
||||||
|
|
||||||
# Plugin-backed providers (brave-free, ddgs, searxng) — dispatched
|
|
||||||
# through agent.web_search_registry. Inline providers (parallel,
|
|
||||||
# exa, tavily, firecrawl) keep their own branches below until they
|
|
||||||
# too migrate to plugins. Spike scope: only the three providers
|
|
||||||
# already living in tools/web_providers/ are moved to plugins; the
|
|
||||||
# rest follow in the real migration PR.
|
|
||||||
if backend in {"brave-free", "ddgs", "searxng"}:
|
|
||||||
from agent.web_search_registry import get_provider as _wsp_get_provider
|
|
||||||
|
|
||||||
provider = _wsp_get_provider(backend)
|
|
||||||
if provider is None or not provider.supports_search():
|
|
||||||
response_data = {
|
|
||||||
"success": False,
|
|
||||||
"error": (
|
|
||||||
f"Web search provider '{backend}' is not registered. "
|
|
||||||
"Run `hermes tools` to set up a provider."
|
|
||||||
),
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
response_data = provider.search(query, limit)
|
|
||||||
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
|
|
||||||
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
|
||||||
debug_call_data["final_response_size"] = len(result_json)
|
|
||||||
_debug.log_call("web_search_tool", debug_call_data)
|
|
||||||
_debug.save()
|
|
||||||
return result_json
|
|
||||||
|
|
||||||
if backend == "tavily":
|
|
||||||
logger.info("Tavily search: '%s' (limit: %d)", query, limit)
|
|
||||||
raw = _tavily_request("search", {
|
|
||||||
"query": query,
|
|
||||||
"max_results": min(limit, 20),
|
|
||||||
"include_raw_content": False,
|
|
||||||
"include_images": False,
|
|
||||||
})
|
|
||||||
response_data = _normalize_tavily_search_results(raw)
|
|
||||||
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
|
|
||||||
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
|
||||||
debug_call_data["final_response_size"] = len(result_json)
|
|
||||||
_debug.log_call("web_search_tool", debug_call_data)
|
|
||||||
_debug.save()
|
|
||||||
return result_json
|
|
||||||
|
|
||||||
logger.info("Searching the web for: '%s' (limit: %d)", query, limit)
|
|
||||||
|
|
||||||
response = _get_firecrawl_client().search(
|
|
||||||
query=query,
|
|
||||||
limit=limit
|
|
||||||
)
|
)
|
||||||
|
|
||||||
web_results = _extract_web_search_results(response)
|
backend = _get_search_backend()
|
||||||
results_count = len(web_results)
|
provider = _wsp_get_provider(backend) if backend else None
|
||||||
logger.info("Found %d search results", results_count)
|
if provider is None or not provider.supports_search():
|
||||||
|
# Fall back to availability-walked active provider when the
|
||||||
|
# configured backend isn't a registered search provider (typo,
|
||||||
|
# uninstalled plugin, or capability mismatch).
|
||||||
|
provider = get_active_search_provider()
|
||||||
|
|
||||||
# Build response with just search metadata (URLs, titles, descriptions)
|
if provider is None:
|
||||||
response_data = {
|
response_data = {
|
||||||
"success": True,
|
"success": False,
|
||||||
"data": {
|
"error": (
|
||||||
"web": web_results
|
"No web search provider configured. "
|
||||||
|
"Run `hermes tools` to set one up."
|
||||||
|
),
|
||||||
}
|
}
|
||||||
}
|
else:
|
||||||
|
logger.info(
|
||||||
|
"Web search via %s: '%s' (limit: %d)",
|
||||||
|
provider.name, query, limit,
|
||||||
|
)
|
||||||
|
response_data = provider.search(query, limit)
|
||||||
|
|
||||||
# Capture debug information
|
debug_call_data["results_count"] = len(response_data.get("data", {}).get("web", []))
|
||||||
debug_call_data["results_count"] = results_count
|
|
||||||
|
|
||||||
# Convert to JSON
|
|
||||||
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
result_json = json.dumps(response_data, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
debug_call_data["final_response_size"] = len(result_json)
|
debug_call_data["final_response_size"] = len(result_json)
|
||||||
|
|
||||||
# Log debug information
|
|
||||||
_debug.log_call("web_search_tool", debug_call_data)
|
_debug.log_call("web_search_tool", debug_call_data)
|
||||||
_debug.save()
|
_debug.save()
|
||||||
|
|
||||||
return result_json
|
return result_json
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -1415,129 +1358,68 @@ async def web_extract_tool(
|
||||||
else:
|
else:
|
||||||
backend = _get_extract_backend()
|
backend = _get_extract_backend()
|
||||||
|
|
||||||
if backend == "parallel":
|
# All seven providers (brave-free, ddgs, searxng, exa, parallel,
|
||||||
results = await _parallel_extract(safe_urls)
|
# tavily, firecrawl) now live as plugins. The dispatcher is a
|
||||||
elif backend == "exa":
|
# registry lookup + delegation. Some providers' extract() is
|
||||||
results = _exa_extract(safe_urls)
|
# async (parallel, firecrawl), others sync (exa, tavily) — we
|
||||||
elif backend == "tavily":
|
# detect coroutine functions and await; sync functions run
|
||||||
logger.info("Tavily extract: %d URL(s)", len(safe_urls))
|
# inline (the policy gate, SSRF re-check, etc. live inside the
|
||||||
raw = _tavily_request("extract", {
|
# provider itself for the firecrawl per-URL loop).
|
||||||
"urls": safe_urls,
|
from agent.web_search_registry import (
|
||||||
"include_images": False,
|
get_active_extract_provider,
|
||||||
})
|
get_provider as _wsp_get_provider,
|
||||||
results = _normalize_tavily_documents(raw, fallback_url=safe_urls[0] if safe_urls else "")
|
)
|
||||||
elif backend in {"searxng", "brave-free", "ddgs"}:
|
|
||||||
# These backends are search-only — they cannot extract URL content
|
provider = _wsp_get_provider(backend) if backend else None
|
||||||
_label = {"searxng": "SearXNG", "brave-free": "Brave Search (free tier)", "ddgs": "DuckDuckGo (ddgs)"}[backend]
|
if provider is None or not provider.supports_extract():
|
||||||
return json.dumps({
|
# When the configured name IS registered but doesn't support
|
||||||
"success": False,
|
# extract (search-only providers like brave-free / ddgs /
|
||||||
"error": f"{_label} is a search-only backend and cannot extract URL content. "
|
# searxng), surface that as a typed "search-only" error
|
||||||
"Set web.extract_backend to firecrawl, tavily, exa, or parallel.",
|
# rather than silently switching backends. When the name
|
||||||
}, ensure_ascii=False)
|
# isn't registered at all (typo / uninstalled plugin), fall
|
||||||
|
# through to the active-provider walk.
|
||||||
|
if provider is not None and not provider.supports_extract():
|
||||||
|
return json.dumps(
|
||||||
|
{
|
||||||
|
"success": False,
|
||||||
|
"error": (
|
||||||
|
f"{provider.display_name} is a search-only "
|
||||||
|
"backend and cannot extract URL content. "
|
||||||
|
"Set web.extract_backend to firecrawl, "
|
||||||
|
"tavily, exa, or parallel."
|
||||||
|
),
|
||||||
|
},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
provider = get_active_extract_provider()
|
||||||
|
if provider is None:
|
||||||
|
return json.dumps(
|
||||||
|
{
|
||||||
|
"success": False,
|
||||||
|
"error": (
|
||||||
|
"No web extract provider configured. "
|
||||||
|
"Set web.extract_backend to firecrawl, "
|
||||||
|
"tavily, exa, or parallel."
|
||||||
|
),
|
||||||
|
},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Web extract via %s: %d URL(s)", provider.name, len(safe_urls)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Async-or-sync dispatch: parallel + firecrawl have async
|
||||||
|
# extract(); exa + tavily are sync.
|
||||||
|
import inspect
|
||||||
|
if inspect.iscoroutinefunction(provider.extract):
|
||||||
|
results = await provider.extract(safe_urls, format=format)
|
||||||
else:
|
else:
|
||||||
# ── Firecrawl extraction ──
|
# Run sync extract() in a thread so we don't block the
|
||||||
# Determine requested formats for Firecrawl v2
|
# event loop on network I/O.
|
||||||
formats: List[str] = []
|
results = await asyncio.to_thread(
|
||||||
if format == "markdown":
|
provider.extract, safe_urls, format=format
|
||||||
formats = ["markdown"]
|
)
|
||||||
elif format == "html":
|
|
||||||
formats = ["html"]
|
|
||||||
else:
|
|
||||||
# Default: request markdown for LLM-readiness and include html as backup
|
|
||||||
formats = ["markdown", "html"]
|
|
||||||
|
|
||||||
# Always use individual scraping for simplicity and reliability
|
|
||||||
# Batch scraping adds complexity without much benefit for small numbers of URLs
|
|
||||||
results: List[Dict[str, Any]] = []
|
|
||||||
|
|
||||||
from tools.interrupt import is_interrupted as _is_interrupted
|
|
||||||
for url in safe_urls:
|
|
||||||
if _is_interrupted():
|
|
||||||
results.append({"url": url, "error": "Interrupted", "title": ""})
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Website policy check — block before fetching
|
|
||||||
blocked = check_website_access(url)
|
|
||||||
if blocked:
|
|
||||||
logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"])
|
|
||||||
results.append({
|
|
||||||
"url": url, "title": "", "content": "",
|
|
||||||
"error": blocked["message"],
|
|
||||||
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
logger.info("Scraping: %s", url)
|
|
||||||
# Run synchronous Firecrawl scrape in a thread with a
|
|
||||||
# 60s timeout so a hung fetch doesn't block the session.
|
|
||||||
try:
|
|
||||||
scrape_result = await asyncio.wait_for(
|
|
||||||
asyncio.to_thread(
|
|
||||||
_get_firecrawl_client().scrape,
|
|
||||||
url=url,
|
|
||||||
formats=formats,
|
|
||||||
),
|
|
||||||
timeout=60,
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
logger.warning("Firecrawl scrape timed out for %s", url)
|
|
||||||
results.append({
|
|
||||||
"url": url, "title": "", "content": "",
|
|
||||||
"error": "Scrape timed out after 60s — page may be too large or unresponsive. Try browser_navigate instead.",
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
scrape_payload = _extract_scrape_payload(scrape_result)
|
|
||||||
metadata = scrape_payload.get("metadata", {})
|
|
||||||
title = ""
|
|
||||||
content_markdown = scrape_payload.get("markdown")
|
|
||||||
content_html = scrape_payload.get("html")
|
|
||||||
|
|
||||||
# Ensure metadata is a dict (not an object)
|
|
||||||
if not isinstance(metadata, dict):
|
|
||||||
if hasattr(metadata, 'model_dump'):
|
|
||||||
metadata = metadata.model_dump()
|
|
||||||
elif hasattr(metadata, '__dict__'):
|
|
||||||
metadata = metadata.__dict__
|
|
||||||
else:
|
|
||||||
metadata = {}
|
|
||||||
|
|
||||||
# Get title from metadata
|
|
||||||
title = metadata.get("title", "")
|
|
||||||
|
|
||||||
# Re-check final URL after redirect
|
|
||||||
final_url = metadata.get("sourceURL", url)
|
|
||||||
final_blocked = check_website_access(final_url)
|
|
||||||
if final_blocked:
|
|
||||||
logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"])
|
|
||||||
results.append({
|
|
||||||
"url": final_url, "title": title, "content": "", "raw_content": "",
|
|
||||||
"error": final_blocked["message"],
|
|
||||||
"blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]},
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Choose content based on requested format
|
|
||||||
chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
|
|
||||||
|
|
||||||
results.append({
|
|
||||||
"url": final_url,
|
|
||||||
"title": title,
|
|
||||||
"content": chosen_content,
|
|
||||||
"raw_content": chosen_content,
|
|
||||||
"metadata": metadata # Now guaranteed to be a dict
|
|
||||||
})
|
|
||||||
|
|
||||||
except Exception as scrape_err:
|
|
||||||
logger.debug("Scrape failed for %s: %s", url, scrape_err)
|
|
||||||
results.append({
|
|
||||||
"url": url,
|
|
||||||
"title": "",
|
|
||||||
"content": "",
|
|
||||||
"raw_content": "",
|
|
||||||
"error": str(scrape_err)
|
|
||||||
})
|
|
||||||
|
|
||||||
# Merge any SSRF-blocked results back in
|
# Merge any SSRF-blocked results back in
|
||||||
if ssrf_blocked:
|
if ssrf_blocked:
|
||||||
|
|
@ -1725,8 +1607,37 @@ async def web_crawl_tool(
|
||||||
auxiliary_available = check_auxiliary_model()
|
auxiliary_available = check_auxiliary_model()
|
||||||
backend = _get_backend()
|
backend = _get_backend()
|
||||||
|
|
||||||
# Tavily supports crawl via its /crawl endpoint
|
# Tavily (and any future plugin advertising supports_crawl=True)
|
||||||
if backend == "tavily":
|
# dispatches through agent.web_search_registry. The crawl response
|
||||||
|
# shape — {"results": [{"url", "title", "content", ...}]} — is then
|
||||||
|
# post-processed by the shared LLM-summarization path below.
|
||||||
|
from agent.web_search_registry import (
|
||||||
|
get_active_crawl_provider,
|
||||||
|
get_provider as _wsp_get_provider,
|
||||||
|
)
|
||||||
|
|
||||||
|
crawl_provider = _wsp_get_provider(backend) if backend else None
|
||||||
|
if crawl_provider is not None and not crawl_provider.supports_crawl():
|
||||||
|
# Configured name IS registered but doesn't support crawl
|
||||||
|
# (search-only providers like brave-free / ddgs / searxng).
|
||||||
|
# Surface a typed error rather than silently switching to a
|
||||||
|
# different crawl backend.
|
||||||
|
return json.dumps(
|
||||||
|
{
|
||||||
|
"success": False,
|
||||||
|
"error": (
|
||||||
|
f"{crawl_provider.display_name} is a search-only "
|
||||||
|
"backend and cannot crawl URLs. "
|
||||||
|
"Set FIRECRAWL_API_KEY for crawling, or use "
|
||||||
|
"web_search instead."
|
||||||
|
),
|
||||||
|
},
|
||||||
|
ensure_ascii=False,
|
||||||
|
)
|
||||||
|
if crawl_provider is None:
|
||||||
|
crawl_provider = get_active_crawl_provider()
|
||||||
|
|
||||||
|
if crawl_provider is not None:
|
||||||
# Ensure URL has protocol
|
# Ensure URL has protocol
|
||||||
if not url.startswith(('http://', 'https://')):
|
if not url.startswith(('http://', 'https://')):
|
||||||
url = f'https://{url}'
|
url = f'https://{url}'
|
||||||
|
|
@ -1747,18 +1658,28 @@ async def web_crawl_tool(
|
||||||
if _is_int():
|
if _is_int():
|
||||||
return tool_error("Interrupted", success=False)
|
return tool_error("Interrupted", success=False)
|
||||||
|
|
||||||
logger.info("Tavily crawl: %s", url)
|
logger.info("Web crawl via %s: %s", crawl_provider.name, url)
|
||||||
payload: Dict[str, Any] = {
|
|
||||||
"url": url,
|
# Async-or-sync dispatch — Tavily's crawl is sync, but a future
|
||||||
"limit": 20,
|
# async-crawl provider works transparently.
|
||||||
"extract_depth": depth,
|
import inspect
|
||||||
}
|
crawl_kwargs = {"depth": depth, "limit": 20}
|
||||||
if instructions:
|
if instructions:
|
||||||
payload["instructions"] = instructions
|
crawl_kwargs["instructions"] = instructions
|
||||||
raw = _tavily_request("crawl", payload)
|
|
||||||
results = _normalize_tavily_documents(raw, fallback_url=url)
|
if inspect.iscoroutinefunction(crawl_provider.crawl):
|
||||||
|
response = await crawl_provider.crawl(url, **crawl_kwargs)
|
||||||
|
else:
|
||||||
|
response = await asyncio.to_thread(
|
||||||
|
crawl_provider.crawl, url, **crawl_kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
# Provider returns {"results": [...]} matching what the shared
|
||||||
|
# LLM post-processing below expects.
|
||||||
|
if not isinstance(response, dict):
|
||||||
|
response = {"results": []}
|
||||||
|
response.setdefault("results", [])
|
||||||
|
|
||||||
response = {"results": results}
|
|
||||||
# Fall through to the shared LLM processing and trimming below
|
# Fall through to the shared LLM processing and trimming below
|
||||||
# (skip the Firecrawl-specific crawl logic)
|
# (skip the Firecrawl-specific crawl logic)
|
||||||
pages_crawled = len(response.get('results', []))
|
pages_crawled = len(response.get('results', []))
|
||||||
|
|
@ -1809,14 +1730,9 @@ async def web_crawl_tool(
|
||||||
_debug.save()
|
_debug.save()
|
||||||
return cleaned_result
|
return cleaned_result
|
||||||
|
|
||||||
# SearXNG / Brave Search (free tier) / DuckDuckGo (ddgs) are search-only — they cannot crawl
|
# No registered provider supports crawl. Fall through to the
|
||||||
if backend in {"searxng", "brave-free", "ddgs"}:
|
# Firecrawl-via-summarize path below (legacy behavior) when
|
||||||
_label = {"searxng": "SearXNG", "brave-free": "Brave Search (free tier)", "ddgs": "DuckDuckGo (ddgs)"}[backend]
|
# Firecrawl credentials are configured.
|
||||||
return json.dumps({
|
|
||||||
"error": f"{_label} is a search-only backend and cannot crawl URLs. "
|
|
||||||
"Set FIRECRAWL_API_KEY for crawling, or use web_search instead.",
|
|
||||||
"success": False,
|
|
||||||
}, ensure_ascii=False)
|
|
||||||
|
|
||||||
# web_crawl requires Firecrawl or the Firecrawl tool-gateway — Parallel has no crawl API
|
# web_crawl requires Firecrawl or the Firecrawl tool-gateway — Parallel has no crawl API
|
||||||
if not check_firecrawl_api_key():
|
if not check_firecrawl_api_key():
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue