From 5e1f793430ccab74808b9f7019e071ea3c638381 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 28 May 2026 04:52:42 -0700 Subject: [PATCH] chore(web): remove web_crawl tool + provider crawl plumbing (#33824) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The web_crawl_tool() function was an orphan — no model schema registered it, no skill or CLI command called it, and the agent had no way to invoke it. PR #32608 proposed wiring it up as a model-callable tool; we've decided not to expose crawl as a separate capability since web_search + web_extract cover the use cases we want models to have. Removed: - tools/web_tools.py: web_crawl_tool() (~230 LOC) - plugins/web/firecrawl/provider.py: supports_crawl() + crawl() - plugins/web/tavily/provider.py: supports_crawl() + crawl() - plugins/web/xai/provider.py: supports_crawl() override - agent/web_search_provider.py: supports_crawl() + crawl() ABC methods - agent/web_search_registry.py: get_active_crawl_provider() + the 'crawl' branch in _resolve() - agent/display.py: web_crawl tool-progress rendering - hermes_cli/config.py: 'web_crawl' from TAVILY_API_KEY.tools - tools/website_policy.py: stale comment reference - Tests: removed TestWebCrawlTavily class, the two website-policy web_crawl tests, the searxng/ddgs/brave-free crawl-error tests, the integration test_web_crawl method, and the test_unconfigured_crawl_emits_top_level_error test. Trimmed the capability-flag parametrize list and the WebSearchProvider ABC conformance tests. - Docs: trimmed the Crawl column from capability tables in both EN and zh-Hans, updated the developer-guide ABC table. Net: 25 files, +115/-1067. Closes #33762 (the schema-text bug only existed if #32608 landed). Supersedes #32608. --- agent/display.py | 4 - agent/web_search_provider.py | 48 +--- agent/web_search_registry.py | 29 +- hermes_cli/config.py | 4 +- plugins/web/firecrawl/provider.py | 185 +------------ plugins/web/tavily/__init__.py | 7 +- plugins/web/tavily/provider.py | 81 +----- plugins/web/xai/provider.py | 3 - tests/integration/test_web_tools.py | 111 -------- .../web/test_web_search_provider_plugins.py | 55 +--- tests/tools/test_web_providers.py | 43 +-- tests/tools/test_web_providers_brave_free.py | 24 +- tests/tools/test_web_providers_ddgs.py | 24 +- tests/tools/test_web_providers_searxng.py | 22 +- tests/tools/test_web_providers_xai.py | 1 - tests/tools/test_web_tools_tavily.py | 63 +---- tests/tools/test_website_policy.py | 95 +------ tools/web_tools.py | 255 +----------------- tools/website_policy.py | 2 +- .../web-search-provider-plugin.md | 7 +- website/docs/user-guide/configuration.md | 18 +- .../docs/user-guide/features/web-search.md | 38 +-- .../web-search-provider-plugin.md | 7 +- .../current/user-guide/configuration.md | 18 +- .../current/user-guide/features/web-search.md | 38 +-- 25 files changed, 115 insertions(+), 1067 deletions(-) diff --git a/agent/display.py b/agent/display.py index 02880a83e0d..8514279888e 100644 --- a/agent/display.py +++ b/agent/display.py @@ -904,10 +904,6 @@ def get_cute_tool_message( extra = f" +{len(urls)-1}" if len(urls) > 1 else "" return _wrap(f"┊ 📄 fetch {_trunc(domain, 35)}{extra} {dur}") return _wrap(f"┊ 📄 fetch pages {dur}") - if tool_name == "web_crawl": - url = args.get("url", "") - domain = url.replace("https://", "").replace("http://", "").split("/")[0] - return _wrap(f"┊ 🕸️ crawl {_trunc(domain, 35)} {dur}") if tool_name == "terminal": return _wrap(f"┊ 💻 $ {_trunc(args.get('command', ''), 42)} {dur}") if tool_name == "process": diff --git a/agent/web_search_provider.py b/agent/web_search_provider.py index 7223bbf2cfe..685eb68b337 100644 --- a/agent/web_search_provider.py +++ b/agent/web_search_provider.py @@ -61,14 +61,14 @@ from typing import Any, Dict, List class WebSearchProvider(abc.ABC): - """Abstract base class for a web search/extract/crawl backend. + """Abstract base class for a web search/extract backend. Subclasses must implement :meth:`is_available` and at least one of - :meth:`search` / :meth:`extract` / :meth:`crawl`. The - :meth:`supports_search` / :meth:`supports_extract` / :meth:`supports_crawl` - capability flags let the registry route each tool call to the right - provider, and let multi-capability providers (Firecrawl, Tavily, Exa, - …) advertise multiple capabilities from a single class. + :meth:`search` / :meth:`extract`. The :meth:`supports_search` / + :meth:`supports_extract` capability flags let the registry route each + tool call to the right provider, and let multi-capability providers + (Firecrawl, Tavily, Exa, …) advertise multiple capabilities from a + single class. """ @property @@ -113,22 +113,6 @@ class WebSearchProvider(abc.ABC): """ return False - def supports_crawl(self) -> bool: - """Return True if this provider implements :meth:`crawl`. - - Crawl differs from extract in that the agent provides a *seed URL* - and the provider walks linked pages on its own — useful for - documentation sites where the agent doesn't know all relevant - URLs upfront. Tavily is the only built-in backend that natively - crawls today; Firecrawl provides a similar capability that we - don't currently surface as a tool. - - Providers that don't crawl should leave this as False; the - dispatcher in :func:`tools.web_tools.web_crawl_tool` will fall - back to its auxiliary-model summarization path. - """ - return False - def search(self, query: str, limit: int = 5) -> Dict[str, Any]: """Execute a web search. @@ -173,26 +157,6 @@ class WebSearchProvider(abc.ABC): f"{self.name} does not support extract (override supports_extract)" ) - def crawl(self, url: str, **kwargs: Any) -> Any: - """Crawl a seed URL and return results. - - Override when :meth:`supports_crawl` returns True. The default - raises NotImplementedError; callers should gate on - :meth:`supports_crawl` before calling. - - Return shape: ``{"results": [{"url": str, "title": str, - "content": str, ...}, ...]}`` matching what - :func:`tools.web_tools.web_crawl_tool` post-processing expects. - - Implementations MAY be ``async def``. - - ``kwargs`` may carry forward-compat fields (e.g. ``max_depth``, - ``include_domains``) — implementations should ignore unknown keys. - """ - raise NotImplementedError( - f"{self.name} does not support crawl (override supports_crawl)" - ) - def get_setup_schema(self) -> Dict[str, Any]: """Return provider metadata for the ``hermes tools`` picker. diff --git a/agent/web_search_registry.py b/agent/web_search_registry.py index c61c16cadb2..079c755787c 100644 --- a/agent/web_search_registry.py +++ b/agent/web_search_registry.py @@ -11,7 +11,7 @@ Active selection ---------------- The active provider is chosen by configuration with this precedence: -1. ``web.search_backend`` / ``web.extract_backend`` / ``web.crawl_backend`` +1. ``web.search_backend`` / ``web.extract_backend`` (per-capability override). 2. ``web.backend`` (shared fallback). 3. If exactly one capability-eligible provider is registered AND available, @@ -24,10 +24,10 @@ The active provider is chosen by configuration with this precedence: 5. Otherwise ``None`` — the tool surfaces a helpful error pointing at ``hermes tools``. -The capability filter (``supports_search`` / ``supports_extract`` / -``supports_crawl``) is applied at every step so a search-only provider -(``brave-free``) configured as ``web.extract_backend`` correctly falls -through to an extract-capable backend. +The capability filter (``supports_search`` / ``supports_extract``) is +applied at every step so a search-only provider (``brave-free``) +configured as ``web.extract_backend`` correctly falls through to an +extract-capable backend. """ from __future__ import annotations @@ -131,7 +131,7 @@ _LEGACY_PREFERENCE = ( def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearchProvider]: - """Resolve the active provider for a capability ("search" | "extract" | "crawl"). + """Resolve the active provider for a capability ("search" | "extract"). Resolution rules (in order): @@ -168,8 +168,6 @@ def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearc return bool(p.supports_search()) if capability == "extract": return bool(p.supports_extract()) - if capability == "crawl": - return bool(p.supports_crawl()) return False def _is_available_safe(p: WebSearchProvider) -> bool: @@ -241,21 +239,6 @@ def get_active_extract_provider() -> Optional[WebSearchProvider]: return _resolve(explicit, capability="extract") -def get_active_crawl_provider() -> Optional[WebSearchProvider]: - """Resolve the currently-active web crawl provider. - - Reads ``web.crawl_backend`` (preferred) or ``web.backend`` (shared - fallback) from config.yaml; falls back per the module docstring. - - Crawl is a niche capability — among built-in providers only Tavily and - Firecrawl implement it. Callers should expect ``None`` and fall back to - a different strategy (e.g. summarize-via-LLM) when neither is - configured. - """ - explicit = _read_config_key("web", "crawl_backend") or _read_config_key("web", "backend") - return _resolve(explicit, capability="crawl") - - def _reset_for_tests() -> None: """Clear the registry. **Test-only.**""" with _lock: diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 297f18b041e..96fb77b4c49 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -2505,10 +2505,10 @@ OPTIONAL_ENV_VARS = { "advanced": True, }, "TAVILY_API_KEY": { - "description": "Tavily API key for AI-native web search, extract, and crawl", + "description": "Tavily API key for AI-native web search and extract", "prompt": "Tavily API key", "url": "https://app.tavily.com/home", - "tools": ["web_search", "web_extract", "web_crawl"], + "tools": ["web_search", "web_extract"], "password": True, "category": "tool", }, diff --git a/plugins/web/firecrawl/provider.py b/plugins/web/firecrawl/provider.py index d0415781518..9e3f123e520 100644 --- a/plugins/web/firecrawl/provider.py +++ b/plugins/web/firecrawl/provider.py @@ -385,9 +385,6 @@ class FirecrawlWebSearchProvider(WebSearchProvider): def supports_extract(self) -> bool: return True - def supports_crawl(self) -> bool: - return True - def search(self, query: str, limit: int = 5) -> Dict[str, Any]: """Execute a Firecrawl search. @@ -579,192 +576,12 @@ class FirecrawlWebSearchProvider(WebSearchProvider): return results - async def crawl(self, url: str, **kwargs: Any) -> Dict[str, Any]: - """Crawl a seed URL via Firecrawl's ``/crawl`` endpoint. - - Sync SDK call wrapped in ``asyncio.to_thread`` because the dispatcher - in :func:`tools.web_tools.web_crawl_tool` is async and runs LLM - post-processing on the response. The dispatcher gates the seed URL - against SSRF + website-access policy before calling us; this method - re-checks every crawled page's URL against the policy after the - crawl returns to catch redirected pages that map to a blocked host. - - Accepted kwargs (others ignored for forward compat): - - ``instructions``: str — logged then dropped. Firecrawl's /crawl - endpoint does NOT accept natural-language instructions (that's - an /extract feature), so we record the value for debugging and - proceed without it. Tavily's crawl IS instruction-aware; this - divergence is documented in both plugins' docstrings. - - ``limit``: int — max pages to crawl (default 20). - - ``depth``: str — accepted for API parity with Tavily; ignored - by Firecrawl's crawl endpoint. - - Returns ``{"results": [...]}`` matching the shape that - :func:`tools.web_tools.web_crawl_tool`'s shared LLM-summarization - path expects. Per-page failures (policy block on redirected URL, - bad response shape) are included as items with an ``error`` field - rather than raising. - """ - try: - from tools.interrupt import is_interrupted - - if is_interrupted(): - return {"results": [{"url": url, "title": "", "content": "", "error": "Interrupted"}]} - - instructions = kwargs.get("instructions") - limit = kwargs.get("limit", 20) - - # Firecrawl's /crawl endpoint does not accept natural-language - # instructions (that's an /extract feature). Log + drop. - if instructions: - logger.info( - "Firecrawl crawl: 'instructions' parameter ignored " - "(not supported by Firecrawl /crawl)" - ) - - logger.info("Firecrawl crawl: %s (limit=%d)", url, limit) - - crawl_params = { - "limit": limit, - "scrape_options": {"formats": ["markdown"]}, - } - - # The SDK call is sync; run in a thread so we don't block the - # gateway event loop on a multi-page crawl. - crawl_result = await asyncio.to_thread( - _get_firecrawl_client().crawl, - url=url, - **crawl_params, - ) - - # CrawlJob normalization across SDK + direct + gateway shapes. - data_list: List[Any] = [] - if hasattr(crawl_result, "data"): - data_list = crawl_result.data if crawl_result.data else [] - logger.info( - "Firecrawl crawl status: %s, %d pages", - getattr(crawl_result, "status", "unknown"), - len(data_list), - ) - elif isinstance(crawl_result, dict) and "data" in crawl_result: - data_list = crawl_result.get("data", []) or [] - else: - logger.warning( - "Firecrawl crawl: unexpected result type %r", - type(crawl_result).__name__, - ) - - pages: List[Dict[str, Any]] = [] - for item in data_list: - # Pydantic model | typed object | dict — handle all shapes. - content_markdown = None - content_html = None - metadata: Any = {} - - if hasattr(item, "model_dump"): - item_dict = item.model_dump() - content_markdown = item_dict.get("markdown") - content_html = item_dict.get("html") - metadata = item_dict.get("metadata", {}) - elif hasattr(item, "__dict__"): - content_markdown = getattr(item, "markdown", None) - content_html = getattr(item, "html", None) - metadata_obj = getattr(item, "metadata", {}) - if hasattr(metadata_obj, "model_dump"): - metadata = metadata_obj.model_dump() - elif hasattr(metadata_obj, "__dict__"): - metadata = metadata_obj.__dict__ - elif isinstance(metadata_obj, dict): - metadata = metadata_obj - else: - metadata = {} - elif isinstance(item, dict): - content_markdown = item.get("markdown") - content_html = item.get("html") - metadata = item.get("metadata", {}) - - # Ensure metadata is a plain dict. - if not isinstance(metadata, dict): - if hasattr(metadata, "model_dump"): - metadata = metadata.model_dump() - elif hasattr(metadata, "__dict__"): - metadata = metadata.__dict__ - else: - metadata = {} - - page_url = metadata.get( - "sourceURL", metadata.get("url", "Unknown URL") - ) - title = metadata.get("title", "") - - # Per-page policy re-check (catches blocked redirects). - page_blocked = check_website_access(page_url) - if page_blocked: - logger.info( - "Blocked crawled page %s by rule %s", - page_blocked["host"], - page_blocked["rule"], - ) - pages.append( - { - "url": page_url, - "title": title, - "content": "", - "raw_content": "", - "error": page_blocked["message"], - "blocked_by_policy": { - "host": page_blocked["host"], - "rule": page_blocked["rule"], - "source": page_blocked["source"], - }, - } - ) - continue - - content = content_markdown or content_html or "" - pages.append( - { - "url": page_url, - "title": title, - "content": content, - "raw_content": content, - "metadata": metadata, - } - ) - - return {"results": pages} - except ValueError as exc: - return {"results": [{"url": url, "title": "", "content": "", "error": str(exc)}]} - except ImportError as exc: - return { - "results": [ - { - "url": url, - "title": "", - "content": "", - "error": f"Firecrawl SDK not installed: {exc}", - } - ] - } - except Exception as exc: # noqa: BLE001 - logger.warning("Firecrawl crawl error: %s", exc) - return { - "results": [ - { - "url": url, - "title": "", - "content": "", - "error": f"Firecrawl crawl failed: {exc}", - } - ] - } - def get_setup_schema(self) -> Dict[str, Any]: return { "name": "Firecrawl", "badge": "paid · optional gateway", "tag": ( - "Full search + extract + crawl; supports direct API and " + "Full search + extract; supports direct API and " "Nous tool-gateway routing." ), "env_vars": [ diff --git a/plugins/web/tavily/__init__.py b/plugins/web/tavily/__init__.py index be0b21dbe78..1e0ced61d12 100644 --- a/plugins/web/tavily/__init__.py +++ b/plugins/web/tavily/__init__.py @@ -1,9 +1,4 @@ -"""Tavily web search + extract + crawl plugin — bundled, auto-loaded. - -First plugin in this codebase to advertise ``supports_crawl=True``. The -crawl method maps to Tavily's ``/crawl`` endpoint, which accepts a seed -URL plus optional instructions and extract depth. -""" +"""Tavily web search + extract plugin — bundled, auto-loaded.""" from __future__ import annotations diff --git a/plugins/web/tavily/provider.py b/plugins/web/tavily/provider.py index 50e15973fb3..fe161a4a096 100644 --- a/plugins/web/tavily/provider.py +++ b/plugins/web/tavily/provider.py @@ -1,33 +1,24 @@ -"""Tavily web search + content extraction + crawl — plugin form. +"""Tavily web search + content extraction — plugin form. -Subclasses :class:`agent.web_search_provider.WebSearchProvider`. Three +Subclasses :class:`agent.web_search_provider.WebSearchProvider`. Two capabilities advertised: - ``supports_search()`` -> True (Tavily ``/search``) - ``supports_extract()`` -> True (Tavily ``/extract``) -- ``supports_crawl()`` -> True (Tavily ``/crawl``) — sync HTTP crawl; - Firecrawl also advertises ``supports_crawl=True`` (async) -All three are sync — the underlying call is ``httpx.post(...)``. The -dispatcher in :func:`tools.web_tools.web_crawl_tool` (which is itself -async) will run sync providers in a thread when appropriate. +Both are sync — the underlying call is ``httpx.post(...)``. Config keys this provider responds to:: web: search_backend: "tavily" # explicit per-capability extract_backend: "tavily" # explicit per-capability - crawl_backend: "tavily" # explicit per-capability - backend: "tavily" # shared fallback for all three + backend: "tavily" # shared fallback for both Env vars:: TAVILY_API_KEY=... # https://app.tavily.com/home (required) TAVILY_BASE_URL=... # optional override of https://api.tavily.com - -Auth note: Tavily uses ``api_key`` in the JSON body for /search and -/extract, but **also requires** ``Authorization: Bearer `` for /crawl -(body-only auth returns 401 on /crawl). The plugin handles both. """ from __future__ import annotations @@ -63,11 +54,7 @@ def _tavily_request(endpoint: str, payload: Dict[str, Any]) -> Dict[str, Any]: url = f"{base_url}/{endpoint.lstrip('/')}" logger.info("Tavily %s request to %s", endpoint, url) - # Tavily /crawl requires Bearer header auth in addition to body auth; - # /search and /extract are body-only. - headers = {"Authorization": f"Bearer {api_key}"} if endpoint.strip("/") == "crawl" else {} - - response = httpx.post(url, json=payload, headers=headers, timeout=60) + response = httpx.post(url, json=payload, timeout=60) response.raise_for_status() return response.json() @@ -90,7 +77,7 @@ def _normalize_tavily_search_results(response: Dict[str, Any]) -> Dict[str, Any] def _normalize_tavily_documents( response: Dict[str, Any], fallback_url: str = "" ) -> List[Dict[str, Any]]: - """Map Tavily ``/extract`` or ``/crawl`` response to standard documents. + """Map Tavily ``/extract`` response to standard documents. Documents follow the legacy LLM post-processing shape:: @@ -139,7 +126,7 @@ def _normalize_tavily_documents( class TavilyWebSearchProvider(WebSearchProvider): - """Tavily search + extract + crawl provider.""" + """Tavily search + extract provider.""" @property def name(self) -> str: @@ -159,9 +146,6 @@ class TavilyWebSearchProvider(WebSearchProvider): def supports_extract(self) -> bool: return True - def supports_crawl(self) -> bool: - return True - def search(self, query: str, limit: int = 5) -> Dict[str, Any]: """Execute a Tavily search.""" try: @@ -221,60 +205,11 @@ class TavilyWebSearchProvider(WebSearchProvider): for u in urls ] - def crawl(self, url: str, **kwargs: Any) -> Dict[str, Any]: - """Crawl a seed URL via Tavily's ``/crawl`` endpoint. - - Accepted kwargs (others ignored for forward compat): - - ``instructions``: str — natural-language guidance for the crawl - - ``depth``: str — ``"basic"`` (default) or ``"advanced"`` - - ``limit``: int — max pages to crawl (default 20) - - Returns ``{"results": [...]}`` shaped to match what - :func:`tools.web_tools.web_crawl_tool` post-processes. - """ - try: - from tools.interrupt import is_interrupted - - if is_interrupted(): - return {"results": [{"url": url, "title": "", "content": "", "error": "Interrupted"}]} - - instructions = kwargs.get("instructions") - depth = kwargs.get("depth", "basic") - limit = kwargs.get("limit", 20) - - logger.info("Tavily crawl: %s (depth=%s, limit=%d)", url, depth, limit) - payload: Dict[str, Any] = { - "url": url, - "limit": limit, - "extract_depth": depth, - } - if instructions: - payload["instructions"] = instructions - - raw = _tavily_request("crawl", payload) - return { - "results": _normalize_tavily_documents(raw, fallback_url=url) - } - except ValueError as exc: - return {"results": [{"url": url, "title": "", "content": "", "error": str(exc)}]} - except Exception as exc: # noqa: BLE001 - logger.warning("Tavily crawl error: %s", exc) - return { - "results": [ - { - "url": url, - "title": "", - "content": "", - "error": f"Tavily crawl failed: {exc}", - } - ] - } - def get_setup_schema(self) -> Dict[str, Any]: return { "name": "Tavily", "badge": "paid", - "tag": "Search + extract + crawl in one provider.", + "tag": "Search + extract in one provider.", "env_vars": [ { "key": "TAVILY_API_KEY", diff --git a/plugins/web/xai/provider.py b/plugins/web/xai/provider.py index a74b6a683e8..2b86238d11b 100644 --- a/plugins/web/xai/provider.py +++ b/plugins/web/xai/provider.py @@ -143,9 +143,6 @@ class XAIWebSearchProvider(WebSearchProvider): def supports_extract(self) -> bool: return False - def supports_crawl(self) -> bool: - return False - # -- Search ----------------------------------------------------------- def search(self, query: str, limit: int = 5) -> Dict[str, Any]: diff --git a/tests/integration/test_web_tools.py b/tests/integration/test_web_tools.py index 823be0392fa..f5281140066 100644 --- a/tests/integration/test_web_tools.py +++ b/tests/integration/test_web_tools.py @@ -30,7 +30,6 @@ from typing import List from tools.web_tools import ( web_search_tool, web_extract_tool, - web_crawl_tool, check_firecrawl_api_key, check_web_api_key, check_auxiliary_model, @@ -404,113 +403,6 @@ class WebToolsTester: except Exception as e: self.log_result("Extract (with LLM)", "failed", str(e)) - async def test_web_crawl(self): - """Test web crawling functionality""" - print_section("Test 4: Web Crawl") - - test_sites = [ - ("https://docs.firecrawl.dev", None, 2), # Test docs site - ("https://firecrawl.dev", None, 3), # Test main site - ] - - for url, instructions, expected_min_pages in test_sites: - try: - print(f"\n Testing crawl of: {url}") - if instructions: - print(f" Instructions: {instructions}") - else: - print(f" No instructions (general crawl)") - print(f" Expected minimum pages: {expected_min_pages}") - - # Show what's being called - if self.verbose: - print(f" Calling web_crawl_tool(url='{url}', instructions={instructions}, use_llm_processing=False)") - - result = await web_crawl_tool( - url, - instructions=instructions, - use_llm_processing=False # Disable LLM for faster testing - ) - - # Check if result is valid JSON - try: - data = json.loads(result) - except json.JSONDecodeError as e: - self.log_result(f"Crawl: {url}", "failed", f"Invalid JSON response: {e}") - if self.verbose: - print(f" Raw response (first 500 chars): {result[:500]}...") - continue - - # Check for errors - if "error" in data: - self.log_result(f"Crawl: {url}", "failed", f"API error: {data['error']}") - continue - - # Get results - results = data.get("results", []) - - if not results: - self.log_result(f"Crawl: {url}", "failed", "No pages in results array") - if self.verbose: - print(f" Full response: {json.dumps(data, indent=2)[:1000]}...") - continue - - # Analyze pages - valid_pages = 0 - empty_pages = 0 - total_content = 0 - page_details = [] - - for i, page in enumerate(results): - content = page.get("content", "") - title = page.get("title", "Untitled") - error = page.get("error") - - if error: - page_details.append(f"Page {i+1}: ERROR - {error}") - elif content: - valid_pages += 1 - content_len = len(content) - total_content += content_len - page_details.append(f"Page {i+1}: {title[:40]}... ({content_len} chars)") - else: - empty_pages += 1 - page_details.append(f"Page {i+1}: {title[:40]}... (EMPTY)") - - # Show detailed results if verbose - if self.verbose: - print(f"\n Crawl Results:") - print(f" Total pages returned: {len(results)}") - print(f" Valid pages (with content): {valid_pages}") - print(f" Empty pages: {empty_pages}") - print(f" Total content size: {total_content} characters") - print(f"\n Page Details:") - for detail in page_details[:10]: # Show first 10 pages - print(f" - {detail}") - if len(page_details) > 10: - print(f" ... and {len(page_details) - 10} more pages") - - # Determine pass/fail - if valid_pages >= expected_min_pages: - self.log_result( - f"Crawl: {url}", - "passed", - f"{valid_pages}/{len(results)} valid pages, {total_content} chars total" - ) - else: - self.log_result( - f"Crawl: {url}", - "failed", - f"Only {valid_pages} valid pages (expected >= {expected_min_pages}), {empty_pages} empty, {len(results)} total" - ) - - except Exception as e: - self.log_result(f"Crawl: {url}", "failed", f"Exception: {type(e).__name__}: {str(e)}") - if self.verbose: - import traceback - print(f" Traceback:") - print(" " + "\n ".join(traceback.format_exc().split("\n"))) - async def run_all_tests(self): """Run all tests""" self.start_time = datetime.now() @@ -533,9 +425,6 @@ class WebToolsTester: if self.test_llm: await self.test_web_extract_with_llm(urls if urls else None) - # Test crawling - await self.test_web_crawl() - # Print summary self.end_time = datetime.now() duration = (self.end_time - self.start_time).total_seconds() diff --git a/tests/plugins/web/test_web_search_provider_plugins.py b/tests/plugins/web/test_web_search_provider_plugins.py index 4c169e06e53..60f8463fd37 100644 --- a/tests/plugins/web/test_web_search_provider_plugins.py +++ b/tests/plugins/web/test_web_search_provider_plugins.py @@ -90,20 +90,17 @@ class TestBundledPluginsRegister: ] @pytest.mark.parametrize( - "plugin_name,expected_search,expected_extract,expected_crawl", + "plugin_name,expected_search,expected_extract", [ - ("brave-free", True, False, False), - ("ddgs", True, False, False), - ("searxng", True, False, False), - ("exa", True, True, False), - ("parallel", True, True, False), - ("tavily", True, True, True), - # firecrawl: search + extract + crawl. Crawl was originally - # disabled in the migration (fell through to a legacy inline - # path); the follow-up commit enabled it natively. - ("firecrawl", True, True, True), + ("brave-free", True, False), + ("ddgs", True, False), + ("searxng", True, False), + ("exa", True, True), + ("parallel", True, True), + ("tavily", True, True), + ("firecrawl", True, True), # xai: search-only via Grok's agentic web_search tool. - ("xai", True, False, False), + ("xai", True, False), ], ) def test_capability_flags_match_spec( @@ -111,7 +108,6 @@ class TestBundledPluginsRegister: plugin_name: str, expected_search: bool, expected_extract: bool, - expected_crawl: bool, ) -> None: _ensure_plugins_loaded() from agent.web_search_registry import get_provider @@ -120,7 +116,6 @@ class TestBundledPluginsRegister: assert provider is not None, f"plugin {plugin_name!r} not registered" assert provider.supports_search() is expected_search assert provider.supports_extract() is expected_extract - assert provider.supports_crawl() is expected_crawl @pytest.mark.parametrize( "plugin_name", @@ -457,38 +452,6 @@ class TestErrorResponseShapes: if result: # if anything came back, it should be an error entry assert "error" in result[0] - def test_tavily_crawl_returns_error_dict_when_unconfigured(self) -> None: - _ensure_plugins_loaded() - from agent.web_search_registry import get_provider - - p = get_provider("tavily") - assert p is not None - result = p.crawl("https://example.com") - assert isinstance(result, dict) - assert "results" in result - assert isinstance(result["results"], list) - if result["results"]: - assert "error" in result["results"][0] - - def test_firecrawl_crawl_returns_error_dict_when_unconfigured(self): - """firecrawl crawl is async (wraps SDK in to_thread); error must be - surfaced via the per-page result shape, not raised.""" - _ensure_plugins_loaded() - from agent.web_search_registry import get_provider - - p = get_provider("firecrawl") - assert p is not None - assert inspect.iscoroutinefunction(p.crawl) - result = asyncio.run(p.crawl("https://example.com")) - assert isinstance(result, dict) - assert "results" in result - assert isinstance(result["results"], list) - # Without FIRECRAWL_API_KEY, the plugin's _get_firecrawl_client() - # raises ValueError which is caught and returned as a per-page error. - assert len(result["results"]) >= 1 - assert "error" in result["results"][0] - assert result["results"][0]["url"] == "https://example.com" - def test_firecrawl_config_error_points_paid_users_to_nous_subscription(self, monkeypatch): from plugins.web.firecrawl import provider as firecrawl_provider diff --git a/tests/tools/test_web_providers.py b/tests/tools/test_web_providers.py index c94b5134ca3..b8f175a68f2 100644 --- a/tests/tools/test_web_providers.py +++ b/tests/tools/test_web_providers.py @@ -29,7 +29,7 @@ class TestWebProviderABCs: in-tree ABCs at ``tools.web_providers.base`` (separate ``WebSearchProvider`` + ``WebExtractProvider``) were deleted in the same PR — providers now advertise capabilities via - ``supports_search() / supports_extract() / supports_crawl()`` flags. + ``supports_search() / supports_extract()`` flags. """ def test_cannot_instantiate_abc_directly(self): @@ -65,7 +65,6 @@ class TestWebProviderABCs: assert d.is_available() is True assert d.supports_search() is True assert d.supports_extract() is False # default - assert d.supports_crawl() is False # default assert d.search("test")["success"] is True def test_concrete_multi_capability_provider_works(self): @@ -89,27 +88,19 @@ class TestWebProviderABCs: def supports_extract(self) -> bool: return True - def supports_crawl(self) -> bool: - return True - def search(self, query: str, limit: int = 5) -> Dict[str, Any]: return {"success": True, "data": {"web": []}} def extract(self, urls: List[str], **kwargs: Any) -> List[Dict[str, Any]]: return [{"url": urls[0], "content": "x"}] - def crawl(self, url: str, **kwargs: Any) -> Dict[str, Any]: - return {"results": [{"url": url, "content": "x"}]} - d = Dummy() assert d.supports_search() is True assert d.supports_extract() is True - assert d.supports_crawl() is True assert d.extract(["https://example.com"])[0]["url"] == "https://example.com" - assert d.crawl("https://example.com")["results"][0]["url"] == "https://example.com" - def test_search_only_provider_skips_extract_and_crawl(self): - """Search-only providers don't have to implement extract() / crawl().""" + def test_search_only_provider_skips_extract(self): + """Search-only providers don't have to implement extract().""" from agent.web_search_provider import WebSearchProvider class SearchOnly(WebSearchProvider): @@ -130,13 +121,12 @@ class TestWebProviderABCs: def search(self, query: str, limit: int = 5) -> Dict[str, Any]: return {"success": True, "data": {"web": []}} - # Should instantiate fine — extract/crawl have default - # supports_*() returning False and aren't required to be - # overridden when not advertised. + # Should instantiate fine — extract has default supports_*() + # returning False and isn't required to be overridden when not + # advertised. s = SearchOnly() assert s.supports_search() is True assert s.supports_extract() is False - assert s.supports_crawl() is False # --------------------------------------------------------------------------- @@ -322,24 +312,3 @@ class TestUnconfiguredErrorEnvelopeParity: # No per-result burying assert "results" not in result - def test_unconfigured_crawl_emits_top_level_error(self, monkeypatch): - """``web_crawl_tool`` with no creds returns ``{"success": False, "error": "web_crawl requires Firecrawl..."}`` - — the dispatcher gates on ``provider.is_available()`` BEFORE - delegating to the plugin so pre-config errors don't get wrapped - into ``results[]``. - """ - import asyncio - import json - from tools import web_tools - - self._clear_web_creds(monkeypatch) - monkeypatch.setattr(web_tools, "_firecrawl_client", None, raising=False) - monkeypatch.setattr(web_tools, "_firecrawl_client_config", None, raising=False) - monkeypatch.setattr(web_tools, "_load_web_config", lambda: {}) - - result = json.loads(asyncio.run(web_tools.web_crawl_tool("https://example.com", use_llm_processing=False))) - assert result.get("success") is False - assert "error" in result, f"expected top-level 'error' key, got {result}" - assert "web_crawl requires Firecrawl" in result["error"] - # Crucially: no per-page burying - assert "results" not in result diff --git a/tests/tools/test_web_providers_brave_free.py b/tests/tools/test_web_providers_brave_free.py index bd09dc5a4cd..a75b9d38e4f 100644 --- a/tests/tools/test_web_providers_brave_free.py +++ b/tests/tools/test_web_providers_brave_free.py @@ -8,7 +8,7 @@ Covers: - _is_backend_available("brave-free") integration - _get_backend() recognizes "brave-free" as a valid configured backend - check_web_api_key() includes brave-free in availability check -- web_extract / web_crawl return search-only errors when brave-free is active +- web_extract returns a search-only error when brave-free is active """ from __future__ import annotations @@ -238,7 +238,7 @@ class TestBraveFreeBackendWiring: # --------------------------------------------------------------------------- -# brave-free is search-only: web_extract / web_crawl return clear errors +# brave-free is search-only: web_extract returns a clear error # --------------------------------------------------------------------------- @@ -269,23 +269,3 @@ class TestBraveFreeSearchOnlyErrors: assert result["success"] is False assert "search-only" in result["error"].lower() assert "brave" in result["error"].lower() - - def test_web_crawl_returns_search_only_error(self, monkeypatch): - import asyncio - from tools import web_tools - - monkeypatch.setattr(web_tools, "_load_web_config", lambda: {"backend": "brave-free"}) - monkeypatch.setenv("BRAVE_SEARCH_API_KEY", "BSAkey123") - monkeypatch.setattr(web_tools, "_is_tool_gateway_ready", lambda: False) - monkeypatch.setattr(web_tools, "check_firecrawl_api_key", lambda: False) - monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) - monkeypatch.setattr(web_tools, "check_website_access", lambda url: None) - monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False, raising=False) - - result_str = asyncio.get_event_loop().run_until_complete( - web_tools.web_crawl_tool("https://example.com") - ) - result = json.loads(result_str) - assert result["success"] is False - assert "search-only" in result["error"].lower() - assert "brave" in result["error"].lower() diff --git a/tests/tools/test_web_providers_ddgs.py b/tests/tools/test_web_providers_ddgs.py index 465b608c90a..a2fdb1e1e76 100644 --- a/tests/tools/test_web_providers_ddgs.py +++ b/tests/tools/test_web_providers_ddgs.py @@ -5,7 +5,7 @@ Covers: - DDGSWebSearchProvider.search() — happy path, missing package, runtime error - Result normalization (title, url, description, position) - _is_backend_available("ddgs") / _get_backend() integration -- web_extract / web_crawl return search-only errors when ddgs is active +- web_extract returns a search-only error when ddgs is active """ from __future__ import annotations @@ -209,7 +209,7 @@ class TestDDGSBackendWiring: # --------------------------------------------------------------------------- -# ddgs is search-only: web_extract / web_crawl return clear errors +# ddgs is search-only: web_extract returns a clear error # --------------------------------------------------------------------------- @@ -240,23 +240,3 @@ class TestDDGSSearchOnlyErrors: assert result["success"] is False assert "search-only" in result["error"].lower() assert "duckduckgo" in result["error"].lower() or "ddgs" in result["error"].lower() - - def test_web_crawl_returns_search_only_error(self, monkeypatch): - import asyncio - from tools import web_tools - - monkeypatch.setattr(web_tools, "_load_web_config", lambda: {"backend": "ddgs"}) - monkeypatch.setattr(web_tools, "_ddgs_package_importable", lambda: True) - monkeypatch.setattr(web_tools, "_is_tool_gateway_ready", lambda: False) - monkeypatch.setattr(web_tools, "check_firecrawl_api_key", lambda: False) - monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) - monkeypatch.setattr(web_tools, "check_website_access", lambda url: None) - monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False, raising=False) - - result_str = asyncio.get_event_loop().run_until_complete( - web_tools.web_crawl_tool("https://example.com") - ) - result = json.loads(result_str) - assert result["success"] is False - assert "search-only" in result["error"].lower() - assert "duckduckgo" in result["error"].lower() or "ddgs" in result["error"].lower() diff --git a/tests/tools/test_web_providers_searxng.py b/tests/tools/test_web_providers_searxng.py index 8a5247f7beb..d237e682973 100644 --- a/tests/tools/test_web_providers_searxng.py +++ b/tests/tools/test_web_providers_searxng.py @@ -296,7 +296,7 @@ class TestCheckWebApiKey: # --------------------------------------------------------------------------- -# searxng-only: web_extract and web_crawl return clear errors +# searxng-only: web_extract returns a clear error # --------------------------------------------------------------------------- @@ -312,26 +312,6 @@ class TestSearXNGOnlyExtractCrawlErrors: from agent.web_search_registry import _reset_for_tests _reset_for_tests() - def test_web_crawl_searxng_returns_clear_error(self, monkeypatch): - import asyncio - from tools import web_tools - - monkeypatch.setattr(web_tools, "_load_web_config", lambda: {"backend": "searxng"}) - monkeypatch.setenv("SEARXNG_URL", "http://localhost:8080") - monkeypatch.setattr(web_tools, "_is_tool_gateway_ready", lambda: False) - monkeypatch.setattr(web_tools, "check_firecrawl_api_key", lambda: False) - monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) - monkeypatch.setattr(web_tools, "check_website_access", lambda url: None) - monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False, raising=False) - - import json - result_str = asyncio.get_event_loop().run_until_complete( - web_tools.web_crawl_tool("https://example.com") - ) - result = json.loads(result_str) - assert result["success"] is False - assert "search-only" in result["error"].lower() or "SearXNG" in result["error"] - def test_web_extract_searxng_returns_clear_error(self, monkeypatch): import asyncio from tools import web_tools diff --git a/tests/tools/test_web_providers_xai.py b/tests/tools/test_web_providers_xai.py index d5a3deaf689..2a6f0c63b81 100644 --- a/tests/tools/test_web_providers_xai.py +++ b/tests/tools/test_web_providers_xai.py @@ -66,7 +66,6 @@ class TestXAIProviderIdentity: p = XAIWebSearchProvider() assert p.supports_search() is True assert p.supports_extract() is False - assert p.supports_crawl() is False def test_display_name(self): from plugins.web.xai.provider import XAIWebSearchProvider diff --git a/tests/tools/test_web_tools_tavily.py b/tests/tools/test_web_tools_tavily.py index b8034efa064..de820794965 100644 --- a/tests/tools/test_web_tools_tavily.py +++ b/tests/tools/test_web_tools_tavily.py @@ -3,8 +3,8 @@ Coverage: _tavily_request() — API key handling, endpoint construction, error propagation. _normalize_tavily_search_results() — search response normalization. - _normalize_tavily_documents() — extract/crawl response normalization, failed_results. - web_search_tool / web_extract_tool / web_crawl_tool — Tavily dispatch paths. + _normalize_tavily_documents() — extract response normalization, failed_results. + web_search_tool / web_extract_tool — Tavily dispatch paths. """ import json @@ -225,62 +225,3 @@ class TestWebExtractTavily: assert len(result["results"]) == 1 assert result["results"][0]["url"] == "https://example.com" - -# ─── web_crawl_tool (Tavily dispatch) ───────────────────────────────────────── - -class TestWebCrawlTavily: - """Test web_crawl_tool dispatch to Tavily.""" - - _register_providers = staticmethod(register_all_web_providers) - - @pytest.fixture(autouse=True) - def _populate_web_registry(self): - self._register_providers() - yield - from agent.web_search_registry import _reset_for_tests - _reset_for_tests() - - def test_crawl_dispatches_to_tavily(self): - mock_response = MagicMock() - mock_response.json.return_value = { - "results": [ - {"url": "https://example.com/page1", "raw_content": "Page 1 content", "title": "Page 1"}, - {"url": "https://example.com/page2", "raw_content": "Page 2 content", "title": "Page 2"}, - ] - } - mock_response.raise_for_status = MagicMock() - - with patch("tools.web_tools._get_backend", return_value="tavily"), \ - patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \ - patch("tools.web_tools.httpx.post", return_value=mock_response), \ - patch("tools.web_tools.check_website_access", return_value=None), \ - patch("tools.web_tools.is_safe_url", return_value=True), \ - patch("tools.interrupt.is_interrupted", return_value=False): - from tools.web_tools import web_crawl_tool - result = json.loads(asyncio.get_event_loop().run_until_complete( - web_crawl_tool("https://example.com", use_llm_processing=False) - )) - assert "results" in result - assert len(result["results"]) == 2 - assert result["results"][0]["title"] == "Page 1" - - def test_crawl_sends_instructions(self): - """Instructions are included in the Tavily crawl payload.""" - mock_response = MagicMock() - mock_response.json.return_value = {"results": []} - mock_response.raise_for_status = MagicMock() - - with patch("tools.web_tools._get_backend", return_value="tavily"), \ - patch.dict(os.environ, {"TAVILY_API_KEY": "tvly-test"}), \ - patch("tools.web_tools.httpx.post", return_value=mock_response) as mock_post, \ - patch("tools.web_tools.check_website_access", return_value=None), \ - patch("tools.web_tools.is_safe_url", return_value=True), \ - patch("tools.interrupt.is_interrupted", return_value=False): - from tools.web_tools import web_crawl_tool - asyncio.get_event_loop().run_until_complete( - web_crawl_tool("https://example.com", instructions="Find docs", use_llm_processing=False) - ) - call_kwargs = mock_post.call_args - payload = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") - assert payload["instructions"] == "Find docs" - assert payload["url"] == "https://example.com" diff --git a/tests/tools/test_website_policy.py b/tests/tools/test_website_policy.py index 5a163b7dc9e..37257ad4017 100644 --- a/tests/tools/test_website_policy.py +++ b/tests/tools/test_website_policy.py @@ -350,7 +350,7 @@ def test_browser_navigate_allows_when_shared_file_missing(monkeypatch, tmp_path) class TestWebToolPolicy: - """Tests that exercise web_extract_tool / web_crawl_tool with website-policy gates. + """Tests that exercise web_extract_tool with website-policy gates. These tests need the bundled web providers to be registered in the agent.web_search_registry so the tool dispatchers can find an active @@ -376,8 +376,7 @@ class TestWebToolPolicy: monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) # The per-URL website-policy gate moved into the firecrawl plugin's # extract() during the web-provider migration. Patch it at the new - # location; the dispatcher-level gate (used by web_crawl_tool's - # pre-flight) still lives on tools.web_tools. + # location. monkeypatch.setattr( firecrawl_provider, "check_website_access", @@ -445,96 +444,6 @@ class TestWebToolPolicy: assert result["results"][0]["content"] == "" assert result["results"][0]["blocked_by_policy"]["rule"] == "blocked.test" - @pytest.mark.asyncio - async def test_web_crawl_short_circuits_blocked_url(self, monkeypatch): - from tools import web_tools - - # web_crawl_tool checks for Firecrawl env before website policy - monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") - # Allow test URLs past SSRF check so website policy is what gets tested - monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) - # The dispatcher-level (seed-URL) policy gate still lives on web_tools. - # No per-page gate runs in this test because the dispatcher returns - # immediately when the seed is blocked, before delegating to the plugin. - monkeypatch.setattr( - web_tools, - "check_website_access", - lambda url: { - "host": "blocked.test", - "rule": "blocked.test", - "source": "config", - "message": "Blocked by website policy", - }, - ) - # If the dispatcher ever reaches the firecrawl plugin's crawl(), the test - # fails — pin the plugin module's client lookup so we'd notice. - from plugins.web.firecrawl import provider as firecrawl_provider - monkeypatch.setattr( - firecrawl_provider, - "_get_firecrawl_client", - lambda: pytest.fail("firecrawl plugin should not run for blocked crawl URL"), - ) - monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) - - result = json.loads(await web_tools.web_crawl_tool("https://blocked.test", use_llm_processing=False)) - - assert result["results"][0]["url"] == "https://blocked.test" - assert result["results"][0]["blocked_by_policy"]["rule"] == "blocked.test" - - @pytest.mark.asyncio - async def test_web_crawl_blocks_redirected_final_url(self, monkeypatch): - from tools import web_tools - from plugins.web.firecrawl import provider as firecrawl_provider - - # Force the firecrawl plugin to be the active crawl provider. - monkeypatch.setenv("FIRECRAWL_API_KEY", "fake-key") - # Allow test URLs past SSRF check so website policy is what gets tested - monkeypatch.setattr(web_tools, "is_safe_url", lambda url: True) - - def fake_check(url): - # Dispatcher seed-URL gate (web_tools.check_website_access call) - # and plugin per-page gate (firecrawl_provider.check_website_access - # call) both flow through this single fake_check. - if url == "https://allowed.test": - return None - if url == "https://blocked.test/final": - return { - "host": "blocked.test", - "rule": "blocked.test", - "source": "config", - "message": "Blocked by website policy", - } - pytest.fail(f"unexpected URL checked: {url}") - - class FakeCrawlClient: - def crawl(self, url, **kwargs): - return { - "data": [ - { - "markdown": "secret crawl content", - "metadata": { - "title": "Redirected crawl page", - "sourceURL": "https://blocked.test/final", - }, - } - ] - } - - # After PR #25182 follow-up: per-page policy gate lives in - # plugins.web.firecrawl.provider.crawl(). Patch the gate + client at - # the plugin location. The dispatcher-level (seed) gate also reads - # web_tools.check_website_access — patch both. - monkeypatch.setattr(web_tools, "check_website_access", fake_check) - monkeypatch.setattr(firecrawl_provider, "check_website_access", fake_check) - monkeypatch.setattr(firecrawl_provider, "_get_firecrawl_client", lambda: FakeCrawlClient()) - monkeypatch.setattr("tools.interrupt.is_interrupted", lambda: False) - - result = json.loads(await web_tools.web_crawl_tool("https://allowed.test", use_llm_processing=False)) - - assert result["results"][0]["content"] == "" - assert result["results"][0]["error"] == "Blocked by website policy" - assert result["results"][0]["blocked_by_policy"]["rule"] == "blocked.test" - def test_check_website_access_fails_open_on_malformed_config(tmp_path, monkeypatch): """Malformed config with default path should fail open (return None), not crash.""" diff --git a/tools/web_tools.py b/tools/web_tools.py index a39fa482a35..cfe722c2ba7 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -10,13 +10,12 @@ for Nous Subscribers only. Available tools: - web_search_tool: Search the web for information - web_extract_tool: Extract content from specific web pages -- web_crawl_tool: Crawl websites with specific instructions Backend compatibility: - Exa: https://exa.ai (search, extract) -- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract, crawl; direct or derived firecrawl-gateway. for Nous Subscribers) +- Firecrawl: https://docs.firecrawl.dev/introduction (search, extract; direct or derived firecrawl-gateway. for Nous Subscribers) - Parallel: https://docs.parallel.ai (search, extract) -- Tavily: https://tavily.com (search, extract, crawl) +- Tavily: https://tavily.com (search, extract) LLM Processing: - Uses OpenRouter API with Gemini 3 Flash Preview for intelligent content extraction @@ -28,16 +27,13 @@ Debug Mode: - Captures all tool calls, results, and compression metrics Usage: - from web_tools import web_search_tool, web_extract_tool, web_crawl_tool + from web_tools import web_search_tool, web_extract_tool # Search the web results = web_search_tool("Python machine learning libraries", limit=3) # Extract content from URLs content = web_extract_tool(["https://example.com"], format="markdown") - - # Crawl a website - crawl_data = web_crawl_tool("example.com", "Find contact information") """ import json @@ -371,7 +367,7 @@ async def process_content_with_llm( if content_len > MAX_CONTENT_SIZE: size_mb = content_len / 1_000_000 logger.warning("Content too large (%.1fMB > 2MB limit). Refusing to process.", size_mb) - return f"[Content too large to process: {size_mb:.1f}MB. Try using web_crawl with specific extraction instructions, or search for a more focused source.]" + return f"[Content too large to process: {size_mb:.1f}MB. Try a more focused source URL.]" # Skip processing if content is too short if content_len < min_length: @@ -1134,239 +1130,6 @@ async def web_extract_tool( return tool_error(error_msg) -async def web_crawl_tool( - url: str, - instructions: str = None, - depth: str = "basic", - use_llm_processing: bool = True, - model: Optional[str] = None, - min_length: int = DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION -) -> str: - """ - Crawl a website with specific instructions using available crawling API backend. - - This function provides a generic interface for web crawling that can work - with multiple backends. Currently uses Firecrawl. - - Args: - url (str): The base URL to crawl (can include or exclude https://) - instructions (str): Instructions for what to crawl/extract using LLM intelligence (optional) - depth (str): Depth of extraction ("basic" or "advanced", default: "basic") - use_llm_processing (bool): Whether to process content with LLM for summarization (default: True) - model (Optional[str]): The model to use for LLM processing (defaults to current auxiliary backend model) - min_length (int): Minimum content length to trigger LLM processing (default: 5000) - - Returns: - str: JSON string containing crawled content. If LLM processing is enabled and successful, - the 'content' field will contain the processed markdown summary instead of raw content. - Each page is processed individually. - - Raises: - Exception: If crawling fails or API key is not set - """ - debug_call_data = { - "parameters": { - "url": url, - "instructions": instructions, - "depth": depth, - "use_llm_processing": use_llm_processing, - "model": model, - "min_length": min_length - }, - "error": None, - "pages_crawled": 0, - "pages_processed_with_llm": 0, - "original_response_size": 0, - "final_response_size": 0, - "compression_metrics": [], - "processing_applied": [] - } - - try: - effective_model = model or _get_default_summarizer_model() - auxiliary_available = check_auxiliary_model() - backend = _get_backend() - - # Tavily (and any future plugin advertising supports_crawl=True) - # dispatches through agent.web_search_registry. The crawl response - # shape — {"results": [{"url", "title", "content", ...}]} — is then - # post-processed by the shared LLM-summarization path below. - from agent.web_search_registry import ( - get_active_crawl_provider, - get_provider as _wsp_get_provider, - ) - - crawl_provider = _wsp_get_provider(backend) if backend else None - if crawl_provider is not None and not crawl_provider.supports_crawl(): - # When the configured provider is search-only AND cannot - # extract URLs either (brave-free / ddgs / searxng), surface a - # typed "search-only" error rather than silently switching to - # a different crawl backend. When the provider supports extract - # but not crawl (e.g. firecrawl), fall through to the legacy - # firecrawl-via-extract path below. - if not crawl_provider.supports_extract(): - return json.dumps( - { - "success": False, - "error": ( - f"{crawl_provider.display_name} is a search-only " - "backend and cannot crawl URLs. " - "Set FIRECRAWL_API_KEY for crawling, or use " - "web_search instead." - ), - }, - ensure_ascii=False, - ) - crawl_provider = None # let legacy firecrawl path handle it - if crawl_provider is None: - crawl_provider = get_active_crawl_provider() - - # Mirror main's upstream availability gate: when the resolved - # provider is configured-but-unavailable (e.g. firecrawl without - # FIRECRAWL_API_KEY), short-circuit BEFORE we dispatch so the - # error envelope matches the legacy top-level shape - # ``{"success": False, "error": "..."}`` rather than burying the - # configuration message inside a per-page ``results[]`` entry. - if crawl_provider is not None and not crawl_provider.is_available(): - return json.dumps( - { - "success": False, - "error": ( - "web_crawl requires Firecrawl. Set FIRECRAWL_API_KEY, " - f"FIRECRAWL_API_URL{_firecrawl_backend_help_suffix()}, " - "or use web_search + web_extract instead." - ), - }, - ensure_ascii=False, - ) - - if crawl_provider is not None: - # Ensure URL has protocol - if not url.startswith(('http://', 'https://')): - url = f'https://{url}' - - # SSRF protection — block private/internal addresses - if not is_safe_url(url): - return json.dumps({"results": [{"url": url, "title": "", "content": "", - "error": "Blocked: URL targets a private or internal network address"}]}, ensure_ascii=False) - - # Website policy check - blocked = check_website_access(url) - if blocked: - logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"]) - return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"], - "blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False) - - from tools.interrupt import is_interrupted as _is_int - if _is_int(): - return tool_error("Interrupted", success=False) - - logger.info("Web crawl via %s: %s", crawl_provider.name, url) - - # Async-or-sync dispatch — Tavily's crawl is sync, but a future - # async-crawl provider works transparently. - import inspect - crawl_kwargs = {"depth": depth, "limit": 20} - if instructions: - crawl_kwargs["instructions"] = instructions - - if inspect.iscoroutinefunction(crawl_provider.crawl): - response = await crawl_provider.crawl(url, **crawl_kwargs) - else: - response = await asyncio.to_thread( - crawl_provider.crawl, url, **crawl_kwargs - ) - - # Provider returns {"results": [...]} matching what the shared - # LLM post-processing below expects. - if not isinstance(response, dict): - response = {"results": []} - response.setdefault("results", []) - - # Fall through to the shared LLM processing and trimming below - # (skip the Firecrawl-specific crawl logic) - pages_crawled = len(response.get('results', [])) - logger.info("Crawled %d pages", pages_crawled) - debug_call_data["pages_crawled"] = pages_crawled - debug_call_data["original_response_size"] = len(json.dumps(response)) - - # Process each result with LLM if enabled - if use_llm_processing and auxiliary_available: - logger.info("Processing crawled content with LLM (parallel)...") - debug_call_data["processing_applied"].append("llm_processing") - - async def _process_tavily_crawl(result): - page_url = result.get('url', 'Unknown URL') - title = result.get('title', '') - content = result.get('content', '') - if not content: - return result, None, "no_content" - original_size = len(content) - processed = await process_content_with_llm(content, page_url, title, effective_model, min_length) - if processed: - result['raw_content'] = content - result['content'] = processed - metrics = {"url": page_url, "original_size": original_size, "processed_size": len(processed), - "compression_ratio": len(processed) / original_size if original_size else 1.0, "model_used": effective_model} - return result, metrics, "processed" - metrics = {"url": page_url, "original_size": original_size, "processed_size": original_size, - "compression_ratio": 1.0, "model_used": None, "reason": "content_too_short"} - return result, metrics, "too_short" - - tasks = [_process_tavily_crawl(r) for r in response.get('results', [])] - # Use return_exceptions=True so a single task failure does not - # discard all other successfully processed crawl results. - processed_results = await asyncio.gather(*tasks, return_exceptions=True) - for result_item in processed_results: - if isinstance(result_item, BaseException): - logger.warning("Tavily crawl processing task failed: %s", result_item) - continue - result, metrics, status = result_item - if status == "processed": - debug_call_data["compression_metrics"].append(metrics) - debug_call_data["pages_processed_with_llm"] += 1 - - if use_llm_processing and not auxiliary_available: - logger.warning("LLM processing requested but no auxiliary model available, returning raw content") - debug_call_data["processing_applied"].append("llm_processing_unavailable") - - trimmed_results = [{"url": r.get("url", ""), "title": r.get("title", ""), "content": r.get("content", ""), "error": r.get("error"), - **({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {})} for r in response.get("results", [])] - result_json = json.dumps({"results": trimmed_results}, indent=2, ensure_ascii=False) - cleaned_result = clean_base64_images(result_json) - debug_call_data["final_response_size"] = len(cleaned_result) - _debug.log_call("web_crawl_tool", debug_call_data) - _debug.save() - return cleaned_result - - # No registered provider supports crawl AND no crawl-capable plugin - # is available. Surface a typed error pointing the user at the two - # crawl-capable providers (Firecrawl + Tavily). - return json.dumps( - { - "success": False, - "error": ( - "web_crawl has no available backend. " - "Set FIRECRAWL_API_KEY (or FIRECRAWL_API_URL for " - f"self-hosted){_firecrawl_backend_help_suffix()}, " - "or set TAVILY_API_KEY for Tavily. " - "Alternatively use web_search + web_extract instead." - ), - }, - ensure_ascii=False, - ) - - except Exception as e: - error_msg = f"Error crawling website: {str(e)}" - logger.debug("%s", error_msg) - - debug_call_data["error"] = error_msg - _debug.log_call("web_crawl_tool", debug_call_data) - _debug.save() - - return tool_error(error_msg) - - # Convenience function to check Firecrawl credentials def check_web_api_key() -> bool: """Check whether the configured web backend is available.""" @@ -1456,16 +1219,15 @@ if __name__ == "__main__": print("🐛 Debug mode disabled (set WEB_TOOLS_DEBUG=true to enable)") print("\nBasic usage:") - print(" from web_tools import web_search_tool, web_extract_tool, web_crawl_tool") + print(" from web_tools import web_search_tool, web_extract_tool") print(" import asyncio") print("") print(" # Search (synchronous)") print(" results = web_search_tool('Python tutorials')") print("") - print(" # Extract and crawl (asynchronous)") + print(" # Extract (asynchronous)") print(" async def main():") print(" content = await web_extract_tool(['https://example.com'])") - print(" crawl_data = await web_crawl_tool('example.com', 'Find docs')") print(" asyncio.run(main())") if nous_available: @@ -1474,9 +1236,8 @@ if __name__ == "__main__": print(" content = await web_extract_tool(['https://python.org/about/'])") print("") print(" # Customize processing parameters") - print(" crawl_data = await web_crawl_tool(") - print(" 'docs.python.org',") - print(" 'Find key concepts',") + print(" content = await web_extract_tool(") + print(" ['https://docs.python.org'],") print(" model='google/gemini-3-flash-preview',") print(" min_length=3000") print(" )") diff --git a/tools/website_policy.py b/tools/website_policy.py index 63fb7571007..c621dcbf3c0 100644 --- a/tools/website_policy.py +++ b/tools/website_policy.py @@ -29,7 +29,7 @@ _DEFAULT_WEBSITE_BLOCKLIST = { } # Cache: parsed policy + timestamp. Avoids re-reading config.yaml on every -# URL check (a web_crawl with 50 pages would otherwise mean 51 YAML parses). +# URL check (a multi-URL extract with 50 pages would otherwise mean 51 YAML parses). _CACHE_TTL_SECONDS = 30.0 _cache_lock = threading.Lock() _cached_policy: Optional[Dict[str, Any]] = None diff --git a/website/docs/developer-guide/web-search-provider-plugin.md b/website/docs/developer-guide/web-search-provider-plugin.md index 85387f50070..ba44af8f5f8 100644 --- a/website/docs/developer-guide/web-search-provider-plugin.md +++ b/website/docs/developer-guide/web-search-provider-plugin.md @@ -80,9 +80,6 @@ class MyBackendWebSearchProvider(WebSearchProvider): def supports_extract(self) -> bool: return False - def supports_crawl(self) -> bool: - return False - def search(self, query: str, limit: int = 5) -> Dict[str, Any]: import httpx @@ -157,12 +154,10 @@ Full contract in `agent/web_search_provider.py`. Methods you may override: | `is_available()` | ✅ | — | Cheap availability gate — env vars, optional deps | | `supports_search()` | — | `True` | Capability flag for `web_search` routing | | `supports_extract()` | — | `False` | Capability flag for `web_extract` routing | -| `supports_crawl()` | — | `False` | Capability flag for deep-crawl modes | | `search(query, limit)` | conditional | raises | Required when `supports_search()` returns `True` | | `extract(urls, **kwargs)` | conditional | raises | Required when `supports_extract()` returns `True` | -| `crawl(url, **kwargs)` | conditional | raises | Required when `supports_crawl()` returns `True` | -Providers can advertise multiple capabilities from a single class — Firecrawl, Tavily, Exa, and Parallel all implement all three of search/extract/crawl. Brave Search and DDGS are search-only; SearXNG is search-only with a documented "pair me with an extract provider" workflow. +Providers can advertise multiple capabilities from a single class — Firecrawl, Tavily, Exa, and Parallel all implement both search and extract. Brave Search and DDGS are search-only; SearXNG is search-only with a documented "pair me with an extract provider" workflow. ## Response shape diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 96205b922f8..64506bc4e5c 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -1429,7 +1429,7 @@ Environment scrubbing (strips `*_API_KEY`, `*_TOKEN`, `*_SECRET`, `*_PASSWORD`, ## Web Search Backends -The `web_search`, `web_extract`, and `web_crawl` tools support five backend providers. Configure the backend in `config.yaml` or via `hermes tools`: +The `web_search` and `web_extract` tools support five backend providers. Configure the backend in `config.yaml` or via `hermes tools`: ```yaml web: @@ -1440,17 +1440,17 @@ web: extract_backend: "firecrawl" ``` -| Backend | Env Var | Search | Extract | Crawl | -|---------|---------|--------|---------|-------| -| **Firecrawl** (default) | `FIRECRAWL_API_KEY` | ✔ | ✔ | ✔ | -| **SearXNG** | `SEARXNG_URL` | ✔ | — | — | -| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | — | -| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | ✔ | -| **Exa** | `EXA_API_KEY` | ✔ | ✔ | — | +| Backend | Env Var | Search | Extract | +|---------|---------|--------|---------| +| **Firecrawl** (default) | `FIRECRAWL_API_KEY` | ✔ | ✔ | +| **SearXNG** | `SEARXNG_URL` | ✔ | — | +| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | +| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | +| **Exa** | `EXA_API_KEY` | ✔ | ✔ | **Backend selection:** If `web.backend` is not set, the backend is auto-detected from available API keys. If only `SEARXNG_URL` is set, SearXNG is used. If only `EXA_API_KEY` is set, Exa is used. If only `TAVILY_API_KEY` is set, Tavily is used. If only `PARALLEL_API_KEY` is set, Parallel is used. Otherwise Firecrawl is the default. -**SearXNG** is a free, self-hosted, privacy-respecting metasearch engine that queries 70+ search engines. No API key needed — just set `SEARXNG_URL` to your instance (e.g., `http://localhost:8080`). SearXNG is search-only; `web_extract` and `web_crawl` require a separate extract provider (set `web.extract_backend`). See the [Web Search setup guide](/user-guide/features/web-search) for Docker setup instructions. +**SearXNG** is a free, self-hosted, privacy-respecting metasearch engine that queries 70+ search engines. No API key needed — just set `SEARXNG_URL` to your instance (e.g., `http://localhost:8080`). SearXNG is search-only; `web_extract` requires a separate extract provider (set `web.extract_backend`). See the [Web Search setup guide](/user-guide/features/web-search) for Docker setup instructions. **Self-hosted Firecrawl:** Set `FIRECRAWL_API_URL` to point at your own instance. When a custom URL is set, the API key becomes optional (set `USE_DB_AUTHENTICATION=*** on the server to disable auth). diff --git a/website/docs/user-guide/features/web-search.md b/website/docs/user-guide/features/web-search.md index 645d1a4c629..161b91ec83c 100644 --- a/website/docs/user-guide/features/web-search.md +++ b/website/docs/user-guide/features/web-search.md @@ -1,6 +1,6 @@ --- title: Web Search & Extract -description: Search the web, extract page content, and crawl websites with multiple backend providers — including free self-hosted SearXNG. +description: Search the web and extract page content with multiple backend providers — including free self-hosted SearXNG. sidebar_label: Web Search sidebar_position: 6 --- @@ -10,22 +10,22 @@ sidebar_position: 6 Hermes Agent includes two model-callable web tools backed by multiple providers: - **`web_search`** — search the web and return ranked results -- **`web_extract`** — fetch and extract readable content from one or more URLs (with built-in deep-crawl support when the backend provides it) +- **`web_extract`** — fetch and extract readable content from one or more URLs -Both are configured through a single backend selection. Providers are chosen via `hermes tools` or set directly in `config.yaml`. Recursive crawling capabilities (Firecrawl/Tavily) are exposed through `web_extract` rather than as a separate `web_crawl` tool. +Both are configured through a single backend selection. Providers are chosen via `hermes tools` or set directly in `config.yaml`. ## Backends -| Provider | Env Var | Search | Extract | Crawl | Free tier | -|----------|---------|--------|---------|-------|-----------| -| **Firecrawl** (default) | `FIRECRAWL_API_KEY` | ✔ | ✔ | ✔ | 500 credits/mo | -| **SearXNG** | `SEARXNG_URL` | ✔ | — | — | ✔ Free (self-hosted) | -| **Brave Search (free tier)** | `BRAVE_SEARCH_API_KEY` | ✔ | — | — | 2 000 queries/mo | -| **DDGS (DuckDuckGo)** | — (no key) | ✔ | — | — | ✔ Free | -| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | ✔ | 1 000 searches/mo | -| **Exa** | `EXA_API_KEY` | ✔ | ✔ | — | 1 000 searches/mo | -| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | — | Paid | -| **xAI (Grok)** | `XAI_API_KEY` or `hermes auth login xai-oauth` | ✔ | — | — | Paid (SuperGrok or per-token) | +| Provider | Env Var | Search | Extract | Free tier | +|----------|---------|--------|---------|-----------| +| **Firecrawl** (default) | `FIRECRAWL_API_KEY` | ✔ | ✔ | 500 credits/mo | +| **SearXNG** | `SEARXNG_URL` | ✔ | — | ✔ Free (self-hosted) | +| **Brave Search (free tier)** | `BRAVE_SEARCH_API_KEY` | ✔ | — | 2 000 queries/mo | +| **DDGS (DuckDuckGo)** | — (no key) | ✔ | — | ✔ Free | +| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | 1 000 searches/mo | +| **Exa** | `EXA_API_KEY` | ✔ | ✔ | 1 000 searches/mo | +| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | Paid | +| **xAI (Grok)** | `XAI_API_KEY` or `hermes auth login xai-oauth` | ✔ | — | Paid (SuperGrok or per-token) | Brave Search, DDGS, and xAI are **search-only** — pair any of them with Firecrawl/Tavily/Exa/Parallel when you also need `web_extract`. DDGS uses the [`ddgs` Python package](https://pypi.org/project/ddgs/) under the hood; if it isn't already installed, run `pip install ddgs` (or let Hermes lazy-install it on first use). xAI runs Grok's server-side `web_search` tool on the Responses API — results are LLM-generated rather than index-backed, so titles, descriptions, and URL choice are all model output (see the [trust-model caveat](#xai-grok) below). @@ -46,7 +46,7 @@ Backends return raw page markdown, which can be huge (forum threads, docs sites, | Under 5 000 | Returned as-is — no LLM call, full markdown reaches the agent | | 5 000 – 500 000 | Single-pass summary via the `web_extract` auxiliary model, capped at ~5 000 chars of output | | 500 000 – 2 000 000 | Chunked: split into 100 k-char chunks, summarize each in parallel, then synthesize a final summary (~5 000 chars) | -| Over 2 000 000 | Refused with a hint to use `web_crawl` with focused extraction instructions or a more specific source | +| Over 2 000 000 | Refused with a hint to use a more focused source URL | The summary keeps quotes, code blocks, and key facts in their original formatting — it's a content compressor, not a paraphraser. If summarization fails or times out, Hermes falls back to the first ~5 000 chars of raw content rather than a useless error. @@ -89,7 +89,7 @@ hermes tools ### Firecrawl (default) -Full-featured search, extract, and crawl. Recommended for most users. +Full-featured search and extract. Recommended for most users. ```bash # ~/.hermes/.env @@ -113,7 +113,7 @@ When `FIRECRAWL_API_URL` is set, the API key is optional (disable server auth wi SearXNG is a privacy-respecting, open-source metasearch engine that aggregates results from 70+ search engines. **No API key required** — just point Hermes at a running SearXNG instance. -SearXNG is **search-only** — `web_extract` (including its crawl modes) requires a separate extract provider. +SearXNG is **search-only** — `web_extract` requires a separate extract provider. #### Option A — Self-host with Docker (recommended) @@ -222,7 +222,7 @@ Public instances have rate limits, variable uptime, and may disable JSON format #### Pair SearXNG with an extract provider -SearXNG handles search; you need a separate provider for `web_extract` (including any deep-crawl modes). Use the per-capability keys: +SearXNG handles search; you need a separate provider for `web_extract`. Use the per-capability keys: ```yaml # ~/.hermes/config.yaml @@ -237,7 +237,7 @@ With this config, Hermes uses SearXNG for all search queries and Firecrawl for U ### Tavily -AI-optimised search, extract, and crawl with a generous free tier. +AI-optimised search and extract with a generous free tier. ```bash # ~/.hermes/.env @@ -341,7 +341,7 @@ Use different providers for search vs extract. This lets you combine free search # ~/.hermes/config.yaml web: search_backend: "searxng" # used by web_search - extract_backend: "firecrawl" # used by web_extract (and its deep-crawl modes) + extract_backend: "firecrawl" # used by web_extract ``` When per-capability keys are empty, both fall through to `web.backend`. When `web.backend` is also empty, the backend is auto-detected from whichever API key/URL is present. diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/developer-guide/web-search-provider-plugin.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/developer-guide/web-search-provider-plugin.md index 2c1f971dfcc..739501b0376 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/developer-guide/web-search-provider-plugin.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/developer-guide/web-search-provider-plugin.md @@ -80,9 +80,6 @@ class MyBackendWebSearchProvider(WebSearchProvider): def supports_extract(self) -> bool: return False - def supports_crawl(self) -> bool: - return False - def search(self, query: str, limit: int = 5) -> Dict[str, Any]: import httpx @@ -157,12 +154,10 @@ requires_env: | `is_available()` | ✅ | — | 轻量可用性检查——环境变量、可选依赖等 | | `supports_search()` | — | `True` | `web_search` 路由的能力标志 | | `supports_extract()` | — | `False` | `web_extract` 路由的能力标志 | -| `supports_crawl()` | — | `False` | 深度爬取模式的能力标志 | | `search(query, limit)` | 条件必须 | 抛出异常 | 当 `supports_search()` 返回 `True` 时必须实现 | | `extract(urls, **kwargs)` | 条件必须 | 抛出异常 | 当 `supports_extract()` 返回 `True` 时必须实现 | -| `crawl(url, **kwargs)` | 条件必须 | 抛出异常 | 当 `supports_crawl()` 返回 `True` 时必须实现 | -提供商可以在单个类中声明多种能力——Firecrawl、Tavily、Exa 和 Parallel 均实现了搜索/提取/爬取三种能力。Brave Search 和 DDGS 仅支持搜索;SearXNG 也仅支持搜索,并有文档说明的"与提取提供商配对使用"工作流。 +提供商可以在单个类中声明多种能力——Firecrawl、Tavily、Exa 和 Parallel 均实现了搜索和提取两种能力。Brave Search 和 DDGS 仅支持搜索;SearXNG 也仅支持搜索,并有文档说明的"与提取提供商配对使用"工作流。 ## 响应格式 diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md index 441bad64619..f8a0f87b40a 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/configuration.md @@ -1411,7 +1411,7 @@ code_execution: ## Web 搜索后端 -`web_search`、`web_extract` 和 `web_crawl` 工具支持五种后端 provider。在 `config.yaml` 中或通过 `hermes tools` 配置后端: +`web_search` 和 `web_extract` 工具支持五种后端 provider。在 `config.yaml` 中或通过 `hermes tools` 配置后端: ```yaml web: @@ -1422,17 +1422,17 @@ web: extract_backend: "firecrawl" ``` -| 后端 | 环境变量 | 搜索 | 提取 | 爬取 | -|---------|---------|--------|---------|-------| -| **Firecrawl**(默认) | `FIRECRAWL_API_KEY` | ✔ | ✔ | ✔ | -| **SearXNG** | `SEARXNG_URL` | ✔ | — | — | -| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | — | -| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | ✔ | -| **Exa** | `EXA_API_KEY` | ✔ | ✔ | — | +| 后端 | 环境变量 | 搜索 | 提取 | +|---------|---------|--------|---------| +| **Firecrawl**(默认) | `FIRECRAWL_API_KEY` | ✔ | ✔ | +| **SearXNG** | `SEARXNG_URL` | ✔ | — | +| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | +| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | +| **Exa** | `EXA_API_KEY` | ✔ | ✔ | **后端选择:** 如果未设置 `web.backend`,后端从可用的 API 密钥自动检测。如果仅设置了 `SEARXNG_URL`,使用 SearXNG。如果仅设置了 `EXA_API_KEY`,使用 Exa。如果仅设置了 `TAVILY_API_KEY`,使用 Tavily。如果仅设置了 `PARALLEL_API_KEY`,使用 Parallel。否则 Firecrawl 是默认值。 -**SearXNG** 是一个免费、自托管、尊重隐私的元搜索引擎,查询 70+ 个搜索引擎。无需 API 密钥 —— 只需将 `SEARXNG_URL` 设置为您的实例(例如 `http://localhost:8080`)。SearXNG 仅限搜索;`web_extract` 和 `web_crawl` 需要单独的提取 provider(设置 `web.extract_backend`)。Docker 设置说明请参阅 [Web 搜索设置指南](/user-guide/features/web-search)。 +**SearXNG** 是一个免费、自托管、尊重隐私的元搜索引擎,查询 70+ 个搜索引擎。无需 API 密钥 —— 只需将 `SEARXNG_URL` 设置为您的实例(例如 `http://localhost:8080`)。SearXNG 仅限搜索;`web_extract` 需要单独的提取 provider(设置 `web.extract_backend`)。Docker 设置说明请参阅 [Web 搜索设置指南](/user-guide/features/web-search)。 **自托管 Firecrawl:** 设置 `FIRECRAWL_API_URL` 指向您自己的实例。设置自定义 URL 后,API 密钥变为可选(在服务器上设置 `USE_DB_AUTHENTICATION=***` 以禁用认证)。 diff --git a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/web-search.md b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/web-search.md index 3bb64b74dde..70b378bedd1 100644 --- a/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/web-search.md +++ b/website/i18n/zh-Hans/docusaurus-plugin-content-docs/current/user-guide/features/web-search.md @@ -1,6 +1,6 @@ --- title: 网页搜索与提取 -description: 通过多个后端提供商搜索网页、提取页面内容并爬取网站——包括免费的自托管 SearXNG。 +description: 通过多个后端提供商搜索网页并提取页面内容——包括免费的自托管 SearXNG。 sidebar_label: Web Search sidebar_position: 6 --- @@ -10,22 +10,22 @@ sidebar_position: 6 Hermes Agent 内置两个可供模型调用的网页工具,由多个提供商支持: - **`web_search`** — 搜索网页并返回排序结果 -- **`web_extract`** — 从一个或多个 URL 获取并提取可读内容(当后端支持时内置深度爬取功能) +- **`web_extract`** — 从一个或多个 URL 获取并提取可读内容 -两者均通过单一后端选择进行配置。提供商可通过 `hermes tools` 选择,或直接在 `config.yaml` 中设置。递归爬取功能(Firecrawl/Tavily)通过 `web_extract` 暴露,而非作为独立的 `web_crawl` 工具。 +两者均通过单一后端选择进行配置。提供商可通过 `hermes tools` 选择,或直接在 `config.yaml` 中设置。 ## 后端 -| 提供商 | 环境变量 | 搜索 | 提取 | 爬取 | 免费层级 | -|----------|---------|--------|---------|-------|-----------| -| **Firecrawl**(默认) | `FIRECRAWL_API_KEY` | ✔ | ✔ | ✔ | 500 积分/月 | -| **SearXNG** | `SEARXNG_URL` | ✔ | — | — | ✔ 免费(自托管) | -| **Brave Search(免费层级)** | `BRAVE_SEARCH_API_KEY` | ✔ | — | — | 2 000 次查询/月 | -| **DDGS (DuckDuckGo)** | —(无需密钥) | ✔ | — | — | ✔ 免费 | -| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | ✔ | 1 000 次搜索/月 | -| **Exa** | `EXA_API_KEY` | ✔ | ✔ | — | 1 000 次搜索/月 | -| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | — | 付费 | -| **xAI (Grok)** | `XAI_API_KEY` 或 `hermes auth login xai-oauth` | ✔ | — | — | 付费(SuperGrok 或按 token 计费) | +| 提供商 | 环境变量 | 搜索 | 提取 | 免费层级 | +|----------|---------|--------|---------|-----------| +| **Firecrawl**(默认) | `FIRECRAWL_API_KEY` | ✔ | ✔ | 500 积分/月 | +| **SearXNG** | `SEARXNG_URL` | ✔ | — | ✔ 免费(自托管) | +| **Brave Search(免费层级)** | `BRAVE_SEARCH_API_KEY` | ✔ | — | 2 000 次查询/月 | +| **DDGS (DuckDuckGo)** | —(无需密钥) | ✔ | — | ✔ 免费 | +| **Tavily** | `TAVILY_API_KEY` | ✔ | ✔ | 1 000 次搜索/月 | +| **Exa** | `EXA_API_KEY` | ✔ | ✔ | 1 000 次搜索/月 | +| **Parallel** | `PARALLEL_API_KEY` | ✔ | ✔ | 付费 | +| **xAI (Grok)** | `XAI_API_KEY` 或 `hermes auth login xai-oauth` | ✔ | — | 付费(SuperGrok 或按 token 计费) | Brave Search、DDGS 和 xAI 均为**仅搜索**——如果同时需要 `web_extract`,可将其中任意一个与 Firecrawl/Tavily/Exa/Parallel 配合使用。DDGS 底层使用 [`ddgs` Python 包](https://pypi.org/project/ddgs/);若尚未安装,请运行 `pip install ddgs`(或让 Hermes 在首次使用时懒加载安装)。xAI 通过 Responses API 运行 Grok 服务端的 `web_search` 工具——结果由 LLM 生成而非基于索引,因此标题、描述和 URL 选择均为模型输出(参见下方[信任模型说明](#xai-grok))。 @@ -46,7 +46,7 @@ Brave Search、DDGS 和 xAI 均为**仅搜索**——如果同时需要 `web_ext | 5 000 以下 | 原样返回——不调用 LLM,完整 markdown 直达 agent | | 5 000 – 500 000 | 通过 `web_extract` 辅助模型单次摘要,输出上限约 5 000 字符 | | 500 000 – 2 000 000 | 分块处理:拆分为 10 万字符的块,并行摘要每块,再合成最终摘要(约 5 000 字符) | -| 超过 2 000 000 | 拒绝处理,并提示使用带有针对性提取指令的 `web_crawl` 或更具体的来源 | +| 超过 2 000 000 | 拒绝处理,并提示使用更具体的来源 URL | 摘要保留引用、代码块和关键事实的原始格式——它是内容压缩器,而非改写器。如果摘要失败或超时,Hermes 会回退到原始内容的前约 5 000 字符,而非返回无用的错误信息。 @@ -89,7 +89,7 @@ hermes tools ### Firecrawl(默认) -功能完整的搜索、提取和爬取。推荐大多数用户使用。 +功能完整的搜索和提取。推荐大多数用户使用。 ```bash # ~/.hermes/.env @@ -113,7 +113,7 @@ FIRECRAWL_API_URL=http://localhost:3002 SearXNG 是一个注重隐私的开源元搜索引擎,聚合来自 70 多个搜索引擎的结果。**无需 API 密钥**——只需将 Hermes 指向一个运行中的 SearXNG 实例。 -SearXNG 为**仅搜索**——`web_extract`(包括其爬取模式)需要单独的提取提供商。 +SearXNG 为**仅搜索**——`web_extract` 需要单独的提取提供商。 #### 方案 A — 使用 Docker 自托管(推荐) @@ -222,7 +222,7 @@ SEARXNG_URL=https://searx.example.com #### 将 SearXNG 与提取提供商配合使用 -SearXNG 负责搜索;`web_extract`(包括任何深度爬取模式)需要单独的提供商。使用按能力配置的键: +SearXNG 负责搜索;`web_extract` 需要单独的提供商。使用按能力配置的键: ```yaml # ~/.hermes/config.yaml @@ -237,7 +237,7 @@ web: ### Tavily -针对 AI 优化的搜索、提取和爬取,免费层级慷慨。 +针对 AI 优化的搜索和提取,免费层级慷慨。 ```bash # ~/.hermes/.env @@ -341,7 +341,7 @@ web: # ~/.hermes/config.yaml web: search_backend: "searxng" # 由 web_search 使用 - extract_backend: "firecrawl" # 由 web_extract(及其深度爬取模式)使用 + extract_backend: "firecrawl" # 由 web_extract 使用 ``` 当按能力键为空时,两者均回退到 `web.backend`。当 `web.backend` 也为空时,后端根据存在的 API 密钥/URL 自动检测。