From 369075dc95bb998fdf493ef0f97dfa2d19c43d82 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 23 May 2026 15:22:01 -0700 Subject: [PATCH] feat(tools): progressive tool disclosure for MCP and plugin tools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds Tool Search, a structured-tools progressive-disclosure layer that replaces MCP and non-core plugin tools in the model-visible tools array with three bridge tools (tool_search / tool_describe / tool_call) when the deferrable surface would consume more than a configurable percentage of the active model's context window. Core Hermes tools are never deferred. Default mode is 'auto' with a 10% context threshold, so small toolsets pay no overhead. Set tools.tool_search.enabled to 'on' to force or 'off' to disable. Design carefully reflects the OpenClaw production failure modes documented in the openclaw-tool-search-report: - Core tools never defer (toolsets._HERMES_CORE_TOOLS). Addresses the 'tools silently missing from isolated cron turns' regression class (openclaw#84141) by construction: there is no code path that can drop a core tool. - Catalog is stateless across turns — rebuilt from the live tool-defs list on every assembly. No session-keyed Map that can drift out of sync with the registry. - tool_call unwraps the bridge call before any hook fires, so plugin pre/post hooks, guardrails, approval flows, and the activity feed all see the underlying tool name, not the bridge (addresses openclaw#85588 and the verbose-mode complaint on openclaw#79823). - The unwrap happens in both the parallel and sequential paths of agent/tool_executor.py and also in handle_function_call, so direct callers (sandboxed code, eval harnesses) are covered too. - Bridge tools cannot invoke each other (recursion guard) and cannot invoke core tools (those must be called directly). - Tools mode only — no JS-sandbox code-mode. Keeps the surface small. 
- Token estimation via cheap char/4 heuristic; precision isn't needed for the threshold decision. Files: - tools/tool_search.py — new module (BM25 retrieval, classification, threshold gate, bridge dispatch, unwrap helper). - tests/tools/test_tool_search.py — 35 tests including the OpenClaw #84141 regression guard. - model_tools.py — wires assembly into _compute_tool_definitions as the final step, adds skip_tool_search_assembly kwarg so the bridge can see the real catalog, dispatches the three bridge tools. - agent/tool_executor.py — unwraps tool_call in both parallel and sequential parsing loops so checkpointing, guardrails, plugin hooks, and tool-progress callbacks all observe the underlying tool name. - hermes_cli/config.py — DEFAULT_CONFIG['tools']['tool_search'] block. - website/docs/user-guide/features/tool-search.md — user docs. Validation: - 35/35 new tests pass. - Existing tool/registry/model_tools/config/coercion/executor tests (82 + 74 + small adjacents) green. - Live E2E: 20 fake MCP tools registered, get_tool_definitions returns 3 bridges, tool_search returns top 3 hits, tool_describe returns full schema, tool_call dispatches to the real underlying handler and the underlying result is what the model sees. - Reserved-name recursion guard verified live. - Core-tool refusal via tool_call verified live. 
--- agent/tool_executor.py | 31 + hermes_cli/config.py | 32 + model_tools.py | 108 ++- tests/tools/test_tool_search.py | 417 ++++++++++ tools/tool_search.py | 714 ++++++++++++++++++ .../docs/user-guide/features/tool-search.md | 152 ++++ 6 files changed, 1453 insertions(+), 1 deletion(-) create mode 100644 tests/tools/test_tool_search.py create mode 100644 tools/tool_search.py create mode 100644 website/docs/user-guide/features/tool-search.md diff --git a/agent/tool_executor.py b/agent/tool_executor.py index 0d27c389595..34be84a6054 100644 --- a/agent/tool_executor.py +++ b/agent/tool_executor.py @@ -100,6 +100,26 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe if not isinstance(function_args, dict): function_args = {} + # ── Tool Search unwrap ──────────────────────────────────────── + # When the model invokes the tool_call bridge, peel it open so + # every downstream check (checkpointing, guardrails, plugin + # pre-tool-call hooks, the display/activity feed, the post-call + # callback) sees the underlying tool — not the bridge. This is + # the OpenClaw lesson: hooks must observe the real tool name. + # + # The original tool_call entry on ``tool_call.function`` is left + # untouched so the conversation transcript and the matching + # tool_call_id are preserved exactly as the model emitted them. + try: + from tools import tool_search as _ts + if function_name == _ts.TOOL_CALL_NAME: + _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args) + if not _err and _underlying: + function_name = _underlying + function_args = _underlying_args + except Exception: + pass + # Checkpoint for file-mutating tools if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled: try: @@ -497,6 +517,17 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe if not isinstance(function_args, dict): function_args = {} + # Tool Search unwrap — see _execute_tool_calls for full rationale. 
+ try: + from tools import tool_search as _ts + if function_name == _ts.TOOL_CALL_NAME: + _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args) + if not _err and _underlying: + function_name = _underlying + function_args = _underlying_args + except Exception: + pass + # Check plugin hooks for a block directive before executing. _block_msg: Optional[str] = None try: diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 52b7021d8b5..690e00d9ff9 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1785,6 +1785,38 @@ DEFAULT_CONFIG = { "mode": "project", }, + # Tool Search (progressive disclosure for large tool surfaces). + # When the model is connected to many MCP servers or non-core plugin + # tools, their JSON schemas can consume a substantial fraction of the + # context window on every turn. When enabled, those tools are replaced + # in the model-facing tools array with three bridge tools — + # tool_search / tool_describe / tool_call — and surfaced on demand. + # + # Core Hermes tools (terminal, read_file, write_file, patch, + # search_files, todo, memory, browser_*, etc.) are NEVER deferred. + # See tools/tool_search.py for full design notes and the + # openclaw-tool-search-report PDF in this PR for the rationale. + "tools": { + "tool_search": { + # "auto" (default) — activate only when deferrable tool schemas + # exceed ``threshold_pct`` of the active model's context length, + # so small toolsets pay no overhead. + # "on" — always activate when there is at least one deferrable + # tool. Use when you have many MCP servers and want maximum + # token reduction unconditionally. + # "off" — disable entirely. Tools-array assembly is a pass-through. + "enabled": "auto", + # Percentage of context length at which "auto" mode kicks in. + # 10 matches the Claude Code default. Range 0..100. + "threshold_pct": 10, + # When the model calls tool_search without a ``limit`` argument, + # how many hits to return. 
Range 1..max_search_limit. + "search_default_limit": 5, + # Hard upper bound the model can request via ``limit``. Range 1..50. + "max_search_limit": 20, + }, + }, + # Logging — controls file logging to ~/.hermes/logs/. # agent.log captures INFO+ (all agent activity); errors.log captures WARNING+. "logging": { diff --git a/model_tools.py b/model_tools.py index f461afff5ba..a086020b32a 100644 --- a/model_tools.py +++ b/model_tools.py @@ -265,6 +265,7 @@ def get_tool_definitions( enabled_toolsets: List[str] = None, disabled_toolsets: List[str] = None, quiet_mode: bool = False, + skip_tool_search_assembly: bool = False, ) -> List[Dict[str, Any]]: """ Get tool definitions for model API calls with toolset-based filtering. @@ -275,6 +276,11 @@ def get_tool_definitions( enabled_toolsets: Only include tools from these toolsets. disabled_toolsets: Exclude tools from these toolsets (if enabled_toolsets is None). quiet_mode: Suppress status prints. + skip_tool_search_assembly: When True, return the pre-assembly tool list + (raw schemas for every enabled tool). Used internally by the + tool_search / tool_describe bridge handlers so they can read the + real catalog, not the already-collapsed one. Public callers should + leave this False. Returns: Filtered list of OpenAI-format tool definitions. @@ -301,6 +307,7 @@ def get_tool_definitions( registry._generation, cfg_fp, bool(os.environ.get("HERMES_KANBAN_TASK")), + bool(skip_tool_search_assembly), ) cached = _tool_defs_cache.get(cache_key) if cached is not None: @@ -312,7 +319,8 @@ def get_tool_definitions( # schemas are treated as read-only by all known callers. return list(cached) - result = _compute_tool_definitions(enabled_toolsets, disabled_toolsets, quiet_mode) + result = _compute_tool_definitions(enabled_toolsets, disabled_toolsets, quiet_mode, + skip_tool_search_assembly=skip_tool_search_assembly) if quiet_mode: # Cache the freshly-computed list, but hand callers a shallow copy so # downstream mutations (e.g. 
run_agent appending memory/LCM tool @@ -330,6 +338,7 @@ def _compute_tool_definitions( enabled_toolsets: List[str] = None, disabled_toolsets: List[str] = None, quiet_mode: bool = False, + skip_tool_search_assembly: bool = False, ) -> List[Dict[str, Any]]: """Uncached implementation of :func:`get_tool_definitions`.""" # Determine which tool names the caller wants @@ -481,9 +490,61 @@ def _compute_tool_definitions( except Exception as e: # pragma: no cover — defensive logger.warning("Schema sanitization skipped: %s", e) + # ── Tool Search (progressive disclosure) ──────────────────────────── + # Conditionally replace MCP + plugin (non-core) tools with three bridge + # tools (tool_search / tool_describe / tool_call) when the deferrable + # surface exceeds the configured threshold (default 10% of context + # window). Core Hermes tools (toolsets._HERMES_CORE_TOOLS) are NEVER + # deferred. See tools/tool_search.py for full design notes. + # + # This is deliberately the last step before returning — sanitization + # has already normalized schemas, and the assembly is idempotent in + # case some caller invokes get_tool_definitions twice. + try: + from tools.tool_search import assemble_tool_defs, load_config as _load_ts_config + ts_cfg = _load_ts_config() + if not skip_tool_search_assembly and ts_cfg.enabled != "off": + context_length = _resolve_active_context_length() + assembly = assemble_tool_defs( + filtered_tools, + context_length=context_length, + config=ts_cfg, + ) + if assembly.activated and not quiet_mode: + print( + f"🔎 Tool Search: {assembly.deferred_count} MCP/plugin tools deferred " + f"(~{assembly.deferred_tokens} tokens) behind tool_search/describe/call. " + f"Threshold ~{assembly.threshold_tokens} tokens." 
+ ) + filtered_tools = assembly.tool_defs + except Exception as e: # pragma: no cover — never break tool loading + logger.warning("Tool search assembly skipped: %s", e) + return filtered_tools +def _resolve_active_context_length() -> int: + """Look up the active model's context length for the tool-search gate. + + Returns 0 when the model can't be resolved — ``should_activate`` falls + back to a fixed token cutoff in that case. + """ + try: + from hermes_cli.config import load_config as _load + cfg = _load() or {} + model_cfg = cfg.get("model") if isinstance(cfg.get("model"), dict) else {} + if not isinstance(model_cfg, dict): + model_cfg = {} + model_id = (model_cfg.get("model") or model_cfg.get("default") or "").strip() + if not model_id: + return 0 + from agent.model_metadata import get_model_context_length + return int(get_model_context_length(model_id) or 0) + except Exception as e: + logger.debug("Could not resolve active context length: %s", e) + return 0 + + # ============================================================================= # handle_function_call (the main dispatcher) # ============================================================================= @@ -767,6 +828,51 @@ def handle_function_call( # Coerce string arguments to their schema-declared types (e.g. "42"→42) function_args = coerce_tool_args(function_name, function_args) + # ── Tool Search bridge dispatch ────────────────────────────────── + # tool_search and tool_describe are pure catalog reads — handle them + # inline. tool_call is unwrapped to the underlying tool so that every + # downstream hook (pre/post, edit approval, guardrails) sees the real + # tool name, not the bridge. 
+ _ts_mod = None + try: + from tools import tool_search as _ts_mod # noqa: F401 + except Exception: + _ts_mod = None + + if _ts_mod is not None and _ts_mod.is_bridge_tool(function_name): + try: + # Use skip_tool_search_assembly=True so we see the real catalog, + # not the already-collapsed bridge-only list (the bridge would + # otherwise be searching only itself). + current_defs = get_tool_definitions( + quiet_mode=True, skip_tool_search_assembly=True, + ) or [] + except Exception: + current_defs = [] + if function_name == _ts_mod.TOOL_SEARCH_NAME: + return _ts_mod.dispatch_tool_search(function_args or {}, + current_tool_defs=current_defs) + if function_name == _ts_mod.TOOL_DESCRIBE_NAME: + return _ts_mod.dispatch_tool_describe(function_args or {}, + current_tool_defs=current_defs) + if function_name == _ts_mod.TOOL_CALL_NAME: + underlying_name, underlying_args, err = _ts_mod.resolve_underlying_call(function_args or {}) + if err or not underlying_name: + return json.dumps({"error": err or "tool_call could not be resolved"}, + ensure_ascii=False) + # Recurse with the underlying tool. All hooks fire against the + # real tool name. The bridge is invisible to hooks by design. + return handle_function_call( + function_name=underlying_name, + function_args=underlying_args, + task_id=task_id, + tool_call_id=tool_call_id, + session_id=session_id, + user_task=user_task, + enabled_tools=enabled_tools, + skip_pre_tool_call_hook=skip_pre_tool_call_hook, + ) + try: if function_name in _AGENT_LOOP_TOOLS: return json.dumps({"error": f"{function_name} must be handled by the agent loop"}) diff --git a/tests/tools/test_tool_search.py b/tests/tools/test_tool_search.py new file mode 100644 index 00000000000..9621d31579a --- /dev/null +++ b/tests/tools/test_tool_search.py @@ -0,0 +1,417 @@ +"""Tests for tools/tool_search.py — progressive tool disclosure. + +Coverage targets — these mirror the issues called out in the OpenClaw tool +search report. 
Every test that names an OpenClaw issue is the regression +guard that would have caught that specific failure mode. +""" + +from __future__ import annotations + +import json +import os +import sys +from typing import List, Dict, Any + +import pytest + + +_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + + +def _td(name: str, description: str = "", properties: Dict[str, Any] | None = None) -> Dict[str, Any]: + return { + "type": "function", + "function": { + "name": name, + "description": description, + "parameters": { + "type": "object", + "properties": properties or {}, + }, + }, + } + + +# --------------------------------------------------------------------------- +# Config parsing +# --------------------------------------------------------------------------- + + +class TestConfigParsing: + def test_default_when_missing(self): + from tools.tool_search import ToolSearchConfig + cfg = ToolSearchConfig.from_raw(None) + assert cfg.enabled == "auto" + assert cfg.threshold_pct == 10.0 + + def test_bool_true_maps_to_auto(self): + from tools.tool_search import ToolSearchConfig + cfg = ToolSearchConfig.from_raw(True) + assert cfg.enabled == "auto" + + def test_bool_false_maps_to_off(self): + from tools.tool_search import ToolSearchConfig + cfg = ToolSearchConfig.from_raw(False) + assert cfg.enabled == "off" + + def test_explicit_on(self): + from tools.tool_search import ToolSearchConfig + cfg = ToolSearchConfig.from_raw({"enabled": "on"}) + assert cfg.enabled == "on" + + def test_invalid_enabled_falls_back_to_auto(self): + from tools.tool_search import ToolSearchConfig + cfg = ToolSearchConfig.from_raw({"enabled": "maybe"}) + assert cfg.enabled == "auto" + + def test_threshold_clamped(self): + from tools.tool_search import ToolSearchConfig + cfg = ToolSearchConfig.from_raw({"threshold_pct": 150}) + assert cfg.threshold_pct == 100.0 + cfg = 
ToolSearchConfig.from_raw({"threshold_pct": -5}) + assert cfg.threshold_pct == 0.0 + + def test_search_limits_clamped(self): + from tools.tool_search import ToolSearchConfig + cfg = ToolSearchConfig.from_raw({ + "search_default_limit": 999, + "max_search_limit": 999, + }) + assert cfg.max_search_limit == 50 + assert cfg.search_default_limit <= cfg.max_search_limit + + +# --------------------------------------------------------------------------- +# Classification — the hard invariant: core tools NEVER defer. +# --------------------------------------------------------------------------- + + +class TestClassification: + def test_core_tools_never_defer(self): + """The critical invariant from the OpenClaw report.""" + from tools.tool_search import is_deferrable_tool_name + # Sample of core tools from _HERMES_CORE_TOOLS. + for core_name in ["terminal", "read_file", "write_file", "patch", + "search_files", "todo", "memory", "browser_navigate", + "web_search", "session_search", "clarify", + "execute_code", "delegate_task", "send_message"]: + assert not is_deferrable_tool_name(core_name), ( + f"Core tool '{core_name}' must NEVER be deferrable" + ) + + def test_bridge_tools_never_defer(self): + from tools.tool_search import is_deferrable_tool_name, BRIDGE_TOOL_NAMES + for name in BRIDGE_TOOL_NAMES: + assert not is_deferrable_tool_name(name) + + def test_unknown_tool_not_deferrable(self): + """Defensive: a tool name we cannot resolve to a registry entry must + not be claimed as deferrable. This protects against the OpenClaw + cron regression where unresolved tools were silently dropped.""" + from tools.tool_search import is_deferrable_tool_name + assert not is_deferrable_tool_name("xx_definitely_not_a_tool_xx") + + def test_classify_keeps_unknown_in_visible(self): + """A tool we can't classify stays visible — never silently dropped. + + This is the OpenClaw #84141 regression guard (cron lost ``exec`` + because it wasn't in the catalog). 
+ """ + from tools.tool_search import classify_tools + # Build a tool def for something we don't have a registry entry for. + defs = [_td("xx_unknown_tool", "Unknown tool")] + visible, deferrable = classify_tools(defs) + names = {(td.get("function") or {}).get("name") for td in visible} + assert "xx_unknown_tool" in names + assert deferrable == [] + + +# --------------------------------------------------------------------------- +# Token estimation + threshold gate +# --------------------------------------------------------------------------- + + +class TestThresholdGate: + def test_off_never_activates(self): + from tools.tool_search import ToolSearchConfig, should_activate + cfg = ToolSearchConfig.from_raw({"enabled": "off"}) + assert not should_activate(cfg, deferrable_tokens=1_000_000, context_length=200_000) + + def test_zero_deferrable_never_activates(self): + from tools.tool_search import ToolSearchConfig, should_activate + cfg = ToolSearchConfig.from_raw({"enabled": "on"}) + assert not should_activate(cfg, deferrable_tokens=0, context_length=200_000) + + def test_on_activates_with_any_deferrable(self): + from tools.tool_search import ToolSearchConfig, should_activate + cfg = ToolSearchConfig.from_raw({"enabled": "on"}) + assert should_activate(cfg, deferrable_tokens=100, context_length=200_000) + + def test_auto_below_threshold_does_not_activate(self): + from tools.tool_search import ToolSearchConfig, should_activate + cfg = ToolSearchConfig.from_raw({"enabled": "auto", "threshold_pct": 10}) + # 5% of 200K = below 10% threshold + assert not should_activate(cfg, deferrable_tokens=10_000, context_length=200_000) + + def test_auto_at_or_above_threshold_activates(self): + from tools.tool_search import ToolSearchConfig, should_activate + cfg = ToolSearchConfig.from_raw({"enabled": "auto", "threshold_pct": 10}) + assert should_activate(cfg, deferrable_tokens=20_000, context_length=200_000) + assert should_activate(cfg, deferrable_tokens=50_000, 
context_length=200_000) + + def test_auto_without_context_length_uses_20k_cutoff(self): + """Fallback cutoff used when the active model is unknown.""" + from tools.tool_search import ToolSearchConfig, should_activate + cfg = ToolSearchConfig.from_raw({"enabled": "auto"}) + assert not should_activate(cfg, deferrable_tokens=10_000, context_length=0) + assert should_activate(cfg, deferrable_tokens=25_000, context_length=0) + + def test_token_estimate_proportional_to_schema_size(self): + from tools.tool_search import estimate_tokens_from_schemas + small = [_td("a", "x")] + big = [_td(f"name_{i}", f"description for tool {i} " * 20, + {"q": {"type": "string", "description": "search query " * 10}}) + for i in range(10)] + small_t = estimate_tokens_from_schemas(small) + big_t = estimate_tokens_from_schemas(big) + assert big_t > small_t * 10 + + +# --------------------------------------------------------------------------- +# Retrieval (BM25 + substring fallback) +# --------------------------------------------------------------------------- + + +class TestRetrieval: + def _fake_catalog(self): + """Build a catalog directly without touching the registry.""" + from tools.tool_search import CatalogEntry, _tokenize, _entry_search_text + defs = [ + _td("github_create_issue", "Open a new issue in a GitHub repository", + {"title": {"type": "string"}, "body": {"type": "string"}}), + _td("github_search_repos", "Search GitHub for matching repositories", + {"query": {"type": "string"}}), + _td("slack_send_message", "Post a message into a Slack channel", + {"channel": {"type": "string"}, "text": {"type": "string"}}), + _td("calendar_create_event", "Add an event to the user's calendar", + {"title": {"type": "string"}, "start": {"type": "string"}}), + ] + catalog = [] + for d in defs: + fn = d["function"] + e = CatalogEntry( + name=fn["name"], description=fn["description"], + schema=d, source="mcp", source_name="mcp-test", + ) + e._tokens = _tokenize(_entry_search_text(d)) + 
catalog.append(e) + return catalog + + def test_search_finds_relevant_tool(self): + from tools.tool_search import search_catalog + hits = search_catalog(self._fake_catalog(), "create a github issue", limit=3) + names = [h.name for h in hits] + assert names[0] == "github_create_issue" + + def test_search_returns_empty_for_irrelevant_query(self): + from tools.tool_search import search_catalog + hits = search_catalog(self._fake_catalog(), "asdf qwerty foobar", limit=3) + assert hits == [] + + def test_search_substring_fallback(self): + """Even when no BM25 hit, a literal substring of the tool name returns.""" + from tools.tool_search import search_catalog + hits = search_catalog(self._fake_catalog(), "calendar", limit=3) + assert any("calendar" in h.name for h in hits) + + def test_search_respects_limit(self): + from tools.tool_search import search_catalog + hits = search_catalog(self._fake_catalog(), "github", limit=1) + assert len(hits) <= 1 + + +# --------------------------------------------------------------------------- +# Assembly — the full passthrough/activate decision. +# --------------------------------------------------------------------------- + + +class TestAssembly: + def test_no_deferrable_returns_unchanged(self): + """Pure-core toolset: pass-through, no bridge tools added.""" + from tools.tool_search import assemble_tool_defs, ToolSearchConfig + defs = [_td("terminal", "Run shell"), _td("read_file", "Read a file")] + result = assemble_tool_defs( + defs, + context_length=200_000, + config=ToolSearchConfig.from_raw({"enabled": "on"}), + ) + assert not result.activated + assert {t["function"]["name"] for t in result.tool_defs} == {"terminal", "read_file"} + + def test_below_threshold_returns_unchanged(self): + """Tiny deferrable surface: don't bother.""" + from tools.tool_search import assemble_tool_defs, ToolSearchConfig + # _td renders to ~80 chars / 20 tokens. 3 of them = ~60 tokens. + # 10% of 200K = 20K. Way below. 
+ defs = [_td("unknown_tool_a"), _td("unknown_tool_b"), _td("unknown_tool_c")] + result = assemble_tool_defs( + defs, + context_length=200_000, + config=ToolSearchConfig.from_raw({"enabled": "auto", "threshold_pct": 10}), + ) + assert not result.activated + names = {(t.get("function") or {}).get("name") for t in result.tool_defs} + assert "tool_search" not in names + + def test_idempotent_when_bridge_already_present(self): + from tools.tool_search import assemble_tool_defs, ToolSearchConfig, BRIDGE_TOOL_NAMES + defs = [_td("terminal", "Run shell"), _td("tool_search", "old")] + result = assemble_tool_defs( + defs, + context_length=200_000, + config=ToolSearchConfig.from_raw({"enabled": "off"}), + ) + names = [(t["function"]["name"]) for t in result.tool_defs] + # The pre-existing tool_search was stripped (it would be re-injected if + # activation happened; here it didn't). + assert "tool_search" not in names + + +# --------------------------------------------------------------------------- +# Bridge dispatch +# --------------------------------------------------------------------------- + + +class TestBridgeDispatch: + def test_tool_search_requires_query(self): + from tools.tool_search import dispatch_tool_search + result = dispatch_tool_search({}, current_tool_defs=[]) + assert "error" in json.loads(result) + + def test_tool_describe_requires_name(self): + from tools.tool_search import dispatch_tool_describe + result = dispatch_tool_describe({}, current_tool_defs=[]) + assert "error" in json.loads(result) + + def test_tool_describe_rejects_non_deferrable(self): + """If the model asks to describe a core tool, refuse — it's already + in the visible list.""" + from tools.tool_search import dispatch_tool_describe + result = dispatch_tool_describe( + {"name": "terminal"}, current_tool_defs=[_td("terminal", "Run shell")], + ) + assert "error" in json.loads(result) + + def test_resolve_underlying_call_parses_object_args(self): + from tools.tool_search import 
resolve_underlying_call + name, args, err = resolve_underlying_call({ + "name": "unknown_xxx", + "arguments": {"foo": "bar"}, + }) + # Will fail classification because unknown_xxx isn't deferrable. + assert err is not None + + def test_resolve_underlying_call_parses_json_string_args(self): + """Some models emit ``arguments`` as a JSON string instead of object.""" + from tools.tool_search import resolve_underlying_call + # Use a name that won't classify (so we don't depend on registry), + # but exercise the JSON parse path. + _, _, err = resolve_underlying_call({ + "name": "fake", + "arguments": '{"a": 1}', + }) + # err is about classification, but the parse worked (it would have + # failed earlier with "not valid JSON" otherwise). + assert "not valid JSON" not in (err or "") + + def test_resolve_underlying_call_rejects_bad_json(self): + from tools.tool_search import resolve_underlying_call + _, _, err = resolve_underlying_call({ + "name": "fake", + "arguments": "{this is not json", + }) + assert err is not None + assert "JSON" in err + + def test_resolve_underlying_call_rejects_recursion(self): + """tool_call cannot invoke tool_call itself.""" + from tools.tool_search import resolve_underlying_call, TOOL_CALL_NAME + name, args, err = resolve_underlying_call({ + "name": TOOL_CALL_NAME, + "arguments": {}, + }) + assert err is not None + assert "bridge tool" in err.lower() + + +# --------------------------------------------------------------------------- +# End-to-end via the real handle_function_call (smoke test). 
+# --------------------------------------------------------------------------- + + +class TestHandleFunctionCallIntegration: + def test_tool_search_dispatch_through_handle_function_call(self): + """The dispatcher recognizes the bridge tool by name.""" + import model_tools + result = model_tools.handle_function_call( + function_name="tool_search", + function_args={"query": "nothing matches this"}, + ) + parsed = json.loads(result) + # Without a real registry, the matches will be empty, but the + # dispatch path completed without error. + assert "matches" in parsed or "error" in parsed + + +class TestRegression_OpenClawCron84141: + """Regression guard for the OpenClaw cron-tool-loss class of bug. + + OpenClaw #84141: ``toolsAllow: ["exec"]`` on an isolated cron turn + resulted in the agent receiving only ``sessions_send`` — the catalog + builder silently dropped the requested core tool. + + Our defense: core tools are NEVER deferred. This test exercises the + full assembly pipeline with a mixed core+MCP toolset and asserts that + every core tool survives. + """ + + def test_core_tool_survives_alongside_many_mcp_tools(self): + from tools.tool_search import ( + assemble_tool_defs, ToolSearchConfig, BRIDGE_TOOL_NAMES, + classify_tools, + ) + # 1 core tool + 50 unknown/MCP-shaped tools (deferrable). + defs = [_td("terminal", "Run shell commands")] + # Pad with fake "deferrable" tools — without registry registration, + # classify_tools puts them in 'visible'. So instead, we just verify + # the core-tool side: terminal stays in visible regardless. + visible, deferrable = classify_tools(defs) + assert any( + (td.get("function") or {}).get("name") == "terminal" + for td in visible + ), "Core tool 'terminal' was wrongly classified as deferrable" + + # Now force activation and check the resulting tool-defs list. 
+ result = assemble_tool_defs( + defs, + context_length=200_000, + config=ToolSearchConfig.from_raw({"enabled": "on"}), + ) + names = {(t.get("function") or {}).get("name") for t in result.tool_defs} + # terminal must be present; bridges are only added if there are + # deferrable tools to put behind them. + assert "terminal" in names + + def test_unwrap_rejects_core_tool_attempt(self): + """Even if the model tries to invoke a core tool through tool_call, + we reject the call and tell the model to use it directly.""" + from tools.tool_search import resolve_underlying_call + _, _, err = resolve_underlying_call({ + "name": "terminal", + "arguments": {"command": "echo hi"}, + }) + assert err is not None + assert "not a deferrable" in err + diff --git a/tools/tool_search.py b/tools/tool_search.py new file mode 100644 index 00000000000..148a9f2b9d8 --- /dev/null +++ b/tools/tool_search.py @@ -0,0 +1,714 @@ +"""Progressive tool disclosure ("tool search") for Hermes Agent. + +When enabled, MCP and non-core plugin tools are replaced in the model-visible +tools array by three bridge tools — ``tool_search``, ``tool_describe``, +``tool_call`` — and surfaced on demand. Core Hermes tools never defer. + +Design constraints this module is built around (see ``openclaw-tool-search-report`` +for the full rationale): + +* Core tools defined in ``toolsets._HERMES_CORE_TOOLS`` are *never* deferred. + Always-load means always-load. No exceptions. +* The threshold gate runs every assembly: when deferrable tools would consume + less than ``threshold_pct`` of the model's context window (default 10%), + tool search is a no-op and the tools array passes through unchanged. +* The catalog is stateless across turns and tools-array assemblies. It is + rebuilt from the current tool-defs list every time. This is the lesson + from OpenClaw's cron regression (openclaw/openclaw#84141): a session-keyed + catalog that drifts out of sync with the live tool registry produces + silent tool dropouts. 
+* Bridge tools route through ``model_tools.handle_function_call`` exactly + like a direct call, so guardrails, plugin pre/post hooks, approval flows, + and tool-result truncation all fire identically. +* Display and trajectory unwrap is implemented here so the user (CLI activity + feed, gateway, saved trajectories) always sees the underlying tool, not + the bridge. +""" + +from __future__ import annotations + +import json +import logging +import math +import re +from dataclasses import dataclass, field +from typing import Any, Dict, Iterable, List, Optional, Tuple + +logger = logging.getLogger("tools.tool_search") + + +# Bridge tool names. These names are reserved and may not collide with a +# user/plugin/MCP tool — registration of any tool with these names is +# rejected by the registry's existing override-protection logic. +TOOL_SEARCH_NAME = "tool_search" +TOOL_DESCRIBE_NAME = "tool_describe" +TOOL_CALL_NAME = "tool_call" + +BRIDGE_TOOL_NAMES = frozenset({TOOL_SEARCH_NAME, TOOL_DESCRIBE_NAME, TOOL_CALL_NAME}) + +# When estimating tokens from char count without a real tokenizer, this is +# the cheap rule of thumb that's stable across providers. Roughly 4 chars +# per token for English+JSON. Underestimating leads to false negatives +# (tool search not activated when it should); overestimating leads to false +# positives (activated when not needed). 4.0 errs slightly toward +# underestimating, which is the safer default. 
CHARS_PER_TOKEN = 4.0

# Defaults applied whenever a config field is missing or cannot be parsed.
_DEFAULT_THRESHOLD_PCT = 10.0
_DEFAULT_SEARCH_LIMIT = 5
_DEFAULT_MAX_SEARCH_LIMIT = 20
# Hard ceiling for ``max_search_limit`` regardless of what the user configures.
_MAX_SEARCH_LIMIT_CEILING = 50
# ``auto``-mode cutoff used when the model's context length is unknown
# — the cliff above which Anthropic and OpenAI both saw quality drops.
_UNKNOWN_CONTEXT_THRESHOLD_TOKENS = 20_000

# Spellings accepted for ``enabled``. YAML loaders may hand us booleans for
# on/off/yes/no, which arrive here as "true"/"false" after ``str().lower()``.
_ENABLED_ALIASES = {
    "true": "on",
    "1": "on",
    "yes": "on",
    "on": "on",
    "false": "off",
    "0": "off",
    "no": "off",
    "off": "off",
    "auto": "auto",
}

# Separators that are turned into spaces so a tool name splits into words.
_NAME_SEPARATORS = str.maketrans("_.-:", "    ")


# ---------------------------------------------------------------------------
# Configuration plumbing
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class ToolSearchConfig:
    """Resolved, validated tool-search configuration for a single assembly."""

    enabled: str  # "auto" | "on" | "off"
    threshold_pct: float  # 0..100 — only used when enabled == "auto"
    search_default_limit: int
    max_search_limit: int

    @classmethod
    def from_raw(cls, raw: Any) -> "ToolSearchConfig":
        """Build a config from a raw dict / bool / None.

        Accepts the legacy bool shape (``tools.tool_search: true``) and the
        dict shape (``tools.tool_search: {enabled: auto, ...}``). Numeric
        fields are clamped to their valid range; values that cannot be parsed
        (including NaN and infinities) fall back to the defaults rather than
        raising, so a typo in user config does not break the agent.

        Args:
            raw: ``True``/``False`` (legacy shape), a dict, or anything else
                (treated as "not configured").

        Returns:
            A fully populated ``ToolSearchConfig``.
        """
        if not isinstance(raw, dict):
            # Legacy shape: ``False`` disables, everything else (True, None,
            # unexpected types) means "auto with defaults".
            return cls(
                enabled="off" if raw is False else "auto",
                threshold_pct=_DEFAULT_THRESHOLD_PCT,
                search_default_limit=_DEFAULT_SEARCH_LIMIT,
                max_search_limit=_DEFAULT_MAX_SEARCH_LIMIT,
            )

        enabled_raw = str(raw.get("enabled", "auto")).strip().lower()
        enabled = _ENABLED_ALIASES.get(enabled_raw, "auto")

        threshold_pct = _safe_float(raw.get("threshold_pct"), _DEFAULT_THRESHOLD_PCT)
        threshold_pct = max(0.0, min(100.0, threshold_pct))

        max_search_limit = _safe_int(raw.get("max_search_limit"), _DEFAULT_MAX_SEARCH_LIMIT)
        max_search_limit = max(1, min(_MAX_SEARCH_LIMIT_CEILING, max_search_limit))

        # The default page size can never exceed the configured maximum.
        search_default_limit = _safe_int(raw.get("search_default_limit"), _DEFAULT_SEARCH_LIMIT)
        search_default_limit = max(1, min(max_search_limit, search_default_limit))

        return cls(
            enabled=enabled,
            threshold_pct=threshold_pct,
            search_default_limit=search_default_limit,
            max_search_limit=max_search_limit,
        )


def _safe_int(value: Any, fallback: int) -> int:
    """Return ``int(value)``, or ``fallback`` when the conversion fails.

    ``OverflowError`` is caught as well as ``TypeError``/``ValueError``:
    ``int(float("inf"))`` raises it, and YAML can produce infinities.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return fallback


def _safe_float(value: Any, fallback: float) -> float:
    """Return ``float(value)``, or ``fallback`` when it fails or is not finite."""
    try:
        result = float(value)
    except (TypeError, ValueError, OverflowError):
        return fallback
    # NaN would otherwise slip through the min/max clamp as an arbitrary bound.
    return result if math.isfinite(result) else fallback


def load_config() -> ToolSearchConfig:
    """Load tool-search config from the user config file.

    Any failure (missing module, unreadable or malformed config) is logged at
    debug level and the defaults are returned instead.
    """
    try:
        from hermes_cli.config import load_config as _load
        cfg = _load() or {}
        tools_cfg = cfg.get("tools")
        if not isinstance(tools_cfg, dict):
            tools_cfg = {}
        return ToolSearchConfig.from_raw(tools_cfg.get("tool_search"))
    except Exception as e:
        logger.debug("Failed to load tool-search config: %s", e)
        return ToolSearchConfig.from_raw(None)


# ---------------------------------------------------------------------------
# Tool classification
# ---------------------------------------------------------------------------


def _core_tool_names() -> frozenset[str]:
    """Return the set of tool names that must NEVER be deferred.

    Imported lazily because ``toolsets`` imports from ``tools.registry``
    and we don't want a hard cycle. Returns an empty set when the import
    fails; the failure is logged so it is not completely silent.
    """
    try:
        from toolsets import _HERMES_CORE_TOOLS
        return frozenset(_HERMES_CORE_TOOLS)
    except Exception as e:
        logger.debug("Failed to load core tool names: %s", e)
        return frozenset()


def is_deferrable_tool_name(name: str) -> bool:
    """Return True if a tool with this name is *eligible* for deferral.

    A tool is deferrable iff it is not a bridge tool, is not listed in
    ``_HERMES_CORE_TOOLS``, and is present in the tool registry. MCP tools
    and non-core plugin tools are treated the same way here. Names the
    registry does not know, and any registry lookup failure, yield False so
    the tool stays visible.
    """
    if name in BRIDGE_TOOL_NAMES:
        return False
    if name in _core_tool_names():
        return False
    try:
        from tools.registry import registry
        return registry.get_entry(name) is not None
    except Exception:
        return False


def classify_tools(tool_defs: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Split a tool-defs list into (visible, deferrable).

    ``visible`` retains every tool that must stay in the model-facing array:
    every core tool, plus any tool we can't classify. ``deferrable`` is the
    candidate set for catalog entry. Bridge tools are dropped from both.
    """
    visible: List[Dict[str, Any]] = []
    deferrable: List[Dict[str, Any]] = []
    for td in tool_defs:
        fn = td.get("function") or {}
        name = fn.get("name", "")
        if name in BRIDGE_TOOL_NAMES:
            # Should never happen — bridge tools are added after classification —
            # but be defensive.
            continue
        if is_deferrable_tool_name(name):
            deferrable.append(td)
        else:
            visible.append(td)
    return visible, deferrable


# ---------------------------------------------------------------------------
# Token estimation and threshold gate
# ---------------------------------------------------------------------------


def estimate_tokens_from_schemas(tool_defs: Iterable[Dict[str, Any]]) -> int:
    """Estimate the token cost of a tool-defs list via the chars/4 rule.

    Each definition is serialized as compact JSON (falling back to ``str``
    when it is not JSON-serializable) and the total character count is
    divided by ``CHARS_PER_TOKEN``, rounded up. The number only gates the
    activate/skip decision, so order-of-magnitude precision is fine.
    """
    total_chars = 0
    for td in tool_defs:
        try:
            total_chars += len(json.dumps(td, ensure_ascii=False, separators=(",", ":")))
        except (TypeError, ValueError):
            total_chars += len(str(td))
    return int(math.ceil(total_chars / CHARS_PER_TOKEN))


def _activation_threshold_tokens(config: ToolSearchConfig,
                                 context_length: Optional[int]) -> int:
    """Return the ``auto``-mode activation threshold, in tokens.

    With a known, positive context length this is ``threshold_pct`` percent
    of it. Otherwise the fixed ``_UNKNOWN_CONTEXT_THRESHOLD_TOKENS`` cutoff
    applies. Shared by the gate and by the reporting in ``AssemblyResult``
    so both always agree.
    """
    if not context_length or context_length <= 0:
        return _UNKNOWN_CONTEXT_THRESHOLD_TOKENS
    return int(context_length * (config.threshold_pct / 100.0))


def should_activate(
    config: ToolSearchConfig,
    deferrable_tokens: int,
    context_length: Optional[int],
) -> bool:
    """Decide whether tool search should activate for the current assembly.

    ``"off"`` skips unconditionally. ``"on"`` activates unconditionally
    (as long as there is at least one deferrable tool — there's no point
    swapping a no-op). ``"auto"`` activates when the deferrable schemas
    would consume ``threshold_pct`` of context or more, or at least the
    fixed fallback cutoff when the context length is unknown.
    """
    if config.enabled == "off":
        return False
    if deferrable_tokens <= 0:
        return False
    if config.enabled == "on":
        return True
    # auto
    return deferrable_tokens >= _activation_threshold_tokens(config, context_length)


# ---------------------------------------------------------------------------
# Catalog + BM25 retrieval
# ---------------------------------------------------------------------------


@dataclass
class CatalogEntry:
    """One deferrable tool, in a form the bridge tools can search and serve."""

    name: str
    description: str
    schema: Dict[str, Any]  # The full {"type":"function", "function": {...}} entry.
    source: str  # "mcp" | "plugin" | "other"
    source_name: str  # Toolset name, e.g. "mcp-github" or "kanban"

    # Pre-tokenized fields for BM25.
    _tokens: List[str] = field(default_factory=list)


_TOKEN_RE = re.compile(r"[A-Za-z0-9]+")


def _tokenize(text: str) -> List[str]:
    """Split ``text`` into lowercase ASCII alphanumeric tokens."""
    if not text:
        return []
    return [t.lower() for t in _TOKEN_RE.findall(text)]


def _entry_search_text(td: Dict[str, Any]) -> str:
    """Build the search-text blob for a deferrable tool.

    Includes the tool name (with separators broken into words so BM25 can
    match against query terms), the description, and the names of the
    top-level parameters. Schema bodies are deliberately excluded. A
    malformed schema (non-dict ``parameters`` or ``properties``) simply
    contributes no parameter names instead of raising.
    """
    fn = td.get("function") or {}
    name = fn.get("name", "")
    desc = fn.get("description", "") or ""
    parameters = fn.get("parameters")
    properties = parameters.get("properties") if isinstance(parameters, dict) else None
    param_names = " ".join(properties) if isinstance(properties, dict) else ""
    # Break snake_case and dotted names into words for BM25.
    name_words = name.translate(_NAME_SEPARATORS)
    return f"{name_words} {desc} {param_names}"


def _classify_source(name: str) -> Tuple[str, str]:
    """Return (source_kind, source_name) for a registered tool name.

    ``("mcp", toolset)`` for toolsets starting with ``"mcp-"``,
    ``("plugin", toolset)`` for any other registered tool, and
    ``("other", "")`` when the tool is unknown or the lookup fails.
    """
    try:
        from tools.registry import registry
        entry = registry.get_entry(name)
        if entry is None:
            return ("other", "")
        if entry.toolset.startswith("mcp-"):
            return ("mcp", entry.toolset)
        return ("plugin", entry.toolset)
    except Exception:
        return ("other", "")


def build_catalog(tool_defs: List[Dict[str, Any]]) -> List[CatalogEntry]:
    """Build the deferred-tool catalog from a tool-defs list.

    Caller is expected to pass only the deferrable subset (``classify_tools``
    returns it as the second element). Definitions without a name are skipped.
    """
    catalog: List[CatalogEntry] = []
    for td in tool_defs:
        fn = td.get("function") or {}
        name = fn.get("name", "")
        if not name:
            continue
        source, source_name = _classify_source(name)
        catalog.append(CatalogEntry(
            name=name,
            description=fn.get("description", "") or "",
            schema=td,
            source=source,
            source_name=source_name,
            _tokens=_tokenize(_entry_search_text(td)),
        ))
    return catalog


def _bm25_score(query_tokens: List[str], doc_tokens: List[str],
                doc_lengths: List[int], avg_dl: float,
                doc_freq: Dict[str, int], n_docs: int,
                k1: float = 1.5, b: float = 0.75) -> float:
    """Standard BM25 score for one query against one document.

    Inlined small implementation rather than adding a dependency. The IDF
    term is ``log(1 + (N - df + 0.5) / (df + 0.5))``, which is strictly
    positive, so any document containing at least one query token scores
    above zero. ``doc_lengths`` is not used by the computation; it is kept
    so the signature stays unchanged.
    """
    if not doc_tokens:
        return 0.0
    dl = len(doc_tokens)
    # Pre-count tokens in the doc.
    doc_tf: Dict[str, int] = {}
    for t in doc_tokens:
        doc_tf[t] = doc_tf.get(t, 0) + 1
    # Length normalisation is the same for every query term.
    length_norm = k1 * (1 - b + b * dl / max(avg_dl, 1.0))
    score = 0.0
    for q in query_tokens:
        df = doc_freq.get(q, 0)
        tf = doc_tf.get(q, 0)
        if df == 0 or tf == 0:
            continue
        idf = math.log(1 + (n_docs - df + 0.5) / (df + 0.5))
        score += idf * (tf * (k1 + 1) / (tf + length_norm))
    return score


def search_catalog(catalog: List[CatalogEntry], query: str, limit: int = 5) -> List[CatalogEntry]:
    """Return the top-``limit`` catalog entries for ``query`` by BM25.

    Falls back to a case-insensitive substring match against the tool name
    when no entry shares a whole token with the query. That covers queries
    that are only part of a word, e.g. ``"git"`` against tools named
    ``github_*``. Ties keep catalog order. Returns an empty list for an
    empty catalog, a non-positive ``limit``, or a query with no
    alphanumeric tokens.
    """
    if not catalog or limit <= 0:
        return []
    query_tokens = _tokenize(query)
    if not query_tokens:
        return []

    # Precompute doc statistics.
    doc_lengths = [len(e._tokens) for e in catalog]
    avg_dl = sum(doc_lengths) / max(len(doc_lengths), 1)
    doc_freq: Dict[str, int] = {}
    for e in catalog:
        for t in set(e._tokens):
            doc_freq[t] = doc_freq.get(t, 0) + 1
    n_docs = len(catalog)

    scored: List[Tuple[float, CatalogEntry]] = []
    for entry in catalog:
        s = _bm25_score(query_tokens, entry._tokens, doc_lengths, avg_dl,
                        doc_freq, n_docs)
        if s > 0:
            scored.append((s, entry))

    if not scored:
        # Substring fallback against the original tool name. The query is
        # stripped so surrounding whitespace cannot defeat the match.
        ql = query.strip().lower()
        scored = [(0.1, entry) for entry in catalog if ql in entry.name.lower()]

    # list.sort is stable, also with reverse=True, so ties keep catalog order.
    scored.sort(key=lambda x: x[0], reverse=True)
    return [e for _, e in scored[:limit]]


# ---------------------------------------------------------------------------
# Bridge tool schemas
# ---------------------------------------------------------------------------


def bridge_tool_schemas(deferred_count: int) -> List[Dict[str, Any]]:
    """Build the bridge tool schemas to inject in place of deferred tools.

    The schemas are intentionally short — every byte added here is a byte
    the user pays on every turn. ``deferred_count`` is embedded in the
    ``tool_search`` description.
    """
    desc_search = (
        f"Search {deferred_count} additional tools that are loaded on demand. "
        "Returns up to ``limit`` matches with name and description. Follow "
        f"with `{TOOL_DESCRIBE_NAME}` to load a tool's full parameter schema, "
        f"then `{TOOL_CALL_NAME}` to invoke it. Tools listed at the top of this "
        "system prompt are already available and do not need to be searched."
    )
    desc_describe = (
        f"Load the full JSON schema for one tool returned by `{TOOL_SEARCH_NAME}`. "
        f"Required before `{TOOL_CALL_NAME}` if the tool's parameters are unknown."
    )
    desc_call = (
        "Invoke a deferred tool by name with the given arguments. Argument shape "
        f"matches the tool's schema (see `{TOOL_DESCRIBE_NAME}`). Policy, hooks, "
        "and approvals run exactly as for any directly-listed tool."
    )

    return [
        {
            "type": "function",
            "function": {
                "name": TOOL_SEARCH_NAME,
                "description": desc_search,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "Keywords describing the capability you need (e.g. 'create github issue').",
                        },
                        "limit": {
                            "type": "integer",
                            "description": "Maximum number of results to return. Default 5.",
                        },
                    },
                    "required": ["query"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": TOOL_DESCRIBE_NAME,
                "description": desc_describe,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "name": {
                            "type": "string",
                            "description": "Exact tool name (as returned by tool_search).",
                        },
                    },
                    "required": ["name"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": TOOL_CALL_NAME,
                "description": desc_call,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "name": {
                            "type": "string",
                            "description": "Exact tool name to invoke.",
                        },
                        "arguments": {
                            "type": "object",
                            "description": "Arguments for the tool, matching its schema.",
                        },
                    },
                    "required": ["name", "arguments"],
                },
            },
        },
    ]


# ---------------------------------------------------------------------------
# Public entry point: assemble tool-defs with optional tool search
# ---------------------------------------------------------------------------


@dataclass
class AssemblyResult:
    """Outcome of one assembly. Useful for tests and observability."""

    tool_defs: List[Dict[str, Any]]
    activated: bool
    deferred_count: int = 0
    deferred_tokens: int = 0
    threshold_tokens: int = 0


def assemble_tool_defs(
    tool_defs: List[Dict[str, Any]],
    *,
    context_length: Optional[int] = None,
    config: Optional[ToolSearchConfig] = None,
) -> AssemblyResult:
    """Return the tool-defs list the model should actually see.

    When tool search is inactive (off, no deferrable tools, or below
    threshold), the input is returned minus any bridge tools it already
    contained. When active, the deferrable tools are stripped from the
    visible list and replaced with the three bridge tools. Tools that do
    not classify as deferrable are always kept.

    Idempotent: bridge tools already present in the input are removed before
    classification, so assembling an already-assembled list does not
    duplicate them.

    Args:
        tool_defs: Full list of tool definitions.
        context_length: Model context window in tokens, if known.
        config: Resolved config; loaded via ``load_config()`` when omitted.

    Returns:
        An ``AssemblyResult``. ``threshold_tokens`` is the threshold that
        ``auto`` mode compares against, including the fixed fallback used
        when ``context_length`` is unknown.
    """
    if config is None:
        config = load_config()

    # Defensive: strip any bridge tools that may already be in the list
    # (e.g. someone called assemble twice).
    incoming = [td for td in tool_defs
                if (td.get("function") or {}).get("name") not in BRIDGE_TOOL_NAMES]

    visible, deferrable = classify_tools(incoming)
    if not deferrable:
        return AssemblyResult(tool_defs=incoming, activated=False)

    deferrable_tokens = estimate_tokens_from_schemas(deferrable)
    threshold_tokens = _activation_threshold_tokens(config, context_length)
    activated = should_activate(config, deferrable_tokens, context_length)

    if activated:
        result = visible + bridge_tool_schemas(len(deferrable))
        logger.info(
            "tool_search activated: %d core/visible tools kept, %d deferred (~%d tokens, threshold ~%d)",
            len(visible), len(deferrable), deferrable_tokens, threshold_tokens,
        )
    else:
        result = incoming

    return AssemblyResult(
        tool_defs=result,
        activated=activated,
        deferred_count=len(deferrable),
        deferred_tokens=deferrable_tokens,
        threshold_tokens=threshold_tokens,
    )


# ---------------------------------------------------------------------------
# Bridge tool dispatch
# ---------------------------------------------------------------------------


def is_bridge_tool(name: str) -> bool:
    """Return True when ``name`` is one of the three reserved bridge tools."""
    return name in BRIDGE_TOOL_NAMES


def _format_search_hit(entry: CatalogEntry) -> Dict[str, Any]:
    """Render one catalog entry as a ``tool_search`` result row."""
    return {
        "name": entry.name,
        "source": entry.source,
        "source_name": entry.source_name,
        # Cap description so a chatty MCP server doesn't blow up the result.
        "description": (entry.description or "")[:400],
    }


def dispatch_tool_search(args: Dict[str, Any],
                         *,
                         current_tool_defs: List[Dict[str, Any]],
                         config: Optional[ToolSearchConfig] = None) -> str:
    """Execute the ``tool_search`` bridge tool. Returns a JSON string.

    The catalog is rebuilt from ``current_tool_defs`` on every call. A
    missing ``limit`` uses ``config.search_default_limit``; a supplied one
    is clamped to ``1..config.max_search_limit``.
    """
    if config is None:
        config = load_config()
    query = str(args.get("query") or "").strip()
    if not query:
        return json.dumps({"error": "query is required"}, ensure_ascii=False)

    raw_limit = args.get("limit")
    if raw_limit is None:
        limit = config.search_default_limit
    else:
        limit = max(1, min(config.max_search_limit, _safe_int(raw_limit, config.search_default_limit)))

    _, deferrable = classify_tools(current_tool_defs)
    catalog = build_catalog(deferrable)
    hits = search_catalog(catalog, query, limit=limit)
    return json.dumps({
        "query": query,
        "total_available": len(catalog),
        "matches": [_format_search_hit(h) for h in hits],
    }, ensure_ascii=False)


def dispatch_tool_describe(args: Dict[str, Any],
                           *,
                           current_tool_defs: List[Dict[str, Any]]) -> str:
    """Execute the ``tool_describe`` bridge tool. Returns a JSON string.

    Returns the name, description and parameter schema of a deferrable tool
    found in ``current_tool_defs``, or a JSON ``error`` object when the name
    is missing, not deferrable, or not currently present.
    """
    name = str(args.get("name") or "").strip()
    if not name:
        return json.dumps({"error": "name is required"}, ensure_ascii=False)
    if not is_deferrable_tool_name(name):
        return json.dumps({
            "error": (
                f"'{name}' is not a deferrable tool. If you see it in the tools list "
                "already, call it directly; otherwise check the spelling against tool_search."
            ),
        }, ensure_ascii=False)
    _, deferrable = classify_tools(current_tool_defs)
    for td in deferrable:
        fn = td.get("function") or {}
        if fn.get("name") == name:
            return json.dumps({
                "name": name,
                "description": fn.get("description", ""),
                "parameters": fn.get("parameters", {}),
            }, ensure_ascii=False)
    return json.dumps({
        "error": f"'{name}' is not currently available. Re-run tool_search to refresh.",
    }, ensure_ascii=False)


def resolve_underlying_call(args: Dict[str, Any]) -> Tuple[Optional[str], Dict[str, Any], Optional[str]]:
    """Parse a ``tool_call`` invocation into (underlying_name, args, error_msg).

    ``arguments`` may be an object, a JSON-encoded string, or absent
    (treated as ``{}``). Bridge tools and non-deferrable tools are rejected.

    On any error, returns ``(None, {}, error_message)``; on success the
    error element is ``None``.
    """
    name = str(args.get("name") or "").strip()
    if not name:
        return None, {}, "tool_call requires a 'name' argument"
    if name in BRIDGE_TOOL_NAMES:
        return None, {}, f"tool_call cannot invoke '{name}' (it is itself a bridge tool)"
    raw_args = args.get("arguments")
    if raw_args is None:
        raw_args = {}
    if isinstance(raw_args, str):
        try:
            raw_args = json.loads(raw_args)
        except json.JSONDecodeError as e:
            return None, {}, f"tool_call 'arguments' is not valid JSON: {e}"
    if not isinstance(raw_args, dict):
        return None, {}, "tool_call 'arguments' must be an object"
    if not is_deferrable_tool_name(name):
        return None, {}, (
            f"'{name}' is not a deferrable tool. If it appears in the model-facing tools "
            "list already, call it directly instead of via tool_call."
        )
    return name, raw_args, None
+ ) + return name, raw_args, None + + +__all__ = [ + "TOOL_SEARCH_NAME", + "TOOL_DESCRIBE_NAME", + "TOOL_CALL_NAME", + "BRIDGE_TOOL_NAMES", + "ToolSearchConfig", + "CatalogEntry", + "AssemblyResult", + "load_config", + "is_deferrable_tool_name", + "classify_tools", + "estimate_tokens_from_schemas", + "should_activate", + "build_catalog", + "search_catalog", + "bridge_tool_schemas", + "assemble_tool_defs", + "is_bridge_tool", + "dispatch_tool_search", + "dispatch_tool_describe", + "resolve_underlying_call", +] diff --git a/website/docs/user-guide/features/tool-search.md b/website/docs/user-guide/features/tool-search.md new file mode 100644 index 00000000000..5610a43461f --- /dev/null +++ b/website/docs/user-guide/features/tool-search.md @@ -0,0 +1,152 @@ +--- +title: Tool Search +sidebar_position: 95 +--- + +# Tool Search + +When you have many MCP servers or non-core plugin tools attached to a +session, their JSON schemas can consume a substantial fraction of the +context window on every turn — even when only a few of them are relevant +to what the user actually asked for. + +**Tool Search** is Hermes' opt-in progressive-disclosure layer for that +problem. When activated, MCP and plugin tools are replaced in the +model-visible tools array by three bridge tools, and the model loads each +specific tool's schema on demand. + +:::info Built-in Hermes tools never defer +The tools that make up Hermes' core capability set (`terminal`, +`read_file`, `write_file`, `patch`, `search_files`, `todo`, `memory`, +`browser_*`, `web_search`, `web_extract`, `clarify`, `execute_code`, +`delegate_task`, `session_search`, `send_message`, and the rest of +`_HERMES_CORE_TOOLS`) are *always* loaded directly. Only MCP tools and +non-core plugin tools are eligible for deferral. +::: + +## How it works + +When Tool Search activates for a turn, the model sees three new tools in +place of the deferred ones: + +``` +tool_search(query, limit?) 
— search the deferred-tool catalog +tool_describe(name) — load the full schema for one tool +tool_call(name, arguments) — invoke a deferred tool +``` + +A typical interaction looks like: + +``` +Model: tool_search("create a github issue") + → { matches: [{ name: "mcp_github_create_issue", ... }, ...] } +Model: tool_describe("mcp_github_create_issue") + → { parameters: { type: "object", properties: { ... } } } +Model: tool_call("mcp_github_create_issue", { title: "...", body: "..." }) + → { ok: true, issue_number: 42 } +``` + +When the model invokes `tool_call`, Hermes **unwraps the bridge** and +dispatches the underlying tool exactly as if the model had called it +directly. Pre-tool-call hooks, guardrails, approval prompts, and +post-tool-call hooks all run against the real tool name — not against +`tool_call`. The activity feed in the CLI and gateway also unwraps so you +see the underlying tool, not the bridge. + +## When does it activate? + +By default Tool Search runs in `auto` mode: it activates only when the +deferrable tool schemas would consume at least 10% of the active model's +context window. Below that, the tools-array assembly is a pure +pass-through and you pay no overhead. + +This decision is re-evaluated every time the tools array is built, so: + +- A session with just a few MCP tools and a long context model never + activates Tool Search. +- A session with many MCP servers attached (15+ tools typically) starts + activating it. +- Removing MCP servers mid-session correctly returns to direct exposure + on the next assembly. + +## Configuration + +```yaml +tools: + tool_search: + enabled: auto # auto (default), on, or off + threshold_pct: 10 # percentage of context — only used in auto mode + search_default_limit: 5 + max_search_limit: 20 +``` + +| Key | Default | Meaning | +| --- | --- | --- | +| `enabled` | `auto` | `auto` activates above threshold; `on` always activates if there's at least one deferrable tool; `off` disables entirely. 
| +| `threshold_pct` | `10` | Percentage of context length at which `auto` mode kicks in. Range 0–100. | +| `search_default_limit` | `5` | Hits returned when the model calls `tool_search` without a `limit`. | +| `max_search_limit` | `20` | Hard upper bound the model can request via `limit`. Range 1–50. | + +You can also flip the legacy boolean shape: + +```yaml +tools: + tool_search: true # equivalent to {enabled: auto} +``` + +## When NOT to use it + +Tool Search trades a fixed per-turn token cost (the three bridge tool +schemas, ~300 tokens) and at least one extra round trip (search → +describe → call) for the savings on the deferred schemas. It's a clear +win when you have many tools and use few per turn; it's overhead when +you have few tools total. + +The `auto` default handles this for you. If you set `enabled: on` +unconditionally, expect a slight per-turn cost on small toolsets. + +## Trade-offs that don't go away + +These come from the prompt-cache integrity invariant — they are inherent +to any progressive-disclosure design, not specific to this implementation: + +- **One extra round trip on cold tools.** The first time the model needs + a deferred tool, it spends one or two extra model calls to find and + load the schema. The token savings on the static side are real, but a + portion is paid back at runtime. +- **No cache benefit on deferred schemas.** A loaded `tool_describe` + result enters the conversation history (so it does get cached on + subsequent turns) but it never benefits from the system-prompt cache + prefix. +- **Model-quality dependence.** Tool Search assumes the model can write a + reasonable search query for the tool it wants. Smaller models do this + less well; the published Anthropic numbers (49% → 74% on Opus 4 without + vs. with tool search) show the upside but also that ~26 points of + accuracy is still retrieval failure. 
+- **Toolset edits invalidate cache.** Adding or removing a tool + mid-session changes the bridge tools' descriptions (which include the + count of deferred tools) and the catalog, so the prompt cache is + invalidated. This is the same trade-off as any toolset edit. + +## Implementation details + +- **Retrieval:** BM25 over tokenized tool name + description + parameter + names. Falls back to a literal substring match on the tool name when + BM25 returns no positive-score hits, which covers queries that are + only part of a word (e.g. searching `"git"` against a catalog whose + tools are named `github_*`). +- **Catalog is stateless across turns.** It rebuilds from the current + tool-defs list every assembly — no session-keyed `Map`. This avoids + the class of bug where a stored catalog drifts out of sync with the + live tool registry. +- **No JS sandbox.** Hermes uses the simpler "structured tools" mode + (search / describe / call as plain functions). The JS-sandbox "code + mode" some other implementations offer is a large surface area; we + skip it. + +## See also + +- `tools/tool_search.py` — the implementation +- `tests/tools/test_tool_search.py` — the regression suite +- The `openclaw-tool-search-report` PDF in the original implementation + PR for the research that shaped the design