From 88d523220fddcfb42bd4f29e9ace4ae30ebbf1d9 Mon Sep 17 00:00:00 2001 From: alt-glitch Date: Fri, 19 Jun 2026 22:52:47 +0530 Subject: [PATCH] fix(mcp): address adversarial review round 2 (stale-publish race, parity holes) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second review pass (Codex + Hermes subagent). Codex reproduced a real race with a two-thread harness; both converged on the remaining issues. - Generation-aware publish (fixes a lost-update race): two refresh callers (the late-refresh daemon and the between-turns prologue around turn 1) could each compute a snapshot outside the lock; a SLOWER caller holding an OLDER registry generation could acquire the publish lock after a newer caller and clobber it, deleting just-landed tools. refresh_agent_mcp_tools now captures registry._generation before computing and refuses to publish a stale set; agent._tool_snapshot_generation tracks the published generation. - Context-engine routing names (_context_engine_tool_names) are now staged on a local and published atomically with the snapshot, and only claimed when this rebuild actually appended the schema — matching agent_init's dedup so a registry/plugin tool of the same name keeps its own dispatch. (Previously mutated live, before the publish lock, and on no-change refreshes.) - CLI /reload-mcp: self.enabled_toolsets is resolved once at startup, so a server newly ENABLED in config mid-session wasn't picked up (TUI already re-resolved). Merge now-connected MCP server names into the override (unless the user pinned all/*), mirroring startup, and keep self.enabled_toolsets in sync. Closes the CLI/TUI parity hole. - ACP (acp_adapter/server.py) routed through the shared helper — it was a 5th sibling rebuild that re-injected memory tools but NOT context-engine tools and bypassed the atomic/name-diff path (inert today, fragile). - mcp_startup._resolve_discovery_timeout pulls its default from DEFAULT_CONFIG (single source of truth) instead of a stale hardcoded 5.0 literal. - Tests: stale-generation-no-clobber, _skip_mcp_refresh honored, timeout fallback uses DEFAULT_CONFIG. --- acp_adapter/server.py | 19 +++--- agent/agent_init.py | 8 +++ cli.py | 21 ++++++- hermes_cli/mcp_startup.py | 16 ++--- tests/agent/test_turn_context.py | 16 +++++ tests/tools/test_refresh_agent_mcp_tools.py | 29 ++++++++- tools/mcp_tool.py | 68 ++++++++++++++++----- 7 files changed, 137 insertions(+), 40 deletions(-) diff --git a/acp_adapter/server.py b/acp_adapter/server.py index a51db91d4e8..7b0129bc2ba 100644 --- a/acp_adapter/server.py +++ b/acp_adapter/server.py @@ -823,24 +823,21 @@ class HermesACPAgent(acp.Agent): return try: - from model_tools import get_tool_definitions - from agent.memory_manager import inject_memory_provider_tools + from tools.mcp_tool import refresh_agent_mcp_tools enabled_toolsets = _expand_acp_enabled_toolsets( getattr(state.agent, "enabled_toolsets", None) or ["hermes-acp"], mcp_server_names=[server.name for server in mcp_servers], ) - state.agent.enabled_toolsets = enabled_toolsets - disabled_toolsets = getattr(state.agent, "disabled_toolsets", None) - state.agent.tools = get_tool_definitions( - enabled_toolsets=enabled_toolsets, - disabled_toolsets=disabled_toolsets, + # Route through the shared helper (name-diff, atomic publish, and — + # critically — additive-preserving so memory-provider AND context- + # engine tools survive). enabled_override applies the ACP-expanded + # toolset and stores it on the agent, matching prior behavior. + refresh_agent_mcp_tools( + state.agent, + enabled_override=enabled_toolsets, quiet_mode=True, ) - state.agent.valid_tool_names = { - tool["function"]["name"] for tool in state.agent.tools or [] - } - inject_memory_provider_tools(state.agent) invalidate = getattr(state.agent, "_invalidate_system_prompt", None) if callable(invalidate): invalidate() diff --git a/agent/agent_init.py b/agent/agent_init.py index 7131a93c3b7..90e363c7f2b 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -535,6 +535,14 @@ def init_agent( # Set on internal forks (e.g. background_review) that must keep ``tools[]`` # byte-identical to a parent for provider cache parity. agent._skip_mcp_refresh = False + # Registry generation the current tool snapshot was derived from. Lets a + # late/concurrent refresh reject a stale (older-generation) rebuild instead + # of clobbering a newer one. See tools.mcp_tool.refresh_agent_mcp_tools. + try: + from tools.registry import registry as _registry + agent._tool_snapshot_generation = _registry._generation + except Exception: + agent._tool_snapshot_generation = 0 # Rate limit tracking — updated from x-ratelimit-* response headers # after each API call. Accessed by /usage slash command. agent._rate_limit_state: Optional["RateLimitState"] = None diff --git a/cli.py b/cli.py index eb8b15a93f8..49da337dfd8 100644 --- a/cli.py +++ b/cli.py @@ -9668,13 +9668,28 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin): # memory-provider and context-engine tools survive the rebuild). if self.agent is not None: from tools.mcp_tool import refresh_agent_mcp_tools - # Explicit reload: re-resolve enabled toolsets so a server the - # user just enabled in config this session is picked up. + # Explicit reload: pick up MCP servers the user ENABLED in config + # this session. self.enabled_toolsets was resolved once at + # startup; merge in any now-connected server names (unless the + # user pinned `all`/`*`, which already includes everything) so a + # freshly-added server isn't filtered out. Mirrors startup, where + # MCP server names are part of enabled_toolsets (see __init__). + enabled_override = None + et = self.enabled_toolsets + if et and "all" not in et and "*" not in et: + merged = list(et) + for _name in sorted(connected_servers): + if _name not in merged: + merged.append(_name) + enabled_override = merged refresh_agent_mcp_tools( self.agent, - enabled_override=self.enabled_toolsets, + enabled_override=enabled_override, quiet_mode=True, ) + # Keep the CLI's own list in sync with what the agent now uses. + if enabled_override is not None: + self.enabled_toolsets = enabled_override # Inject a message at the END of conversation history so the # model knows tools changed. Appended after all existing diff --git a/hermes_cli/mcp_startup.py b/hermes_cli/mcp_startup.py index 2a06d6c24d2..410a3c7059c 100644 --- a/hermes_cli/mcp_startup.py +++ b/hermes_cli/mcp_startup.py @@ -54,20 +54,22 @@ def start_background_mcp_discovery(*, logger, thread_name: str) -> None: def _resolve_discovery_timeout(explicit: "float | None") -> float: """Resolve the MCP discovery wait bound: explicit arg > config > default. - Reads ``mcp_discovery_timeout`` from config.yaml. Kept lazy and - fail-safe — a missing/invalid value falls back to the historical 0.75s so - a broken config can never make startup hang or crash. + Reads ``mcp_discovery_timeout`` from config.yaml, defaulting to the value in + ``DEFAULT_CONFIG`` (single source of truth) when the key is absent. Kept lazy + and fail-safe — a missing/invalid value or a broken config falls back to a + short safe bound so startup can never hang or crash. """ if explicit is not None: return explicit try: - from hermes_cli.config import load_config + from hermes_cli.config import load_config, DEFAULT_CONFIG - raw = (load_config() or {}).get("mcp_discovery_timeout", 5.0) + default = float(DEFAULT_CONFIG.get("mcp_discovery_timeout", 1.5)) + raw = (load_config() or {}).get("mcp_discovery_timeout", default) val = float(raw) - return val if val > 0 else 0.75 + return val if val > 0 else default except Exception: - return 0.75 + return 1.5 def wait_for_mcp_discovery(timeout: "float | None" = None) -> None: diff --git a/tests/agent/test_turn_context.py b/tests/agent/test_turn_context.py index c475c4fb145..05bea3d9e51 100644 --- a/tests/agent/test_turn_context.py +++ b/tests/agent/test_turn_context.py @@ -49,6 +49,7 @@ class _FakeAgent: self.valid_tool_names = set() self.enabled_toolsets = None self.disabled_toolsets = None + self._skip_mcp_refresh = False self.compression_enabled = False self.context_compressor = types.SimpleNamespace( protect_first_n=2, protect_last_n=2 @@ -225,6 +226,21 @@ def test_between_turns_refresh_skipped_when_no_servers(): gtd.assert_not_called() +def test_between_turns_refresh_skipped_when_skip_flag_set(): + """Internal forks (background_review) set _skip_mcp_refresh to keep tools[] + byte-identical to the parent for cache parity — the hook must honor it even + when MCP servers are registered.""" + agent = _FakeAgent() + agent._skip_mcp_refresh = True + import model_tools + + with patch("tools.mcp_tool.has_registered_mcp_tools", return_value=True), \ + patch.object(model_tools, "get_tool_definitions") as gtd: + _build(agent) + + gtd.assert_not_called() + + def test_between_turns_refresh_no_churn_when_unchanged(): """R2: an unchanged tool set leaves the snapshot object identity intact (no needless swap → nothing for the next request prefix to diff against).""" diff --git a/tests/tools/test_refresh_agent_mcp_tools.py b/tests/tools/test_refresh_agent_mcp_tools.py index 3a347371c02..da349474a33 100644 --- a/tests/tools/test_refresh_agent_mcp_tools.py +++ b/tests/tools/test_refresh_agent_mcp_tools.py @@ -253,12 +253,35 @@ def test_resolve_discovery_timeout_falls_back_on_bad_value(monkeypatch): from hermes_cli import mcp_startup import hermes_cli.config as cfg - # Non-positive / unparsable → historical safe default, never hang. + # Non-positive / unparsable → DEFAULT_CONFIG value, never hang. + default = float(cfg.DEFAULT_CONFIG.get("mcp_discovery_timeout", 1.5)) monkeypatch.setattr(cfg, "load_config", lambda: {"mcp_discovery_timeout": 0}) - assert mcp_startup._resolve_discovery_timeout(None) == 0.75 + assert mcp_startup._resolve_discovery_timeout(None) == default monkeypatch.setattr(cfg, "load_config", lambda: {"mcp_discovery_timeout": "oops"}) - assert mcp_startup._resolve_discovery_timeout(None) == 0.75 + assert mcp_startup._resolve_discovery_timeout(None) == default + + +def test_stale_generation_refresh_does_not_clobber_newer(monkeypatch): + """A slower refresh that computed an OLDER registry generation must not + overwrite a snapshot a newer-generation refresh already published.""" + from tools import registry as _reg_mod + + agent = _agent(["read_file"]) + # A newer refresh already published generation = current+5, with two tools. + agent._tool_snapshot_generation = _reg_mod.registry._generation + 5 + agent.tools = [_tool("read_file"), _tool("mcp_new_tool")] + agent.valid_tool_names = {"read_file", "mcp_new_tool"} + + import model_tools + # This (stale) refresh computes only the old single-tool set. + monkeypatch.setattr(model_tools, "get_tool_definitions", lambda **kw: [_tool("read_file")]) + + added = mcp_tool.refresh_agent_mcp_tools(agent) + + # Stale write rejected: the newer tool survives. + assert added == set() + assert "mcp_new_tool" in agent.valid_tool_names def test_wait_returns_instantly_when_no_discovery_thread(monkeypatch): diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index ffb9fe0f1ab..4b021e499e3 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -4292,6 +4292,7 @@ def refresh_agent_mcp_tools( at a turn boundary, before that turn's ``tools=`` prefix is assembled). """ from model_tools import get_tool_definitions + from tools.registry import registry # Explicit reloads (/reload-mcp) pass freshly-resolved toolsets so a server # the user just ENABLED in config is picked up; the agent's stored selection @@ -4306,10 +4307,18 @@ def refresh_agent_mcp_tools( enabled = getattr(agent, "enabled_toolsets", None) disabled = getattr(agent, "disabled_toolsets", None) + # Capture the registry generation this rebuild is derived from BEFORE the + # (potentially slow) get_tool_definitions call. Used at publish time to + # reject a stale write: if two callers race (e.g. the late-refresh daemon + # and the between-turns prologue around turn 1), a slower caller that + # computed an OLDER set must not clobber a newer set another caller already + # published. ``registry._generation`` bumps on every (de)register. + snapshot_generation = registry._generation + # Registry-derived tools (built-ins + MCP), filtered to the agent's toolsets. # Computed OUTSIDE the lock (get_tool_definitions can be slow); the diff and # publish below happen together in ONE critical section so two concurrent - # callers can't compute overlapping ``added`` sets or torn-publish. + # callers can't torn-publish or compute overlapping ``added`` sets. new_defs = list( get_tool_definitions( enabled_toolsets=enabled, @@ -4323,40 +4332,63 @@ def refresh_agent_mcp_tools( # Re-append the post-build injected families that get_tool_definitions does # NOT reproduce, so a refresh never strips them (memory-provider + context- # engine tools). Staged entirely on LOCALS — the live ``agent.tools`` / - # ``valid_tool_names`` are never touched until the single atomic publish - # below, so a concurrent reader (``build_api_kwargs``) can't see a partial - # rebuild or a cross-attribute half-swap. - _reinject_post_build_tools(agent, new_defs, new_names) + # ``valid_tool_names`` / ``_context_engine_tool_names`` are never touched + # until the single atomic publish below, so a concurrent reader + # (``build_api_kwargs``) can't see a partial rebuild or a cross-attribute + # half-swap. ``staged_engine_names`` are the context-engine routing names + # this rebuild actually appended (matching agent_init's dedup-aware add). + staged_engine_names = _reinject_post_build_tools(agent, new_defs, new_names) # Single atomic read-diff-publish so the returned ``added`` is consistent - # with what was actually published, even under concurrent callers. + # with what was actually published, even under concurrent callers, and a + # stale (older-generation) rebuild can't overwrite a newer published one. with _agent_tools_lock: + published_gen = getattr(agent, "_tool_snapshot_generation", -1) + if snapshot_generation < published_gen: + # A newer snapshot already won; our set is stale — drop it. + return set() current = { t["function"]["name"] for t in (getattr(agent, "tools", None) or []) } if new_names == current: - return set() # no change → leave the live snapshot untouched (no churn) + # No change → leave the live snapshot untouched (no churn), but + # record the generation so an in-flight older caller can't clobber. + agent._tool_snapshot_generation = max(published_gen, snapshot_generation) + return set() agent.tools = new_defs agent.valid_tool_names = new_names + # Publish context-engine routing names atomically with the snapshot. + engine_names = getattr(agent, "_context_engine_tool_names", None) + if isinstance(engine_names, set): + engine_names.clear() + engine_names.update(staged_engine_names) + agent._tool_snapshot_generation = max(published_gen, snapshot_generation) return new_names - current -def _reinject_post_build_tools(agent, tools_list: list, name_set: set) -> None: +def _reinject_post_build_tools(agent, tools_list: list, name_set: set) -> set: """Append memory-provider and context-engine tools onto staged locals. Mirrors the post-``get_tool_definitions`` injection in ``agent_init`` so a snapshot rebuild reconstructs the FULL tool surface, not just the - registry-derived subset. Operates on the caller's staged ``tools_list`` / - ``name_set`` (NOT the live agent attributes) so the rebuild stays atomic. + registry-derived subset. Operates ONLY on the caller's staged ``tools_list`` + / ``name_set`` (never the live agent attributes) so the rebuild stays atomic. Idempotent (skips names already present) and fail-soft. + + Returns the set of context-engine routing names actually appended by THIS + rebuild — matching ``agent_init``'s dedup behavior (a name already provided + by a registry/plugin tool is NOT claimed for context-engine routing). The + caller publishes this into ``agent._context_engine_tool_names`` atomically + with the snapshot. """ - def _add(schema: dict) -> None: + def _add(schema: dict) -> bool: name = schema.get("name", "") if not name or name in name_set: - return + return False tools_list.append({"type": "function", "function": schema}) name_set.add(name) + return True # Memory-provider tools (mem0/honcho/byterover/supermemory/…). try: @@ -4378,23 +4410,27 @@ def _reinject_post_build_tools(agent, tools_list: list, name_set: set) -> None: # restricted-toolset platform (e.g. platform_toolsets: telegram: []) would # re-leak lcm_* tools the build deliberately excluded, and pay the local- # model latency penalty. + staged_engine_names: set = set() try: enabled = getattr(agent, "enabled_toolsets", None) context_engine_allowed = enabled is None or "context_engine" in enabled compressor = getattr(agent, "context_compressor", None) get_schemas = getattr(compressor, "get_tool_schemas", None) if compressor else None if context_engine_allowed and callable(get_schemas): - engine_names = getattr(agent, "_context_engine_tool_names", None) for schema in get_schemas(): if not isinstance(schema, dict): continue name = schema.get("name", "") - _add(schema) - if name and isinstance(engine_names, set): - engine_names.add(name) + # Only claim the routing name when WE appended the schema, so a + # name already owned by a registry/plugin tool keeps its own + # dispatch (matches agent_init.py's `continue`-before-claim). + if _add(schema) and name: + staged_engine_names.add(name) except Exception: logger.debug("Context-engine tool re-injection skipped", exc_info=True) + return staged_engine_names + def shutdown_mcp_servers(): """Close all MCP server connections and stop the background loop.