hermes-agent/hermes_cli/mcp_startup.py
alt-glitch 88d523220f fix(mcp): address adversarial review round 2 (stale-publish race, parity holes)
Second review pass (Codex + Hermes subagent). Codex reproduced a real race with
a two-thread harness; both converged on the remaining issues.

- Generation-aware publish (fixes a lost-update race): two refresh callers (the
  late-refresh daemon and the between-turns prologue around turn 1) could each
  compute a snapshot outside the lock; a SLOWER caller holding an OLDER registry
  generation could acquire the publish lock after a newer caller and clobber it,
  deleting just-landed tools. refresh_agent_mcp_tools now captures
  registry._generation before computing and refuses to publish a stale set;
  agent._tool_snapshot_generation tracks the published generation.
- Context-engine routing names (_context_engine_tool_names) are now staged on a
  local and published atomically with the snapshot, and only claimed when this
  rebuild actually appended the schema — matching agent_init's dedup so a
  registry/plugin tool of the same name keeps its own dispatch. (Previously
  mutated live, before the publish lock, and on no-change refreshes.)
- CLI /reload-mcp: self.enabled_toolsets is resolved once at startup, so a
  server newly ENABLED in config mid-session wasn't picked up (TUI already
  re-resolved). Merge now-connected MCP server names into the override (unless
  the user pinned all/*), mirroring startup, and keep self.enabled_toolsets in
  sync. Closes the CLI/TUI parity hole.
- ACP (acp_adapter/server.py) routed through the shared helper — it was a 5th
  sibling rebuild that re-injected memory tools but NOT context-engine tools and
  bypassed the atomic/name-diff path (inert today, fragile).
- mcp_startup._resolve_discovery_timeout pulls its default from DEFAULT_CONFIG
  (single source of truth) instead of a stale hardcoded 5.0 literal.
- Tests: stale-generation-no-clobber, _skip_mcp_refresh honored, timeout
  fallback uses DEFAULT_CONFIG.
2026-06-19 11:57:43 -07:00

88 lines
3.1 KiB
Python

"""Shared CLI/TUI-safe helpers for background MCP discovery."""
from __future__ import annotations
import threading
from typing import Optional
_mcp_discovery_lock = threading.Lock()
_mcp_discovery_started = False
_mcp_discovery_thread: Optional[threading.Thread] = None
def _has_configured_mcp_servers() -> bool:
"""Cheap config probe so non-MCP users avoid importing the MCP stack."""
try:
from hermes_cli.config import read_raw_config
mcp_servers = (read_raw_config() or {}).get("mcp_servers")
return isinstance(mcp_servers, dict) and len(mcp_servers) > 0
except Exception:
# Be conservative: if config probing fails, try discovery in the
# background so startup still can't block.
return True
def start_background_mcp_discovery(*, logger, thread_name: str) -> None:
"""Spawn one shared background MCP discovery thread for this process."""
global _mcp_discovery_started, _mcp_discovery_thread
with _mcp_discovery_lock:
if _mcp_discovery_started:
return
_mcp_discovery_started = True
if not _has_configured_mcp_servers():
return
def _discover() -> None:
try:
from tools.mcp_tool import discover_mcp_tools
discover_mcp_tools()
except Exception:
logger.debug("Background MCP tool discovery failed", exc_info=True)
thread = threading.Thread(
target=_discover,
name=thread_name,
daemon=True,
)
_mcp_discovery_thread = thread
thread.start()
def _resolve_discovery_timeout(explicit: "float | None") -> float:
"""Resolve the MCP discovery wait bound: explicit arg > config > default.
Reads ``mcp_discovery_timeout`` from config.yaml, defaulting to the value in
``DEFAULT_CONFIG`` (single source of truth) when the key is absent. Kept lazy
and fail-safe — a missing/invalid value or a broken config falls back to a
short safe bound so startup can never hang or crash.
"""
if explicit is not None:
return explicit
try:
from hermes_cli.config import load_config, DEFAULT_CONFIG
default = float(DEFAULT_CONFIG.get("mcp_discovery_timeout", 1.5))
raw = (load_config() or {}).get("mcp_discovery_timeout", default)
val = float(raw)
return val if val > 0 else default
except Exception:
return 1.5
def wait_for_mcp_discovery(timeout: "float | None" = None) -> None:
"""Wait for background MCP discovery before the first tool snapshot.
``thread.join(timeout)`` returns the INSTANT discovery completes, so this
only ever blocks for the real connect time of a still-pending server —
users with no MCP servers or fast servers pay ~0s. The bound (from
``mcp_discovery_timeout`` in config) just caps the wait so a dead server
can't freeze startup; servers that miss it are picked up by the automatic
late-binding refresh.
"""
thread = _mcp_discovery_thread
if thread is None or not thread.is_alive():
return
thread.join(timeout=_resolve_discovery_timeout(timeout))