diff --git a/hermes_cli/mcp_config.py b/hermes_cli/mcp_config.py index 17b6c0329e5..575d5f9b5ee 100644 --- a/hermes_cli/mcp_config.py +++ b/hermes_cli/mcp_config.py @@ -212,7 +212,7 @@ def _probe_single_server( _ensure_mcp_loop, _run_on_mcp_loop, _connect_server, - _stop_mcp_loop, + _stop_mcp_loop_if_idle, ) config = _resolve_mcp_server_config(config) @@ -240,7 +240,7 @@ def _probe_single_server( except BaseException as exc: raise _unwrap_exception_group(exc) from None finally: - _stop_mcp_loop() + _stop_mcp_loop_if_idle() return tools_found diff --git a/tests/tools/test_mcp_probe.py b/tests/tools/test_mcp_probe.py index 89d4d1478d1..92656a441b3 100644 --- a/tests/tools/test_mcp_probe.py +++ b/tests/tools/test_mcp_probe.py @@ -162,14 +162,14 @@ class TestProbeMcpServerTools: assert result["github"][0] == ("my_tool", "") def test_cleanup_called_even_on_failure(self): - """_stop_mcp_loop is called even when probe fails.""" + """Probe cleanup is attempted even when probe fails.""" config = {"github": {"command": "npx", "connect_timeout": 5}} with patch("tools.mcp_tool._MCP_AVAILABLE", True), \ patch("tools.mcp_tool._load_mcp_config", return_value=config), \ patch("tools.mcp_tool._ensure_mcp_loop"), \ patch("tools.mcp_tool._run_on_mcp_loop", side_effect=RuntimeError("boom")), \ - patch("tools.mcp_tool._stop_mcp_loop") as mock_stop: + patch("tools.mcp_tool._stop_mcp_loop_if_idle") as mock_stop: from tools.mcp_tool import probe_mcp_server_tools result = probe_mcp_server_tools() diff --git a/tests/tools/test_mcp_stability.py b/tests/tools/test_mcp_stability.py index 3897a65a866..feb0d7a5aff 100644 --- a/tests/tools/test_mcp_stability.py +++ b/tests/tools/test_mcp_stability.py @@ -57,6 +57,32 @@ class TestMCPLoopExceptionHandler: finally: mcp_mod._stop_mcp_loop() + def test_probe_cleanup_does_not_stop_loop_with_registered_servers(self): + """Probe cleanup must not kill the shared loop used by live MCP tools.""" + import tools.mcp_tool as mcp_mod + + with mcp_mod._lock: + mcp_mod._servers.clear() + mcp_mod._server_connecting.clear() + try: + mcp_mod._ensure_mcp_loop() + with mcp_mod._lock: + loop = mcp_mod._mcp_loop + mcp_mod._servers["live"] = MagicMock(session=object()) + + assert mcp_mod._stop_mcp_loop_if_idle() is False + + with mcp_mod._lock: + assert mcp_mod._mcp_loop is loop + assert mcp_mod._mcp_thread is not None + assert loop is not None + assert loop.is_running() + finally: + with mcp_mod._lock: + mcp_mod._servers.clear() + mcp_mod._server_connecting.clear() + mcp_mod._stop_mcp_loop() + # --------------------------------------------------------------------------- # Fix 2: stdio PID tracking diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 1317b14dc35..3b0224b59b4 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -3946,7 +3946,7 @@ def probe_mcp_server_tools() -> Dict[str, List[tuple]]: except Exception as exc: logger.debug("MCP probe failed: %s", exc) finally: - _stop_mcp_loop() + _stop_mcp_loop_if_idle() return result @@ -4084,10 +4084,25 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None: ) -def _stop_mcp_loop(): +def _stop_mcp_loop_if_idle() -> bool: + """Stop the MCP loop only when no registered server still owns it. + + Probe paths create temporary MCPServerTask instances that are not placed in + ``_servers``. They should clean up an otherwise-idle loop, but must not + tear down the process-global loop when live agent tools are registered on + it. Otherwise a dashboard/CLI probe can make later MCP tool calls fail + with ``MCP event loop is not running``. + """ + return _stop_mcp_loop(only_if_idle=True) + + +def _stop_mcp_loop(*, only_if_idle: bool = False) -> bool: """Stop the background event loop and join its thread.""" global _mcp_loop, _mcp_thread with _lock: + if only_if_idle and (_servers or _server_connecting): + logger.debug("Leaving MCP event loop running; active servers are registered or connecting") + return False loop = _mcp_loop thread = _mcp_thread _mcp_loop = None @@ -4104,3 +4119,4 @@ def _stop_mcp_loop(): # graceful shutdown are now orphaned — include active PIDs too # since the loop is gone and no session can still be in flight. _kill_orphaned_mcp_children(include_active=True) + return True