fix(mcp): preserve loop during probes
Some checks failed
Deploy Site / deploy-vercel (push) Waiting to run
Deploy Site / deploy-docs (push) Waiting to run
Docker Build and Publish / build-amd64 (push) Waiting to run
Docker Build and Publish / build-arm64 (push) Waiting to run
Docker Build and Publish / merge (push) Blocked by required conditions
Lint (ruff + ty) / ruff + ty diff (push) Waiting to run
Lint (ruff + ty) / ruff enforcement (blocking) (push) Waiting to run
Lint (ruff + ty) / Windows footguns (blocking) (push) Waiting to run
Nix / nix (macos-latest) (push) Waiting to run
Nix / nix (ubuntu-latest) (push) Waiting to run
OSV-Scanner / Scan lockfiles (push) Waiting to run
Tests / test (1) (push) Waiting to run
Tests / test (2) (push) Waiting to run
Tests / test (3) (push) Waiting to run
Tests / test (4) (push) Waiting to run
Tests / test (5) (push) Waiting to run
Tests / test (6) (push) Waiting to run
Tests / save-durations (push) Blocked by required conditions
Tests / e2e (push) Waiting to run
Typecheck / typecheck (apps/bootstrap-installer) (push) Waiting to run
Typecheck / typecheck (apps/desktop) (push) Waiting to run
Typecheck / typecheck (apps/shared) (push) Waiting to run
Typecheck / typecheck (ui-tui) (push) Waiting to run
Typecheck / typecheck (web) (push) Waiting to run
uv.lock check / uv lock --check (push) Waiting to run
Nix Lockfile Fix / auto-fix-main (push) Has been cancelled
Nix Lockfile Fix / fix (push) Has been cancelled
Build Skills Index / build-index (push) Has been cancelled
Build Skills Index / trigger-deploy (push) Has been cancelled

This commit is contained in:
helix4u 2026-06-13 16:45:45 -06:00 committed by Teknium
parent 85e6232a07
commit 4936a49a0c
4 changed files with 48 additions and 6 deletions

View file

@ -212,7 +212,7 @@ def _probe_single_server(
_ensure_mcp_loop,
_run_on_mcp_loop,
_connect_server,
_stop_mcp_loop,
_stop_mcp_loop_if_idle,
)
config = _resolve_mcp_server_config(config)
@ -240,7 +240,7 @@ def _probe_single_server(
except BaseException as exc:
raise _unwrap_exception_group(exc) from None
finally:
_stop_mcp_loop()
_stop_mcp_loop_if_idle()
return tools_found

View file

@ -162,14 +162,14 @@ class TestProbeMcpServerTools:
assert result["github"][0] == ("my_tool", "")
def test_cleanup_called_even_on_failure(self):
"""_stop_mcp_loop is called even when probe fails."""
"""Probe cleanup is attempted even when probe fails."""
config = {"github": {"command": "npx", "connect_timeout": 5}}
with patch("tools.mcp_tool._MCP_AVAILABLE", True), \
patch("tools.mcp_tool._load_mcp_config", return_value=config), \
patch("tools.mcp_tool._ensure_mcp_loop"), \
patch("tools.mcp_tool._run_on_mcp_loop", side_effect=RuntimeError("boom")), \
patch("tools.mcp_tool._stop_mcp_loop") as mock_stop:
patch("tools.mcp_tool._stop_mcp_loop_if_idle") as mock_stop:
from tools.mcp_tool import probe_mcp_server_tools
result = probe_mcp_server_tools()

View file

@ -57,6 +57,32 @@ class TestMCPLoopExceptionHandler:
finally:
mcp_mod._stop_mcp_loop()
def test_probe_cleanup_does_not_stop_loop_with_registered_servers(self):
    """An idle-only stop must leave the shared loop alive while a server is registered."""
    import tools.mcp_tool as mcp_mod

    def _reset_registries():
        # Empty both registries under the module lock so the idle check
        # starts (and the test ends) with nothing registered or connecting.
        with mcp_mod._lock:
            mcp_mod._servers.clear()
            mcp_mod._server_connecting.clear()

    _reset_registries()
    try:
        mcp_mod._ensure_mcp_loop()
        with mcp_mod._lock:
            shared_loop = mcp_mod._mcp_loop
            # Stand-in for a live, registered server that owns the loop.
            mcp_mod._servers["live"] = MagicMock(session=object())

        # Called outside the lock: _stop_mcp_loop acquires _lock itself.
        stopped = mcp_mod._stop_mcp_loop_if_idle()
        assert stopped is False

        with mcp_mod._lock:
            assert mcp_mod._mcp_loop is shared_loop
            assert mcp_mod._mcp_thread is not None
        assert shared_loop is not None
        assert shared_loop.is_running()
    finally:
        _reset_registries()
        # Unconditional stop so the loop does not leak into other tests.
        mcp_mod._stop_mcp_loop()
# ---------------------------------------------------------------------------
# Fix 2: stdio PID tracking

View file

@ -3946,7 +3946,7 @@ def probe_mcp_server_tools() -> Dict[str, List[tuple]]:
except Exception as exc:
logger.debug("MCP probe failed: %s", exc)
finally:
_stop_mcp_loop()
_stop_mcp_loop_if_idle()
return result
@ -4084,10 +4084,25 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
)
def _stop_mcp_loop():
def _stop_mcp_loop_if_idle() -> bool:
    """Stop the MCP event loop unless a server is registered or connecting.

    Probe code paths build temporary MCPServerTask instances that never get
    added to ``_servers``. Those paths should still clean up a loop nobody
    else is using, but they must leave the process-global loop alone when
    live agent tools are registered on it; otherwise a dashboard/CLI probe
    can cause later MCP tool calls to fail with
    ``MCP event loop is not running``.

    Returns:
        ``False`` when the loop was left running because ``_servers`` or
        ``_server_connecting`` is non-empty; otherwise the result of
        ``_stop_mcp_loop``.
    """
    stopped = _stop_mcp_loop(only_if_idle=True)
    return stopped
def _stop_mcp_loop(*, only_if_idle: bool = False) -> bool:
"""Stop the background event loop and join its thread."""
global _mcp_loop, _mcp_thread
with _lock:
if only_if_idle and (_servers or _server_connecting):
logger.debug("Leaving MCP event loop running; active servers are registered or connecting")
return False
loop = _mcp_loop
thread = _mcp_thread
_mcp_loop = None
@ -4104,3 +4119,4 @@ def _stop_mcp_loop():
# graceful shutdown are now orphaned — include active PIDs too
# since the loop is gone and no session can still be in flight.
_kill_orphaned_mcp_children(include_active=True)
return True