fix(mcp): stability fix pack — reload timeout, shutdown cleanup, event loop handler, OAuth non-blocking (#4757)

Four fixes for MCP server stability issues reported by community member
(terminal lockup, zombie processes, escape sequence pollution, startup hang):

1. MCP reload timeout guard (cli.py): _check_config_mcp_changes now runs
   _reload_mcp in a separate daemon thread with a 30s hard timeout. Previously,
   a hung MCP server could block the process_loop thread indefinitely, freezing
   the entire TUI (user can type but nothing happens, only Ctrl+D/Ctrl+\ work).

2. MCP stdio subprocess PID tracking (mcp_tool.py): Tracks child PIDs spawned
   by stdio_client via before/after snapshots of /proc children. On shutdown,
   _stop_mcp_loop force-kills any tracked PIDs that survived the SDK's graceful
   SIGTERM→SIGKILL cleanup. Prevents zombie MCP server processes from
   accumulating across sessions.

3. MCP event loop exception handler (mcp_tool.py): Installs
   _mcp_loop_exception_handler on the MCP background event loop — same pattern
   as the existing _suppress_closed_loop_errors on prompt_toolkit's loop.
   Suppresses benign 'Event loop is closed' RuntimeError from httpx transport
   __del__ during MCP shutdown. Salvaged from PR #2538 (acsezen).

4. MCP OAuth non-blocking (mcp_oauth.py): Replaces blocking input() call in
   _wait_for_callback with OAuthNonInteractiveError raise. Adds _is_interactive()
   TTY detection. In non-interactive environments, build_oauth_auth() still
   returns a provider (cached tokens + refresh work), but the callback handler
   raises immediately instead of blocking the MCP event loop for 120s. Re-raises
   OAuth setup failures in _run_http so failed servers are reported cleanly
   without blocking others. Salvaged from PRs #4521 (voidborne-d) and #4465
   (heathley).

Closes #2537, closes #4462
Related: #4128, #3436
This commit is contained in:
Teknium 2026-04-03 02:29:20 -07:00 committed by GitHub
parent f374ae4c61
commit cc54818d26
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 431 additions and 12 deletions

View file

@ -9,10 +9,13 @@ import pytest
from tools.mcp_oauth import (
HermesTokenStorage,
OAuthNonInteractiveError,
build_oauth_auth,
remove_oauth_tokens,
_find_free_port,
_can_open_browser,
_is_interactive,
_wait_for_callback,
)
@ -236,3 +239,99 @@ class TestRemoveOAuthTokens:
def test_no_error_when_files_missing(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
remove_oauth_tokens("nonexistent") # should not raise
# ---------------------------------------------------------------------------
# Non-interactive / startup-safety tests (issue #4462)
# ---------------------------------------------------------------------------
class TestIsInteractive:
"""_is_interactive() detects headless/daemon/container environments."""
def test_false_when_stdin_not_tty(self, monkeypatch):
mock_stdin = MagicMock()
mock_stdin.isatty.return_value = False
monkeypatch.setattr("tools.mcp_oauth.sys.stdin", mock_stdin)
assert _is_interactive() is False
def test_true_when_stdin_is_tty(self, monkeypatch):
mock_stdin = MagicMock()
mock_stdin.isatty.return_value = True
monkeypatch.setattr("tools.mcp_oauth.sys.stdin", mock_stdin)
assert _is_interactive() is True
def test_false_when_stdin_has_no_isatty(self, monkeypatch):
"""Some environments replace stdin with an object without isatty()."""
mock_stdin = object() # no isatty attribute
monkeypatch.setattr("tools.mcp_oauth.sys.stdin", mock_stdin)
assert _is_interactive() is False
class TestWaitForCallbackNoBlocking:
"""_wait_for_callback() must never call input() — it raises instead."""
def test_raises_on_timeout_instead_of_input(self):
"""When no auth code arrives, raises OAuthNonInteractiveError."""
import tools.mcp_oauth as mod
import asyncio
mod._oauth_port = _find_free_port()
async def instant_sleep(_seconds):
pass
with patch.object(mod.asyncio, "sleep", instant_sleep):
with patch("builtins.input", side_effect=AssertionError("input() must not be called")):
with pytest.raises(OAuthNonInteractiveError, match="callback timed out"):
asyncio.run(_wait_for_callback())
class TestBuildOAuthAuthNonInteractive:
"""build_oauth_auth() in non-interactive mode."""
def test_noninteractive_without_cached_tokens_warns(self, tmp_path, monkeypatch, caplog):
"""Without cached tokens, non-interactive mode logs a clear warning."""
try:
from mcp.client.auth import OAuthClientProvider
except ImportError:
pytest.skip("MCP SDK auth not available")
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
mock_stdin = MagicMock()
mock_stdin.isatty.return_value = False
monkeypatch.setattr("tools.mcp_oauth.sys.stdin", mock_stdin)
import logging
with caplog.at_level(logging.WARNING, logger="tools.mcp_oauth"):
auth = build_oauth_auth("atlassian", "https://mcp.atlassian.com/v1/mcp")
assert auth is not None
assert "no cached tokens found" in caplog.text.lower()
assert "non-interactive" in caplog.text.lower()
def test_noninteractive_with_cached_tokens_no_warning(self, tmp_path, monkeypatch, caplog):
"""With cached tokens, non-interactive mode logs no 'no cached tokens' warning."""
try:
from mcp.client.auth import OAuthClientProvider
except ImportError:
pytest.skip("MCP SDK auth not available")
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
mock_stdin = MagicMock()
mock_stdin.isatty.return_value = False
monkeypatch.setattr("tools.mcp_oauth.sys.stdin", mock_stdin)
# Pre-populate cached tokens
d = tmp_path / "mcp-tokens"
d.mkdir(parents=True)
(d / "atlassian.json").write_text(json.dumps({
"access_token": "cached",
"token_type": "Bearer",
}))
import logging
with caplog.at_level(logging.WARNING, logger="tools.mcp_oauth"):
auth = build_oauth_auth("atlassian", "https://mcp.atlassian.com/v1/mcp")
assert auth is not None
assert "no cached tokens found" not in caplog.text.lower()