mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
PR #21561 migrated liveness probes across 14 call sites from
`os.kill(pid, 0)` to `gateway.status._pid_exists` (psutil-first) so
the gateway doesn't Ctrl+C itself on Windows (see bpo-14484). A handful of
tests still patched the old `os.kill` seam and either happened to pass
on POSIX (when PID 12345 incidentally wasn't alive on the CI worker) or
failed outright — on CI runs they surfaced as 7 flaky/stable failures.
Migrate each affected test to patch the correct seam:
- tests/tools/test_browser_orphan_reaper.py (5 tests)
Patch `gateway.status._pid_exists` instead of `os.kill`.
Rename test_permission_error_on_kill_check_skips to
test_alive_legacy_daemon_is_reaped — the old assertion was
"PermissionError on sig 0 → skip dir"; post-migration the
untracked-alive-daemon path always reaps the dir after SIGTERM
(best-effort semantics were preserved).
- tests/tools/test_windows_native_support.py (4 tests)
Replace tests that asserted `os.kill` seam behavior with tests
that exercise `ProcessRegistry._is_host_pid_alive` as a
delegator and split out a new TestPidExistsOSErrorWidening class
that hits `gateway.status._pid_exists` directly via the POSIX
fallback branch (so Windows-style `OSError(WinError 87)` + `PermissionError`
widening is still covered on Linux CI).
- tests/tools/test_process_registry.py (1 test)
Mock `psutil.Process` + `_pid_exists` instead of `os.kill`
for the detached-session kill path.
- tests/tools/test_mcp_stability.py::test_kill_orphaned_uses_sigkill_when_available
SIGTERM → alive-check → SIGKILL flow now uses `_pid_exists`
for the middle step; assertion count drops from 3 to 2.
- tests/gateway/test_status.py::TestScopedLocks (2 tests)
`acquire_scoped_lock` consults `_pid_exists`; patch that
seam directly instead of trying to control the nested psutil
call via os.kill monkeypatch.
- tests/hermes_cli/test_gateway.py::test_stop_profile_gateway_keeps_pid_file_when_process_still_running
The stop loop sends one SIGTERM via os.kill then polls 20x via
_pid_exists; instrument both separately. Old assertion
`calls["kill"] == 21` split into `kill == 1` + `alive_probes == 20`.
- tests/hermes_cli/test_auth_toctou_file_modes.py::test_shared_nous_store_writes_0o600_with_0o700_parent
Commit c34884ea2 switched the pytest seat-belt guard in
`_nous_shared_store_path()` from `Path.home() / ".hermes"`
to `get_default_hermes_root()`, which honors HERMES_HOME. The
test sets both HERMES_HOME and HERMES_SHARED_AUTH_DIR to
subpaths of the same tmp_path, and the override now collapses
onto the same path the guard is refusing. Renamed the override
subdirectory so the two paths diverge — guard passes, test runs.
All 21 original CI failures and their local-flaky siblings now pass
(278 tests across the touched files, 0 failures).
322 lines
12 KiB
Python
322 lines
12 KiB
Python
"""Tests for MCP stability fixes — event loop handler, PID tracking, shutdown robustness."""
|
|
|
|
import asyncio
|
|
import os
|
|
import signal
|
|
import threading
|
|
from unittest.mock import patch, MagicMock
|
|
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix 1: MCP event loop exception handler
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestMCPLoopExceptionHandler:
    """_mcp_loop_exception_handler suppresses benign 'Event loop is closed'."""

    @staticmethod
    def _run_handler(context):
        # Drive the handler against a mock loop and hand the loop back so
        # each test can check whether the default handler was delegated to.
        from tools.mcp_tool import _mcp_loop_exception_handler

        mock_loop = MagicMock()
        _mcp_loop_exception_handler(mock_loop, context)
        return mock_loop

    def test_suppresses_event_loop_closed(self):
        # The benign shutdown-race error must be swallowed, not forwarded.
        mock_loop = self._run_handler(
            {"exception": RuntimeError("Event loop is closed")}
        )
        mock_loop.default_exception_handler.assert_not_called()

    def test_forwards_other_runtime_errors(self):
        # Any RuntimeError other than the closed-loop one is forwarded.
        context = {"exception": RuntimeError("some other error")}
        mock_loop = self._run_handler(context)
        mock_loop.default_exception_handler.assert_called_once_with(context)

    def test_forwards_non_runtime_errors(self):
        # Non-RuntimeError exceptions always reach the default handler.
        context = {"exception": ValueError("bad value")}
        mock_loop = self._run_handler(context)
        mock_loop.default_exception_handler.assert_called_once_with(context)

    def test_forwards_contexts_without_exception(self):
        # Message-only contexts (no "exception" key) are forwarded too.
        context = {"message": "just a message"}
        mock_loop = self._run_handler(context)
        mock_loop.default_exception_handler.assert_called_once_with(context)

    def test_handler_installed_on_mcp_loop(self):
        """_ensure_mcp_loop installs the exception handler on the new loop."""
        import tools.mcp_tool as mcp_mod

        try:
            mcp_mod._ensure_mcp_loop()
            with mcp_mod._lock:
                installed = mcp_mod._mcp_loop
                assert installed is not None
                assert (
                    installed.get_exception_handler()
                    is mcp_mod._mcp_loop_exception_handler
                )
        finally:
            # Always tear the background loop down so state doesn't leak
            # into the other tests in this module.
            mcp_mod._stop_mcp_loop()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix 2: stdio PID tracking
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestStdioPidTracking:
    """_snapshot_child_pids and _stdio_pids track subprocess PIDs.

    These tests mutate the module-level ``_stdio_pids`` / ``_orphan_stdio_pids``
    registries (always under ``_lock``); each test clears the state it relies
    on rather than assuming a pristine module.
    """

    def test_snapshot_returns_set(self):
        # _snapshot_child_pids must yield a set of integer PIDs.
        from tools.mcp_tool import _snapshot_child_pids
        result = _snapshot_child_pids()
        assert isinstance(result, set)
        # All elements should be ints
        for pid in result:
            assert isinstance(pid, int)

    def test_stdio_pids_starts_empty(self):
        # Only the container type is checked here — contents can legitimately
        # carry over from other tests in the session.
        from tools.mcp_tool import _stdio_pids, _lock
        with _lock:
            # Might have residual state from other tests, just check type
            assert isinstance(_stdio_pids, dict)

    def test_kill_orphaned_noop_when_empty(self):
        """_kill_orphaned_mcp_children does nothing when no PIDs tracked."""
        from tools.mcp_tool import (
            _kill_orphaned_mcp_children,
            _orphan_stdio_pids,
            _stdio_pids,
            _lock,
        )

        # Start from known-empty registries so the call is a true no-op.
        with _lock:
            _stdio_pids.clear()
            _orphan_stdio_pids.clear()

        # Should not raise
        _kill_orphaned_mcp_children()

    def test_kill_orphaned_handles_dead_pids(self):
        """_kill_orphaned_mcp_children gracefully handles already-dead PIDs."""
        from tools.mcp_tool import (
            _kill_orphaned_mcp_children,
            _orphan_stdio_pids,
            _lock,
        )

        # Use a PID that definitely doesn't exist — 999999999 exceeds
        # typical kernel pid limits, so the kill path takes the
        # process-already-gone branch.
        fake_pid = 999999999
        with _lock:
            _orphan_stdio_pids.add(fake_pid)

        # Should not raise (ProcessLookupError is caught)
        _kill_orphaned_mcp_children()

        # The dead PID must also be dropped from the orphan registry.
        with _lock:
            assert fake_pid not in _orphan_stdio_pids

    def test_kill_orphaned_uses_sigkill_when_available(self, monkeypatch):
        """SIGTERM-first then SIGKILL after 2s for orphan cleanup."""
        from tools.mcp_tool import (
            _kill_orphaned_mcp_children,
            _orphan_stdio_pids,
            _lock,
        )

        fake_pid = 424242
        with _lock:
            _orphan_stdio_pids.clear()
            _orphan_stdio_pids.add(fake_pid)

        # Guarantee a SIGKILL attribute exists even on platforms without one
        # (raising=False tolerates it already being absent).
        fake_sigkill = 9
        monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False)

        # Post-#21561 the alive check routes through
        # ``gateway.status._pid_exists`` (so it's safe on Windows — see
        # bpo-14484). Return True so the SIGKILL escalation fires.
        with patch("tools.mcp_tool.os.kill") as mock_kill, \
                patch("gateway.status._pid_exists", return_value=True), \
                patch("time.sleep") as mock_sleep:
            _kill_orphaned_mcp_children()

        # SIGTERM then SIGKILL; the alive check no longer touches os.kill.
        mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
        mock_kill.assert_any_call(fake_pid, fake_sigkill)
        assert mock_kill.call_count == 2
        mock_sleep.assert_called_once_with(2)

        # Regardless of outcome, the orphan registry must be drained.
        with _lock:
            assert fake_pid not in _orphan_stdio_pids

    def test_kill_orphaned_falls_back_without_sigkill(self, monkeypatch):
        """Without SIGKILL, SIGTERM is used for both phases."""
        from tools.mcp_tool import (
            _kill_orphaned_mcp_children,
            _orphan_stdio_pids,
            _lock,
        )

        fake_pid = 434343
        with _lock:
            _orphan_stdio_pids.clear()
            _orphan_stdio_pids.add(fake_pid)

        # Simulate a platform (e.g. Windows) where signal.SIGKILL is absent.
        monkeypatch.delattr(signal, "SIGKILL", raising=False)

        with patch("tools.mcp_tool.os.kill") as mock_kill, \
                patch("time.sleep") as mock_sleep:
            _kill_orphaned_mcp_children()

        # SIGTERM phase ran; the alive probe is NOT patched here, so the
        # real ``_pid_exists`` should report the fake PID dead and no
        # escalation signal is sent.
        # NOTE(review): assumes pid 434343 is not alive on the test host;
        # the assertions below (assert_any_call / .called) tolerate either
        # outcome, but the intent is the no-escalation path.
        mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
        assert mock_sleep.called

        with _lock:
            assert fake_pid not in _orphan_stdio_pids
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix 3: MCP reload timeout (cli.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestMCPReloadTimeout:
    """_check_config_mcp_changes uses a timeout on _reload_mcp."""

    def test_reload_timeout_does_not_block_forever(self):
        """If _reload_mcp hangs, the config watcher times out and returns.

        Actually exercising a hung ``_reload_mcp`` would stall the suite for
        the full timeout, so this is a structural check instead: the watcher
        must dispatch the reload on a worker thread (joined with a timeout)
        rather than calling it inline.

        Note: an earlier version of this test defined an unused ``FakeCLI``
        stub (with a 60s-sleeping ``_reload_mcp``) and took unused
        ``tmp_path``/``monkeypatch`` fixtures; all of that dead code has been
        removed — only the source inspection below ever ran.
        """
        import inspect

        from cli import HermesCLI

        source = inspect.getsource(HermesCLI._check_config_mcp_changes)
        # The fix adds threading.Thread for _reload_mcp
        assert "Thread" in source or "thread" in source.lower(), \
            "_check_config_mcp_changes should use a thread for _reload_mcp"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fix 4: MCP initial connection retry with backoff
|
|
# (Ported from Kilo Code's MCP resilience fix)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestMCPInitialConnectionRetry:
    """MCPServerTask.run() retries initial connection failures instead of giving up.

    Each async scenario runs under ``asyncio.run`` — the previous
    ``asyncio.get_event_loop().run_until_complete(...)`` pattern is
    deprecated since Python 3.10 when no loop is running and emits a
    DeprecationWarning on 3.12+.
    """

    def test_initial_connect_retries_constant_exists(self):
        """_MAX_INITIAL_CONNECT_RETRIES should be defined."""
        from tools.mcp_tool import _MAX_INITIAL_CONNECT_RETRIES
        assert _MAX_INITIAL_CONNECT_RETRIES >= 1

    def test_initial_connect_retry_succeeds_on_second_attempt(self):
        """Server succeeds after one transient initial failure."""
        from tools.mcp_tool import MCPServerTask

        async def _run():
            server = MCPServerTask("test-retry")
            call_count = 0

            async def fake_run_stdio(self_inner, config):
                nonlocal call_count
                call_count += 1
                if call_count == 1:
                    raise ConnectionError("DNS resolution failed")
                # Second attempt: success — set ready and "run" until shutdown
                self_inner._ready.set()
                await self_inner._shutdown_event.wait()

            with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio):
                task = asyncio.create_task(server.run({"command": "fake"}))
                await server._ready.wait()

                # It should have succeeded (no error) after retrying
                assert server._error is None, f"Expected no error, got: {server._error}"
                assert call_count == 2, f"Expected 2 attempts, got {call_count}"

                # Clean shutdown
                server._shutdown_event.set()
                await task

        asyncio.run(_run())

    def test_initial_connect_gives_up_after_max_retries(self):
        """Server gives up after _MAX_INITIAL_CONNECT_RETRIES failures."""
        from tools.mcp_tool import MCPServerTask, _MAX_INITIAL_CONNECT_RETRIES

        async def _run():
            server = MCPServerTask("test-exhaust")
            call_count = 0

            async def fake_run_stdio(self_inner, config):
                nonlocal call_count
                call_count += 1
                raise ConnectionError("DNS resolution failed")

            with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio):
                task = asyncio.create_task(server.run({"command": "fake"}))
                await server._ready.wait()

                # Should have an error after exhausting retries
                assert server._error is not None
                assert "DNS resolution failed" in str(server._error)
                # 1 initial + N retries = _MAX_INITIAL_CONNECT_RETRIES + 1 total attempts
                assert call_count == _MAX_INITIAL_CONNECT_RETRIES + 1

                await task

        asyncio.run(_run())

    def test_initial_connect_retry_respects_shutdown(self):
        """Shutdown during initial retry backoff aborts cleanly."""
        from tools.mcp_tool import MCPServerTask

        async def _run():
            server = MCPServerTask("test-shutdown")
            attempt = 0

            async def fake_run_stdio(self_inner, config):
                nonlocal attempt
                attempt += 1
                if attempt == 1:
                    raise ConnectionError("transient failure")
                # Should not reach here because shutdown fires during sleep
                raise AssertionError("Should not attempt after shutdown")

            with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio):
                task = asyncio.create_task(server.run({"command": "fake"}))

                # Give the first attempt time to fail, then set shutdown
                # during the backoff sleep
                await asyncio.sleep(0.1)
                server._shutdown_event.set()
                await server._ready.wait()

                # Should have the error set and be done
                assert server._error is not None
                await task

        asyncio.run(_run())
|