mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-11 08:42:11 +00:00
fix(mcp): reap stdio MCP grandchildren via process-group signal
The orphan reaper for stdio MCP subprocesses only tracked the direct child PID spawned by ``stdio_client`` (e.g. ``openclaw mcp serve``). When that wrapper itself spawned a helper (``claude mcp serve``) and then exited, the helper reparented to ``systemd --user`` and survived shutdown. The MCP SDK already spawns stdio children with ``start_new_session=True``, so the wrapper is its own pgroup leader and same-pgroup descendants are reachable via ``killpg``. Capture the pgid at spawn time and reap via ``killpg(pgid, sig)`` so reparented grandchildren are reaped alongside the direct child, even after the wrapper itself exits. Falls back to per-pid ``os.kill`` on Windows or when no pgid was recorded. Fixes part 2 (orphan ``claude mcp serve``) of #23799. Part 1 (per-invocation respawn) was confirmed by the reporter to be an environmental artifact, not a code bug.
This commit is contained in:
parent
4d7ea3fd36
commit
a29d64e50c
2 changed files with 299 additions and 16 deletions
|
|
@ -171,6 +171,221 @@ class TestStdioPidTracking:
|
|||
assert fake_pid not in _orphan_stdio_pids
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fix 2b: stdio descendant reaping via process group (issue #23799)
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# When a stdio MCP wrapper (e.g. ``openclaw mcp serve``) itself spawns a
|
||||
# helper subprocess (``claude mcp serve``) and then exits, the helper
|
||||
# reparents to systemd-user and is invisible to the per-pid orphan reaper.
|
||||
# The fix captures the wrapper's pgid at spawn time and reaps via killpg,
|
||||
# which reaches same-group descendants whether or not the direct pid is alive.
|
||||
|
||||
class TestStdioPgroupReaping:
|
||||
"""_kill_orphaned_mcp_children reaps via killpg when a pgid is tracked."""
|
||||
|
||||
def _reset_state(self):
|
||||
from tools.mcp_tool import _stdio_pids, _orphan_stdio_pids, _stdio_pgids, _lock
|
||||
with _lock:
|
||||
_stdio_pids.clear()
|
||||
_orphan_stdio_pids.clear()
|
||||
_stdio_pgids.clear()
|
||||
|
||||
def test_killpg_used_when_pgid_tracked(self, monkeypatch):
|
||||
"""SIGTERM and SIGKILL route through killpg when pgid is known."""
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pgids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
self._reset_state()
|
||||
fake_pid = 525252
|
||||
fake_pgid = 525252 # session leader: pgid == pid
|
||||
with _lock:
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
_stdio_pgids[fake_pid] = fake_pgid
|
||||
|
||||
fake_sigkill = 9
|
||||
monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False)
|
||||
|
||||
# Ensure os.killpg exists on this platform for the test to make sense;
|
||||
# the production fallback path is covered by the per-pid tests above.
|
||||
if not hasattr(os, "killpg"):
|
||||
pytest.skip("os.killpg not available on this platform")
|
||||
|
||||
with patch("tools.mcp_tool.os.killpg") as mock_killpg, \
|
||||
patch("tools.mcp_tool.os.kill") as mock_kill, \
|
||||
patch("gateway.status._pid_exists", return_value=True), \
|
||||
patch("time.sleep"):
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
# Both phases should have used killpg (pgroup reach), not per-pid kill.
|
||||
mock_killpg.assert_any_call(fake_pgid, signal.SIGTERM)
|
||||
mock_killpg.assert_any_call(fake_pgid, fake_sigkill)
|
||||
assert mock_killpg.call_count == 2
|
||||
mock_kill.assert_not_called()
|
||||
|
||||
with _lock:
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
assert fake_pid not in _stdio_pgids
|
||||
|
||||
def test_killpg_failure_falls_back_to_kill(self, monkeypatch):
|
||||
"""If killpg raises ProcessLookupError (pgroup gone), try os.kill."""
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pgids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
self._reset_state()
|
||||
fake_pid = 636363
|
||||
fake_pgid = 636363
|
||||
with _lock:
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
_stdio_pgids[fake_pid] = fake_pgid
|
||||
|
||||
if not hasattr(os, "killpg"):
|
||||
pytest.skip("os.killpg not available on this platform")
|
||||
|
||||
with patch(
|
||||
"tools.mcp_tool.os.killpg",
|
||||
side_effect=ProcessLookupError("no such process group"),
|
||||
) as mock_killpg, \
|
||||
patch("tools.mcp_tool.os.kill") as mock_kill, \
|
||||
patch("gateway.status._pid_exists", return_value=False), \
|
||||
patch("time.sleep"):
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
# killpg was attempted (phase 1 SIGTERM) and fell back to os.kill.
|
||||
# Phase 3 skips because _pid_exists returns False (direct pid gone).
|
||||
mock_killpg.assert_called()
|
||||
mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
|
||||
|
||||
with _lock:
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
assert fake_pid not in _stdio_pgids
|
||||
|
||||
def test_no_pgid_uses_per_pid_kill(self, monkeypatch):
|
||||
"""When no pgid is recorded (e.g. Windows), fall back to os.kill."""
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pgids,
|
||||
_lock,
|
||||
)
|
||||
|
||||
self._reset_state()
|
||||
fake_pid = 747474
|
||||
with _lock:
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
# No entry in _stdio_pgids.
|
||||
|
||||
with patch("tools.mcp_tool.os.kill") as mock_kill, \
|
||||
patch("gateway.status._pid_exists", return_value=False), \
|
||||
patch("time.sleep"):
|
||||
# killpg may or may not exist; either way the no-pgid path skips it.
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
|
||||
|
||||
with _lock:
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
|
||||
@pytest.mark.live_system_guard_bypass
|
||||
@pytest.mark.skipif(
|
||||
not hasattr(os, "killpg") or not hasattr(os, "setsid"),
|
||||
reason="POSIX-only: requires os.killpg and os.setsid",
|
||||
)
|
||||
def test_grandchild_reaped_via_pgroup(self, tmp_path):
|
||||
"""End-to-end: parent spawns grandchild, parent exits, killpg reaps grandchild.
|
||||
|
||||
Mirrors issue #23799: a stdio MCP wrapper (parent) launches a long-lived
|
||||
helper subprocess (grandchild) in the same process group, then the
|
||||
wrapper exits while the grandchild keeps running. killpg on the pgid
|
||||
captured at spawn time must still deliver the signal to the grandchild.
|
||||
|
||||
Marked ``live_system_guard_bypass`` because this test genuinely needs
|
||||
real signal delivery to its own subprocess tree (the conftest guard
|
||||
only knows the test's *initial* children; the spawned tree here is
|
||||
outside that allowlist).
|
||||
"""
|
||||
import subprocess
|
||||
import sys
|
||||
import time as _time
|
||||
|
||||
psutil = pytest.importorskip("psutil")
|
||||
|
||||
# Grandchild: sleep forever, write its pid then wait.
|
||||
grandchild_pid_file = tmp_path / "grandchild.pid"
|
||||
grandchild_script = tmp_path / "grandchild.py"
|
||||
grandchild_script.write_text(
|
||||
"import os, sys, time\n"
|
||||
f"open({str(grandchild_pid_file)!r}, 'w').write(str(os.getpid()))\n"
|
||||
"while True:\n"
|
||||
" time.sleep(0.5)\n"
|
||||
)
|
||||
|
||||
# Parent: spawn grandchild, exit immediately (without killing it).
|
||||
parent_script = tmp_path / "parent.py"
|
||||
parent_script.write_text(
|
||||
"import subprocess, sys\n"
|
||||
f"subprocess.Popen([sys.executable, {str(grandchild_script)!r}])\n"
|
||||
# Parent exits — grandchild reparents to init.
|
||||
)
|
||||
|
||||
# Spawn parent in its own session (mirrors stdio_client behaviour).
|
||||
parent = subprocess.Popen(
|
||||
[sys.executable, str(parent_script)],
|
||||
start_new_session=True,
|
||||
)
|
||||
parent_pgid = os.getpgid(parent.pid)
|
||||
# Wait for parent to exit and grandchild to spin up.
|
||||
parent.wait(timeout=5)
|
||||
deadline = _time.time() + 5
|
||||
while _time.time() < deadline and not grandchild_pid_file.exists():
|
||||
_time.sleep(0.05)
|
||||
assert grandchild_pid_file.exists(), "grandchild did not start"
|
||||
grandchild_pid = int(grandchild_pid_file.read_text().strip())
|
||||
|
||||
# Sanity: grandchild is alive and shares the parent's pgid.
|
||||
assert psutil.pid_exists(grandchild_pid)
|
||||
assert os.getpgid(grandchild_pid) == parent_pgid
|
||||
|
||||
# Drive the reaper: register the parent pid + pgid as an orphan.
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pgids,
|
||||
_stdio_pids,
|
||||
_lock,
|
||||
)
|
||||
with _lock:
|
||||
_stdio_pids.clear()
|
||||
_orphan_stdio_pids.clear()
|
||||
_stdio_pgids.clear()
|
||||
_orphan_stdio_pids.add(parent.pid)
|
||||
_stdio_pgids[parent.pid] = parent_pgid
|
||||
try:
|
||||
_kill_orphaned_mcp_children()
|
||||
finally:
|
||||
# Belt-and-suspenders: ensure grandchild is dead even if test fails.
|
||||
try:
|
||||
os.kill(grandchild_pid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
|
||||
# Grandchild should be gone — SIGTERM via killpg in phase 1 reached it.
|
||||
deadline = _time.time() + 3
|
||||
while _time.time() < deadline and psutil.pid_exists(grandchild_pid):
|
||||
_time.sleep(0.05)
|
||||
assert not psutil.pid_exists(grandchild_pid), (
|
||||
"grandchild survived killpg-based reaping (issue #23799 regression)"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fix 3: MCP reload timeout (cli.py)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -1395,9 +1395,22 @@ class MCPServerTask:
|
|||
# Capture the newly spawned subprocess PID for force-kill cleanup.
|
||||
new_pids = _snapshot_child_pids() - pids_before
|
||||
if new_pids:
|
||||
# Capture pgid while the child is alive — once it exits we
|
||||
# can no longer call ``os.getpgid`` on it, and the cleanup
|
||||
# sweep needs the pgid to reach any reparented descendants
|
||||
# (e.g. ``claude mcp serve`` spawned by a stdio wrapper).
|
||||
new_pgids: Dict[int, int] = {}
|
||||
for _pid in new_pids:
|
||||
try:
|
||||
new_pgids[_pid] = os.getpgid(_pid)
|
||||
except (AttributeError, ProcessLookupError, OSError):
|
||||
# AttributeError: Windows (os.getpgid is POSIX-only)
|
||||
# ProcessLookupError: child raced and already exited
|
||||
pass
|
||||
with _lock:
|
||||
for _pid in new_pids:
|
||||
_stdio_pids[_pid] = self.name
|
||||
_stdio_pgids.update(new_pgids)
|
||||
async with ClientSession(
|
||||
read_stream, write_stream, **sampling_kwargs
|
||||
) as session:
|
||||
|
|
@ -1416,16 +1429,33 @@ class MCPServerTask:
|
|||
# on Linux, where setsid() children escape the parent cgroup).
|
||||
# Mark them as orphans so the next cleanup sweep can reap them.
|
||||
if new_pids:
|
||||
from gateway.status import _pid_exists
|
||||
_killpg = getattr(os, "killpg", None)
|
||||
with _lock:
|
||||
for _pid in new_pids:
|
||||
_stdio_pids.pop(_pid, None)
|
||||
for pid in new_pids:
|
||||
# ``os.kill(pid, 0)`` is NOT a no-op on Windows
|
||||
# (bpo-14484). Use the cross-platform check.
|
||||
from gateway.status import _pid_exists
|
||||
if not _pid_exists(pid):
|
||||
continue # process already exited — nothing to do
|
||||
_orphan_stdio_pids.add(pid)
|
||||
pid_alive = _pid_exists(pid)
|
||||
pgroup_alive = False
|
||||
pgid = _stdio_pgids.get(pid)
|
||||
if not pid_alive and pgid is not None and _killpg is not None:
|
||||
# Direct child exited but descendants may still be
|
||||
# in its pgroup (e.g. ``claude mcp serve`` spawned
|
||||
# by an MCP wrapper that exited first). Probe with
|
||||
# signal 0 — succeeds iff any pgroup member is alive.
|
||||
try:
|
||||
_killpg(pgid, 0)
|
||||
pgroup_alive = True
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pgroup_alive = False
|
||||
if pid_alive or pgroup_alive:
|
||||
_orphan_stdio_pids.add(pid)
|
||||
else:
|
||||
# Nothing left to reap — drop the pgid entry so
|
||||
# PID-reuse can't surface stale pgroup state later.
|
||||
_stdio_pgids.pop(pid, None)
|
||||
|
||||
async def _run_http(self, config: dict):
|
||||
"""Run the server using HTTP/StreamableHTTP transport."""
|
||||
|
|
@ -2224,6 +2254,19 @@ _stdio_pids: Dict[int, str] = {} # pid -> server_name
|
|||
# sessions (e.g. concurrent cron jobs or live user chats).
|
||||
_orphan_stdio_pids: set = set()
|
||||
|
||||
# Process-group IDs of stdio MCP subprocesses, captured at spawn time.
|
||||
# The MCP SDK spawns stdio children with ``start_new_session=True`` so each
|
||||
# direct child becomes its own session/pgroup leader (PGID == its own PID).
|
||||
# Grandchildren spawned by that child (e.g. a wrapper MCP server that itself
|
||||
# launches helper subprocesses like ``claude mcp serve``) inherit that PGID
|
||||
# unless they call ``setsid`` themselves. When the direct child exits, those
|
||||
# grandchildren reparent to init/systemd-user but keep the original PGID, so
|
||||
# ``killpg(pgid, sig)`` still reaches them. Tracked separately from
|
||||
# ``_stdio_pids`` so we retain the PGID even after the direct child has
|
||||
# exited and been removed from the active map. Empty on Windows
|
||||
# (``os.getpgid`` is POSIX-only).
|
||||
_stdio_pgids: Dict[int, int] = {} # pid -> pgid
|
||||
|
||||
|
||||
def _snapshot_child_pids() -> set:
|
||||
"""Return a set of current child process PIDs.
|
||||
|
|
@ -3640,6 +3683,12 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
|||
survivors, avoiding shared-resource collisions when multiple hermes
|
||||
processes run on the same host (each has its own ``_stdio_pids`` dict).
|
||||
|
||||
On POSIX, signals are sent via ``os.killpg`` to the spawn-time pgid when
|
||||
one is tracked, so reparented grandchildren in the same process group
|
||||
(e.g. ``claude mcp serve`` spawned by a stdio MCP wrapper that exited
|
||||
first) are reaped alongside the direct child. Falls back to ``os.kill``
|
||||
on Windows and when no pgid is recorded.
|
||||
|
||||
With ``include_active=True`` also kills every PID in ``_stdio_pids`` —
|
||||
used only at final shutdown, after the MCP event loop has stopped and no
|
||||
sessions can still be in flight.
|
||||
|
|
@ -3654,20 +3703,42 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
|||
if include_active:
|
||||
pids.update(dict(_stdio_pids))
|
||||
_stdio_pids.clear()
|
||||
# Snapshot pgids for the pids we're about to kill, then drop the
|
||||
# entries so a future spawn can't collide with stale state.
|
||||
pgids: Dict[int, int] = {pid: _stdio_pgids[pid] for pid in pids if pid in _stdio_pgids}
|
||||
for pid in pgids:
|
||||
_stdio_pgids.pop(pid, None)
|
||||
|
||||
# Fast path: no tracked stdio PIDs to reap. Skip the SIGTERM/sleep/SIGKILL
|
||||
# dance entirely — otherwise every MCP-free shutdown pays a 2s sleep tax.
|
||||
if not pids:
|
||||
return
|
||||
|
||||
# Phase 1: SIGTERM (graceful)
|
||||
for pid, server_name in pids.items():
|
||||
def _send_signal(pid: int, sig: int, server_name: str) -> None:
|
||||
"""SIGTERM/SIGKILL via pgroup on POSIX, fall back to pid signal."""
|
||||
pgid = pgids.get(pid)
|
||||
killpg = getattr(os, "killpg", None)
|
||||
if pgid is not None and killpg is not None:
|
||||
try:
|
||||
killpg(pgid, sig)
|
||||
return
|
||||
except (ProcessLookupError, PermissionError, OSError) as exc:
|
||||
# Pgroup gone (all members exited) or refused — fall back to
|
||||
# the per-pid path so we still try the direct child if alive.
|
||||
logger.debug(
|
||||
"killpg(%d, %d) failed for MCP server '%s': %s; falling back to kill(pid)",
|
||||
pgid, sig, server_name, exc,
|
||||
)
|
||||
try:
|
||||
os.kill(pid, _signal.SIGTERM)
|
||||
logger.debug("Sent SIGTERM to orphaned MCP process %d (%s)", pid, server_name)
|
||||
os.kill(pid, sig)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
|
||||
# Phase 1: SIGTERM (graceful)
|
||||
for pid, server_name in pids.items():
|
||||
_send_signal(pid, _signal.SIGTERM, server_name)
|
||||
logger.debug("Sent SIGTERM to orphaned MCP process %d (%s)", pid, server_name)
|
||||
|
||||
# Phase 2: Wait for graceful exit
|
||||
time.sleep(2)
|
||||
|
||||
|
|
@ -3679,14 +3750,11 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
|||
for pid, server_name in pids.items():
|
||||
if not _pid_exists(pid):
|
||||
continue # Good — exited after SIGTERM
|
||||
try:
|
||||
os.kill(pid, _sigkill)
|
||||
logger.warning(
|
||||
"Force-killed MCP process %d (%s) after SIGTERM timeout",
|
||||
pid, server_name,
|
||||
)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
_send_signal(pid, _sigkill, server_name)
|
||||
logger.warning(
|
||||
"Force-killed MCP process %d (%s) after SIGTERM timeout",
|
||||
pid, server_name,
|
||||
)
|
||||
|
||||
|
||||
def _stop_mcp_loop():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue