fix(mcp): reap stdio MCP grandchildren via process-group signal

The orphan reaper for stdio MCP subprocesses only tracked the direct child
PID spawned by ``stdio_client`` (e.g. ``openclaw mcp serve``). When that
wrapper itself spawned a helper (``claude mcp serve``) and then exited, the
helper reparented to ``systemd --user`` and survived shutdown.

The MCP SDK already spawns stdio children with ``start_new_session=True``,
so the wrapper is its own pgroup leader and same-pgroup descendants are
reachable via ``killpg``. Capture the pgid at spawn time and reap via
``killpg(pgid, sig)`` so reparented grandchildren are reaped alongside the
direct child, even after the wrapper itself exits. Falls back to per-pid
``os.kill`` on Windows or when no pgid was recorded.

Fixes part 2 (orphan ``claude mcp serve``) of #23799. Part 1 (per-invocation
respawn) was confirmed by the reporter to be an environmental artifact, not
a code bug.
This commit is contained in:
konsisumer 2026-05-11 22:31:53 +02:00 committed by Teknium
parent 4d7ea3fd36
commit a29d64e50c
2 changed files with 299 additions and 16 deletions

View file

@ -171,6 +171,221 @@ class TestStdioPidTracking:
assert fake_pid not in _orphan_stdio_pids
# ---------------------------------------------------------------------------
# Fix 2b: stdio descendant reaping via process group (issue #23799)
# ---------------------------------------------------------------------------
#
# When a stdio MCP wrapper (e.g. ``openclaw mcp serve``) itself spawns a
# helper subprocess (``claude mcp serve``) and then exits, the helper
# reparents to systemd-user and is invisible to the per-pid orphan reaper.
# The fix captures the wrapper's pgid at spawn time and reaps via killpg,
# which reaches same-group descendants whether or not the direct pid is alive.
class TestStdioPgroupReaping:
"""_kill_orphaned_mcp_children reaps via killpg when a pgid is tracked."""
def _reset_state(self):
from tools.mcp_tool import _stdio_pids, _orphan_stdio_pids, _stdio_pgids, _lock
with _lock:
_stdio_pids.clear()
_orphan_stdio_pids.clear()
_stdio_pgids.clear()
def test_killpg_used_when_pgid_tracked(self, monkeypatch):
"""SIGTERM and SIGKILL route through killpg when pgid is known."""
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_stdio_pgids,
_lock,
)
self._reset_state()
fake_pid = 525252
fake_pgid = 525252 # session leader: pgid == pid
with _lock:
_orphan_stdio_pids.add(fake_pid)
_stdio_pgids[fake_pid] = fake_pgid
fake_sigkill = 9
monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False)
# Ensure os.killpg exists on this platform for the test to make sense;
# the production fallback path is covered by the per-pid tests above.
if not hasattr(os, "killpg"):
pytest.skip("os.killpg not available on this platform")
with patch("tools.mcp_tool.os.killpg") as mock_killpg, \
patch("tools.mcp_tool.os.kill") as mock_kill, \
patch("gateway.status._pid_exists", return_value=True), \
patch("time.sleep"):
_kill_orphaned_mcp_children()
# Both phases should have used killpg (pgroup reach), not per-pid kill.
mock_killpg.assert_any_call(fake_pgid, signal.SIGTERM)
mock_killpg.assert_any_call(fake_pgid, fake_sigkill)
assert mock_killpg.call_count == 2
mock_kill.assert_not_called()
with _lock:
assert fake_pid not in _orphan_stdio_pids
assert fake_pid not in _stdio_pgids
def test_killpg_failure_falls_back_to_kill(self, monkeypatch):
"""If killpg raises ProcessLookupError (pgroup gone), try os.kill."""
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_stdio_pgids,
_lock,
)
self._reset_state()
fake_pid = 636363
fake_pgid = 636363
with _lock:
_orphan_stdio_pids.add(fake_pid)
_stdio_pgids[fake_pid] = fake_pgid
if not hasattr(os, "killpg"):
pytest.skip("os.killpg not available on this platform")
with patch(
"tools.mcp_tool.os.killpg",
side_effect=ProcessLookupError("no such process group"),
) as mock_killpg, \
patch("tools.mcp_tool.os.kill") as mock_kill, \
patch("gateway.status._pid_exists", return_value=False), \
patch("time.sleep"):
_kill_orphaned_mcp_children()
# killpg was attempted (phase 1 SIGTERM) and fell back to os.kill.
# Phase 3 skips because _pid_exists returns False (direct pid gone).
mock_killpg.assert_called()
mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
with _lock:
assert fake_pid not in _orphan_stdio_pids
assert fake_pid not in _stdio_pgids
def test_no_pgid_uses_per_pid_kill(self, monkeypatch):
"""When no pgid is recorded (e.g. Windows), fall back to os.kill."""
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_stdio_pgids,
_lock,
)
self._reset_state()
fake_pid = 747474
with _lock:
_orphan_stdio_pids.add(fake_pid)
# No entry in _stdio_pgids.
with patch("tools.mcp_tool.os.kill") as mock_kill, \
patch("gateway.status._pid_exists", return_value=False), \
patch("time.sleep"):
# killpg may or may not exist; either way the no-pgid path skips it.
_kill_orphaned_mcp_children()
mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
with _lock:
assert fake_pid not in _orphan_stdio_pids
@pytest.mark.live_system_guard_bypass
@pytest.mark.skipif(
not hasattr(os, "killpg") or not hasattr(os, "setsid"),
reason="POSIX-only: requires os.killpg and os.setsid",
)
def test_grandchild_reaped_via_pgroup(self, tmp_path):
"""End-to-end: parent spawns grandchild, parent exits, killpg reaps grandchild.
Mirrors issue #23799: a stdio MCP wrapper (parent) launches a long-lived
helper subprocess (grandchild) in the same process group, then the
wrapper exits while the grandchild keeps running. killpg on the pgid
captured at spawn time must still deliver the signal to the grandchild.
Marked ``live_system_guard_bypass`` because this test genuinely needs
real signal delivery to its own subprocess tree (the conftest guard
only knows the test's *initial* children; the spawned tree here is
outside that allowlist).
"""
import subprocess
import sys
import time as _time
psutil = pytest.importorskip("psutil")
# Grandchild: sleep forever, write its pid then wait.
grandchild_pid_file = tmp_path / "grandchild.pid"
grandchild_script = tmp_path / "grandchild.py"
grandchild_script.write_text(
"import os, sys, time\n"
f"open({str(grandchild_pid_file)!r}, 'w').write(str(os.getpid()))\n"
"while True:\n"
" time.sleep(0.5)\n"
)
# Parent: spawn grandchild, exit immediately (without killing it).
parent_script = tmp_path / "parent.py"
parent_script.write_text(
"import subprocess, sys\n"
f"subprocess.Popen([sys.executable, {str(grandchild_script)!r}])\n"
# Parent exits — grandchild reparents to init.
)
# Spawn parent in its own session (mirrors stdio_client behaviour).
parent = subprocess.Popen(
[sys.executable, str(parent_script)],
start_new_session=True,
)
parent_pgid = os.getpgid(parent.pid)
# Wait for parent to exit and grandchild to spin up.
parent.wait(timeout=5)
deadline = _time.time() + 5
while _time.time() < deadline and not grandchild_pid_file.exists():
_time.sleep(0.05)
assert grandchild_pid_file.exists(), "grandchild did not start"
grandchild_pid = int(grandchild_pid_file.read_text().strip())
# Sanity: grandchild is alive and shares the parent's pgid.
assert psutil.pid_exists(grandchild_pid)
assert os.getpgid(grandchild_pid) == parent_pgid
# Drive the reaper: register the parent pid + pgid as an orphan.
from tools.mcp_tool import (
_kill_orphaned_mcp_children,
_orphan_stdio_pids,
_stdio_pgids,
_stdio_pids,
_lock,
)
with _lock:
_stdio_pids.clear()
_orphan_stdio_pids.clear()
_stdio_pgids.clear()
_orphan_stdio_pids.add(parent.pid)
_stdio_pgids[parent.pid] = parent_pgid
try:
_kill_orphaned_mcp_children()
finally:
# Belt-and-suspenders: ensure grandchild is dead even if test fails.
try:
os.kill(grandchild_pid, signal.SIGKILL)
except ProcessLookupError:
pass
# Grandchild should be gone — SIGTERM via killpg in phase 1 reached it.
deadline = _time.time() + 3
while _time.time() < deadline and psutil.pid_exists(grandchild_pid):
_time.sleep(0.05)
assert not psutil.pid_exists(grandchild_pid), (
"grandchild survived killpg-based reaping (issue #23799 regression)"
)
# ---------------------------------------------------------------------------
# Fix 3: MCP reload timeout (cli.py)
# ---------------------------------------------------------------------------

View file

@ -1395,9 +1395,22 @@ class MCPServerTask:
# Capture the newly spawned subprocess PID for force-kill cleanup.
new_pids = _snapshot_child_pids() - pids_before
if new_pids:
# Capture pgid while the child is alive — once it exits we
# can no longer call ``os.getpgid`` on it, and the cleanup
# sweep needs the pgid to reach any reparented descendants
# (e.g. ``claude mcp serve`` spawned by a stdio wrapper).
new_pgids: Dict[int, int] = {}
for _pid in new_pids:
try:
new_pgids[_pid] = os.getpgid(_pid)
except (AttributeError, ProcessLookupError, OSError):
# AttributeError: Windows (os.getpgid is POSIX-only)
# ProcessLookupError: child raced and already exited
pass
with _lock:
for _pid in new_pids:
_stdio_pids[_pid] = self.name
_stdio_pgids.update(new_pgids)
async with ClientSession(
read_stream, write_stream, **sampling_kwargs
) as session:
@ -1416,16 +1429,33 @@ class MCPServerTask:
# on Linux, where setsid() children escape the parent cgroup).
# Mark them as orphans so the next cleanup sweep can reap them.
if new_pids:
from gateway.status import _pid_exists
_killpg = getattr(os, "killpg", None)
with _lock:
for _pid in new_pids:
_stdio_pids.pop(_pid, None)
for pid in new_pids:
# ``os.kill(pid, 0)`` is NOT a no-op on Windows
# (bpo-14484). Use the cross-platform check.
from gateway.status import _pid_exists
if not _pid_exists(pid):
continue # process already exited — nothing to do
_orphan_stdio_pids.add(pid)
pid_alive = _pid_exists(pid)
pgroup_alive = False
pgid = _stdio_pgids.get(pid)
if not pid_alive and pgid is not None and _killpg is not None:
# Direct child exited but descendants may still be
# in its pgroup (e.g. ``claude mcp serve`` spawned
# by an MCP wrapper that exited first). Probe with
# signal 0 — succeeds iff any pgroup member is alive.
try:
_killpg(pgid, 0)
pgroup_alive = True
except (ProcessLookupError, PermissionError, OSError):
pgroup_alive = False
if pid_alive or pgroup_alive:
_orphan_stdio_pids.add(pid)
else:
# Nothing left to reap — drop the pgid entry so
# PID-reuse can't surface stale pgroup state later.
_stdio_pgids.pop(pid, None)
async def _run_http(self, config: dict):
"""Run the server using HTTP/StreamableHTTP transport."""
@ -2224,6 +2254,19 @@ _stdio_pids: Dict[int, str] = {} # pid -> server_name
# sessions (e.g. concurrent cron jobs or live user chats).
_orphan_stdio_pids: set = set()
# Process-group IDs of stdio MCP subprocesses, captured at spawn time.
# The MCP SDK spawns stdio children with ``start_new_session=True`` so each
# direct child becomes its own session/pgroup leader (PGID == its own PID).
# Grandchildren spawned by that child (e.g. a wrapper MCP server that itself
# launches helper subprocesses like ``claude mcp serve``) inherit that PGID
# unless they call ``setsid`` themselves. When the direct child exits, those
# grandchildren reparent to init/systemd-user but keep the original PGID, so
# ``killpg(pgid, sig)`` still reaches them. Tracked separately from
# ``_stdio_pids`` so we retain the PGID even after the direct child has
# exited and been removed from the active map. Empty on Windows
# (``os.getpgid`` is POSIX-only).
_stdio_pgids: Dict[int, int] = {} # pid -> pgid
def _snapshot_child_pids() -> set:
"""Return a set of current child process PIDs.
@ -3640,6 +3683,12 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
survivors, avoiding shared-resource collisions when multiple hermes
processes run on the same host (each has its own ``_stdio_pids`` dict).
On POSIX, signals are sent via ``os.killpg`` to the spawn-time pgid when
one is tracked, so reparented grandchildren in the same process group
(e.g. ``claude mcp serve`` spawned by a stdio MCP wrapper that exited
first) are reaped alongside the direct child. Falls back to ``os.kill``
on Windows and when no pgid is recorded.
With ``include_active=True`` also kills every PID in ``_stdio_pids``
used only at final shutdown, after the MCP event loop has stopped and no
sessions can still be in flight.
@ -3654,20 +3703,42 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
if include_active:
pids.update(dict(_stdio_pids))
_stdio_pids.clear()
# Snapshot pgids for the pids we're about to kill, then drop the
# entries so a future spawn can't collide with stale state.
pgids: Dict[int, int] = {pid: _stdio_pgids[pid] for pid in pids if pid in _stdio_pgids}
for pid in pgids:
_stdio_pgids.pop(pid, None)
# Fast path: no tracked stdio PIDs to reap. Skip the SIGTERM/sleep/SIGKILL
# dance entirely — otherwise every MCP-free shutdown pays a 2s sleep tax.
if not pids:
return
# Phase 1: SIGTERM (graceful)
for pid, server_name in pids.items():
def _send_signal(pid: int, sig: int, server_name: str) -> None:
"""SIGTERM/SIGKILL via pgroup on POSIX, fall back to pid signal."""
pgid = pgids.get(pid)
killpg = getattr(os, "killpg", None)
if pgid is not None and killpg is not None:
try:
killpg(pgid, sig)
return
except (ProcessLookupError, PermissionError, OSError) as exc:
# Pgroup gone (all members exited) or refused — fall back to
# the per-pid path so we still try the direct child if alive.
logger.debug(
"killpg(%d, %d) failed for MCP server '%s': %s; falling back to kill(pid)",
pgid, sig, server_name, exc,
)
try:
os.kill(pid, _signal.SIGTERM)
logger.debug("Sent SIGTERM to orphaned MCP process %d (%s)", pid, server_name)
os.kill(pid, sig)
except (ProcessLookupError, PermissionError, OSError):
pass
# Phase 1: SIGTERM (graceful)
for pid, server_name in pids.items():
_send_signal(pid, _signal.SIGTERM, server_name)
logger.debug("Sent SIGTERM to orphaned MCP process %d (%s)", pid, server_name)
# Phase 2: Wait for graceful exit
time.sleep(2)
@ -3679,14 +3750,11 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
for pid, server_name in pids.items():
if not _pid_exists(pid):
continue # Good — exited after SIGTERM
try:
os.kill(pid, _sigkill)
logger.warning(
"Force-killed MCP process %d (%s) after SIGTERM timeout",
pid, server_name,
)
except (ProcessLookupError, PermissionError, OSError):
pass
_send_signal(pid, _sigkill, server_name)
logger.warning(
"Force-killed MCP process %d (%s) after SIGTERM timeout",
pid, server_name,
)
def _stop_mcp_loop():