diff --git a/tests/tools/test_mcp_stability.py b/tests/tools/test_mcp_stability.py index 1dd76959854..32e539c7ad2 100644 --- a/tests/tools/test_mcp_stability.py +++ b/tests/tools/test_mcp_stability.py @@ -171,6 +171,221 @@ class TestStdioPidTracking: assert fake_pid not in _orphan_stdio_pids +# --------------------------------------------------------------------------- +# Fix 2b: stdio descendant reaping via process group (issue #23799) +# --------------------------------------------------------------------------- +# +# When a stdio MCP wrapper (e.g. ``openclaw mcp serve``) itself spawns a +# helper subprocess (``claude mcp serve``) and then exits, the helper +# reparents to systemd-user and is invisible to the per-pid orphan reaper. +# The fix captures the wrapper's pgid at spawn time and reaps via killpg, +# which reaches same-group descendants whether or not the direct pid is alive. + +class TestStdioPgroupReaping: + """_kill_orphaned_mcp_children reaps via killpg when a pgid is tracked.""" + + def _reset_state(self): + from tools.mcp_tool import _stdio_pids, _orphan_stdio_pids, _stdio_pgids, _lock + with _lock: + _stdio_pids.clear() + _orphan_stdio_pids.clear() + _stdio_pgids.clear() + + def test_killpg_used_when_pgid_tracked(self, monkeypatch): + """SIGTERM and SIGKILL route through killpg when pgid is known.""" + from tools.mcp_tool import ( + _kill_orphaned_mcp_children, + _orphan_stdio_pids, + _stdio_pgids, + _lock, + ) + + self._reset_state() + fake_pid = 525252 + fake_pgid = 525252 # session leader: pgid == pid + with _lock: + _orphan_stdio_pids.add(fake_pid) + _stdio_pgids[fake_pid] = fake_pgid + + fake_sigkill = 9 + monkeypatch.setattr(signal, "SIGKILL", fake_sigkill, raising=False) + + # Ensure os.killpg exists on this platform for the test to make sense; + # the production fallback path is covered by the per-pid tests above. + if not hasattr(os, "killpg"): + pytest.skip("os.killpg not available on this platform") + + with patch("tools.mcp_tool.os.killpg") as mock_killpg, \ + patch("tools.mcp_tool.os.kill") as mock_kill, \ + patch("gateway.status._pid_exists", return_value=True), \ + patch("time.sleep"): + _kill_orphaned_mcp_children() + + # Both phases should have used killpg (pgroup reach), not per-pid kill. + mock_killpg.assert_any_call(fake_pgid, signal.SIGTERM) + mock_killpg.assert_any_call(fake_pgid, fake_sigkill) + assert mock_killpg.call_count == 2 + mock_kill.assert_not_called() + + with _lock: + assert fake_pid not in _orphan_stdio_pids + assert fake_pid not in _stdio_pgids + + def test_killpg_failure_falls_back_to_kill(self, monkeypatch): + """If killpg raises ProcessLookupError (pgroup gone), try os.kill.""" + from tools.mcp_tool import ( + _kill_orphaned_mcp_children, + _orphan_stdio_pids, + _stdio_pgids, + _lock, + ) + + self._reset_state() + fake_pid = 636363 + fake_pgid = 636363 + with _lock: + _orphan_stdio_pids.add(fake_pid) + _stdio_pgids[fake_pid] = fake_pgid + + if not hasattr(os, "killpg"): + pytest.skip("os.killpg not available on this platform") + + with patch( + "tools.mcp_tool.os.killpg", + side_effect=ProcessLookupError("no such process group"), + ) as mock_killpg, \ + patch("tools.mcp_tool.os.kill") as mock_kill, \ + patch("gateway.status._pid_exists", return_value=False), \ + patch("time.sleep"): + _kill_orphaned_mcp_children() + + # killpg was attempted (phase 1 SIGTERM) and fell back to os.kill. + # Phase 3 skips because _pid_exists returns False (direct pid gone). + mock_killpg.assert_called() + mock_kill.assert_any_call(fake_pid, signal.SIGTERM) + + with _lock: + assert fake_pid not in _orphan_stdio_pids + assert fake_pid not in _stdio_pgids + + def test_no_pgid_uses_per_pid_kill(self, monkeypatch): + """When no pgid is recorded (e.g. Windows), fall back to os.kill.""" + from tools.mcp_tool import ( + _kill_orphaned_mcp_children, + _orphan_stdio_pids, + _stdio_pgids, + _lock, + ) + + self._reset_state() + fake_pid = 747474 + with _lock: + _orphan_stdio_pids.add(fake_pid) + # No entry in _stdio_pgids. + + with patch("tools.mcp_tool.os.kill") as mock_kill, \ + patch("gateway.status._pid_exists", return_value=False), \ + patch("time.sleep"): + # killpg may or may not exist; either way the no-pgid path skips it. + _kill_orphaned_mcp_children() + + mock_kill.assert_any_call(fake_pid, signal.SIGTERM) + + with _lock: + assert fake_pid not in _orphan_stdio_pids + + @pytest.mark.live_system_guard_bypass + @pytest.mark.skipif( + not hasattr(os, "killpg") or not hasattr(os, "setsid"), + reason="POSIX-only: requires os.killpg and os.setsid", + ) + def test_grandchild_reaped_via_pgroup(self, tmp_path): + """End-to-end: parent spawns grandchild, parent exits, killpg reaps grandchild. + + Mirrors issue #23799: a stdio MCP wrapper (parent) launches a long-lived + helper subprocess (grandchild) in the same process group, then the + wrapper exits while the grandchild keeps running. killpg on the pgid + captured at spawn time must still deliver the signal to the grandchild. + + Marked ``live_system_guard_bypass`` because this test genuinely needs + real signal delivery to its own subprocess tree (the conftest guard + only knows the test's *initial* children; the spawned tree here is + outside that allowlist). + """ + import subprocess + import sys + import time as _time + + psutil = pytest.importorskip("psutil") + + # Grandchild: sleep forever, write its pid then wait. + grandchild_pid_file = tmp_path / "grandchild.pid" + grandchild_script = tmp_path / "grandchild.py" + grandchild_script.write_text( + "import os, sys, time\n" + f"open({str(grandchild_pid_file)!r}, 'w').write(str(os.getpid()))\n" + "while True:\n" + " time.sleep(0.5)\n" + ) + + # Parent: spawn grandchild, exit immediately (without killing it). + parent_script = tmp_path / "parent.py" + parent_script.write_text( + "import subprocess, sys\n" + f"subprocess.Popen([sys.executable, {str(grandchild_script)!r}])\n" + # Parent exits — grandchild reparents to init. + ) + + # Spawn parent in its own session (mirrors stdio_client behaviour). + parent = subprocess.Popen( + [sys.executable, str(parent_script)], + start_new_session=True, + ) + parent_pgid = os.getpgid(parent.pid) + # Wait for parent to exit and grandchild to spin up. + parent.wait(timeout=5) + deadline = _time.time() + 5 + while _time.time() < deadline and not grandchild_pid_file.exists(): + _time.sleep(0.05) + assert grandchild_pid_file.exists(), "grandchild did not start" + grandchild_pid = int(grandchild_pid_file.read_text().strip()) + + # Sanity: grandchild is alive and shares the parent's pgid. + assert psutil.pid_exists(grandchild_pid) + assert os.getpgid(grandchild_pid) == parent_pgid + + # Drive the reaper: register the parent pid + pgid as an orphan. + from tools.mcp_tool import ( + _kill_orphaned_mcp_children, + _orphan_stdio_pids, + _stdio_pgids, + _stdio_pids, + _lock, + ) + with _lock: + _stdio_pids.clear() + _orphan_stdio_pids.clear() + _stdio_pgids.clear() + _orphan_stdio_pids.add(parent.pid) + _stdio_pgids[parent.pid] = parent_pgid + try: + _kill_orphaned_mcp_children() + finally: + # Belt-and-suspenders: ensure grandchild is dead even if test fails. + try: + os.kill(grandchild_pid, signal.SIGKILL) + except ProcessLookupError: + pass + + # Grandchild should be gone — SIGTERM via killpg in phase 1 reached it. + deadline = _time.time() + 3 + while _time.time() < deadline and psutil.pid_exists(grandchild_pid): + _time.sleep(0.05) + assert not psutil.pid_exists(grandchild_pid), ( + "grandchild survived killpg-based reaping (issue #23799 regression)" + ) + + # --------------------------------------------------------------------------- # Fix 3: MCP reload timeout (cli.py) # --------------------------------------------------------------------------- diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 593994caa09..9794b5e8592 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -1395,9 +1395,22 @@ class MCPServerTask: # Capture the newly spawned subprocess PID for force-kill cleanup. new_pids = _snapshot_child_pids() - pids_before if new_pids: + # Capture pgid while the child is alive — once it exits we + # can no longer call ``os.getpgid`` on it, and the cleanup + # sweep needs the pgid to reach any reparented descendants + # (e.g. ``claude mcp serve`` spawned by a stdio wrapper). + new_pgids: Dict[int, int] = {} + for _pid in new_pids: + try: + new_pgids[_pid] = os.getpgid(_pid) + except (AttributeError, ProcessLookupError, OSError): + # AttributeError: Windows (os.getpgid is POSIX-only) + # ProcessLookupError: child raced and already exited + pass with _lock: for _pid in new_pids: _stdio_pids[_pid] = self.name + _stdio_pgids.update(new_pgids) async with ClientSession( read_stream, write_stream, **sampling_kwargs ) as session: @@ -1416,16 +1429,33 @@ class MCPServerTask: # on Linux, where setsid() children escape the parent cgroup). # Mark them as orphans so the next cleanup sweep can reap them. if new_pids: + from gateway.status import _pid_exists + _killpg = getattr(os, "killpg", None) with _lock: for _pid in new_pids: _stdio_pids.pop(_pid, None) for pid in new_pids: # ``os.kill(pid, 0)`` is NOT a no-op on Windows # (bpo-14484). Use the cross-platform check. - from gateway.status import _pid_exists - if not _pid_exists(pid): - continue # process already exited — nothing to do - _orphan_stdio_pids.add(pid) + pid_alive = _pid_exists(pid) + pgroup_alive = False + pgid = _stdio_pgids.get(pid) + if not pid_alive and pgid is not None and _killpg is not None: + # Direct child exited but descendants may still be + # in its pgroup (e.g. ``claude mcp serve`` spawned + # by an MCP wrapper that exited first). Probe with + # signal 0 — succeeds iff any pgroup member is alive. + try: + _killpg(pgid, 0) + pgroup_alive = True + except (ProcessLookupError, PermissionError, OSError): + pgroup_alive = False + if pid_alive or pgroup_alive: + _orphan_stdio_pids.add(pid) + else: + # Nothing left to reap — drop the pgid entry so + # PID-reuse can't surface stale pgroup state later. + _stdio_pgids.pop(pid, None) async def _run_http(self, config: dict): """Run the server using HTTP/StreamableHTTP transport.""" @@ -2224,6 +2254,19 @@ _stdio_pids: Dict[int, str] = {} # pid -> server_name # sessions (e.g. concurrent cron jobs or live user chats). _orphan_stdio_pids: set = set() +# Process-group IDs of stdio MCP subprocesses, captured at spawn time. +# The MCP SDK spawns stdio children with ``start_new_session=True`` so each +# direct child becomes its own session/pgroup leader (PGID == its own PID). +# Grandchildren spawned by that child (e.g. a wrapper MCP server that itself +# launches helper subprocesses like ``claude mcp serve``) inherit that PGID +# unless they call ``setsid`` themselves. When the direct child exits, those +# grandchildren reparent to init/systemd-user but keep the original PGID, so +# ``killpg(pgid, sig)`` still reaches them. Tracked separately from +# ``_stdio_pids`` so we retain the PGID even after the direct child has +# exited and been removed from the active map. Empty on Windows +# (``os.getpgid`` is POSIX-only). +_stdio_pgids: Dict[int, int] = {} # pid -> pgid + def _snapshot_child_pids() -> set: """Return a set of current child process PIDs. @@ -3640,6 +3683,12 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None: survivors, avoiding shared-resource collisions when multiple hermes processes run on the same host (each has its own ``_stdio_pids`` dict). + On POSIX, signals are sent via ``os.killpg`` to the spawn-time pgid when + one is tracked, so reparented grandchildren in the same process group + (e.g. ``claude mcp serve`` spawned by a stdio MCP wrapper that exited + first) are reaped alongside the direct child. Falls back to ``os.kill`` + on Windows and when no pgid is recorded. + With ``include_active=True`` also kills every PID in ``_stdio_pids`` — used only at final shutdown, after the MCP event loop has stopped and no sessions can still be in flight. @@ -3654,20 +3703,42 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None: if include_active: pids.update(dict(_stdio_pids)) _stdio_pids.clear() + # Snapshot pgids for the pids we're about to kill, then drop the + # entries so a future spawn can't collide with stale state. + pgids: Dict[int, int] = {pid: _stdio_pgids[pid] for pid in pids if pid in _stdio_pgids} + for pid in pgids: + _stdio_pgids.pop(pid, None) # Fast path: no tracked stdio PIDs to reap. Skip the SIGTERM/sleep/SIGKILL # dance entirely — otherwise every MCP-free shutdown pays a 2s sleep tax. if not pids: return - # Phase 1: SIGTERM (graceful) - for pid, server_name in pids.items(): + def _send_signal(pid: int, sig: int, server_name: str) -> None: + """SIGTERM/SIGKILL via pgroup on POSIX, fall back to pid signal.""" + pgid = pgids.get(pid) + killpg = getattr(os, "killpg", None) + if pgid is not None and killpg is not None: + try: + killpg(pgid, sig) + return + except (ProcessLookupError, PermissionError, OSError) as exc: + # Pgroup gone (all members exited) or refused — fall back to + # the per-pid path so we still try the direct child if alive. + logger.debug( + "killpg(%d, %d) failed for MCP server '%s': %s; falling back to kill(pid)", + pgid, sig, server_name, exc, + ) try: - os.kill(pid, _signal.SIGTERM) - logger.debug("Sent SIGTERM to orphaned MCP process %d (%s)", pid, server_name) + os.kill(pid, sig) except (ProcessLookupError, PermissionError, OSError): pass + # Phase 1: SIGTERM (graceful) + for pid, server_name in pids.items(): + _send_signal(pid, _signal.SIGTERM, server_name) + logger.debug("Sent SIGTERM to orphaned MCP process %d (%s)", pid, server_name) + # Phase 2: Wait for graceful exit time.sleep(2) @@ -3679,14 +3750,11 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None: for pid, server_name in pids.items(): if not _pid_exists(pid): continue # Good — exited after SIGTERM - try: - os.kill(pid, _sigkill) - logger.warning( - "Force-killed MCP process %d (%s) after SIGTERM timeout", - pid, server_name, - ) - except (ProcessLookupError, PermissionError, OSError): - pass + _send_signal(pid, _sigkill, server_name) + logger.warning( + "Force-killed MCP process %d (%s) after SIGTERM timeout", + pid, server_name, + ) def _stop_mcp_loop():