diff --git a/tests/tools/test_mcp_stability.py b/tests/tools/test_mcp_stability.py index 2c3734274d8..0b9ba92050f 100644 --- a/tests/tools/test_mcp_stability.py +++ b/tests/tools/test_mcp_stability.py @@ -170,6 +170,71 @@ class TestStdioPidTracking: mock_kill.assert_any_call(fake_pid, signal.SIGTERM) assert mock_sleep.called + def test_kill_orphaned_skips_recycled_pid(self): + """PID-reuse guard: a recycled PID (start-time changed) is NOT signalled. + + Regression test: once an MCP child exits and is reaped, the kernel can + recycle its PID/PGID onto an unrelated process group. The sweep must + not signal the stale number, or it kills a stranger (observed: a desktop + browser whose session leader reused a dead MCP child's PID). + """ + from tools.mcp_tool import ( + _kill_orphaned_mcp_children, + _orphan_stdio_pids, + _stdio_pgids, + _stdio_starttimes, + _lock, + ) + + fake_pid = 454545 + with _lock: + _orphan_stdio_pids.clear() + _orphan_stdio_pids.add(fake_pid) + _stdio_pgids[fake_pid] = fake_pid + _stdio_starttimes[fake_pid] = 111111 # recorded at spawn + + # Current start time differs -> PID was recycled -> guard must skip. + with patch("gateway.status.get_process_start_time", return_value=222222), \ + patch("tools.mcp_tool.os.killpg") as mock_killpg, \ + patch("tools.mcp_tool.os.kill") as mock_kill, \ + patch("gateway.status._pid_exists", return_value=True), \ + patch("tools.mcp_tool.time.sleep"): + _kill_orphaned_mcp_children() + + mock_killpg.assert_not_called() + mock_kill.assert_not_called() + with _lock: + _stdio_pgids.pop(fake_pid, None) + _stdio_starttimes.pop(fake_pid, None) + + def test_kill_orphaned_signals_when_start_time_matches(self): + """The orphan IS signalled when its leader start-time still matches.""" + from tools.mcp_tool import ( + _kill_orphaned_mcp_children, + _orphan_stdio_pids, + _stdio_pgids, + _stdio_starttimes, + _lock, + ) + + fake_pid = 464646 + with _lock: + _orphan_stdio_pids.clear() + _orphan_stdio_pids.add(fake_pid) + _stdio_pgids[fake_pid] = fake_pid + _stdio_starttimes[fake_pid] = 333333 + + with patch("gateway.status.get_process_start_time", return_value=333333), \ + patch("tools.mcp_tool.os.killpg") as mock_killpg, \ + patch("gateway.status._pid_exists", return_value=False), \ + patch("tools.mcp_tool.time.sleep"): + _kill_orphaned_mcp_children() + + mock_killpg.assert_any_call(fake_pid, signal.SIGTERM) + with _lock: + _stdio_pgids.pop(fake_pid, None) + _stdio_starttimes.pop(fake_pid, None) + with _lock: assert fake_pid not in _orphan_stdio_pids diff --git a/tools/environments/local.py b/tools/environments/local.py index 4cc65d80af5..ba30ce49926 100644 --- a/tools/environments/local.py +++ b/tools/environments/local.py @@ -557,6 +557,10 @@ class LocalEnvironment(BaseEnvironment): if not _IS_WINDOWS: try: proc._hermes_pgid = os.getpgid(proc.pid) + # Record the group leader's start time so _kill_process can + # detect PID/PGID recycling before signalling the group. + from gateway.status import get_process_start_time + proc._hermes_pgid_start = get_process_start_time(proc.pid) except ProcessLookupError: pass @@ -601,12 +605,33 @@ class LocalEnvironment(BaseEnvironment): if _IS_WINDOWS: proc.terminate() else: + expected_start = getattr(proc, "_hermes_pgid_start", None) + + def _leader_is_ours(pgid) -> bool: + # The setsid group leader's PID == its PGID. Confirm it is + # still the process we spawned before signalling the whole + # group, so a recycled PID/PGID can never take down an + # unrelated process group. When no start-time baseline was + # captured (e.g. macOS, which has no /proc), fall back to the + # legacy best-effort behaviour rather than refusing to kill. + if pgid is None: + return False + if expected_start is None: + return True + from gateway.status import get_process_start_time + return get_process_start_time(pgid) == expected_start + try: pgid = os.getpgid(proc.pid) except ProcessLookupError: pgid = getattr(proc, "_hermes_pgid", None) - if pgid is None: - raise + + if not _leader_is_ours(pgid): + # Leader exited and its PID/PGID may have been recycled onto + # an unrelated process group; signalling the stale number is + # unsafe. Bail out — a rare orphaned grandchild may leak, + # which is strictly preferable to killing a stranger. + return try: os.killpg(pgid, signal.SIGTERM) # windows-footgun: ok — POSIX process-group SIGTERM (guarded by _IS_WINDOWS above) @@ -619,6 +644,11 @@ class LocalEnvironment(BaseEnvironment): if _wait_for_group_exit(pgid, 1.0): return + if not _leader_is_ours(pgid): + # Leader exited during the grace window; do not escalate to + # SIGKILL on a possibly-recycled PGID. + return + try: # POSIX-only: _IS_WINDOWS is handled by the outer branch. os.killpg(pgid, signal.SIGKILL) # windows-footgun: ok — POSIX process-group SIGKILL diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index ccfe69d55b0..db9e43acfe3 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -1415,6 +1415,8 @@ class MCPServerTask: # sweep needs the pgid to reach any reparented descendants # (e.g. ``claude mcp serve`` spawned by a stdio wrapper). new_pgids: Dict[int, int] = {} + new_starts: Dict[int, int] = {} + from gateway.status import get_process_start_time for _pid in new_pids: try: new_pgids[_pid] = os.getpgid(_pid) @@ -1422,10 +1424,17 @@ class MCPServerTask: # AttributeError: Windows (os.getpgid is POSIX-only) # ProcessLookupError: child raced and already exited pass + # Record the leader's start time so a later sweep can + # detect PID/PGID recycling before signalling (see + # _stdio_starttimes). + _start = get_process_start_time(_pid) + if _start is not None: + new_starts[_pid] = _start with _lock: for _pid in new_pids: _stdio_pids[_pid] = self.name _stdio_pgids.update(new_pgids) + _stdio_starttimes.update(new_starts) async with ClientSession( read_stream, write_stream, **sampling_kwargs ) as session: @@ -2397,6 +2406,15 @@ _orphan_stdio_pids: set = set() # (``os.getpgid`` is POSIX-only). _stdio_pgids: Dict[int, int] = {} # pid -> pgid +# Spawn-time start ticks (/proc//stat field 22) of each stdio child's +# pgroup leader, captured alongside the PGID. PIDs/PGIDs are recycled by the +# kernel once the original process exits and is reaped, so a long-lived +# tracker holding a bare PGID is unsafe: by the time a sweep runs, that number +# may name an unrelated process group (observed in the wild: a desktop browser +# whose session leader happened to reuse a dead MCP child's PID). We re-check +# the leader's start time before signalling so a recycled PGID is never killed. +_stdio_starttimes: Dict[int, int] = {} # pid -> leader start ticks + def _snapshot_child_pids() -> set: """Return a set of current child process PIDs. @@ -3839,11 +3857,14 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None: if include_active: pids.update(dict(_stdio_pids)) _stdio_pids.clear() - # Snapshot pgids for the pids we're about to kill, then drop the - # entries so a future spawn can't collide with stale state. + # Snapshot pgids + spawn-time leader start ticks for the pids we're + # about to kill, then drop the entries so a future spawn can't collide + # with stale state. pgids: Dict[int, int] = {pid: _stdio_pgids[pid] for pid in pids if pid in _stdio_pgids} - for pid in pgids: + starts: Dict[int, int] = {pid: _stdio_starttimes[pid] for pid in pids if pid in _stdio_starttimes} + for pid in pids: _stdio_pgids.pop(pid, None) + _stdio_starttimes.pop(pid, None) # Fast path: no tracked stdio PIDs to reap. Skip the SIGTERM/sleep/SIGKILL # dance entirely — otherwise every MCP-free shutdown pays a 2s sleep tax. @@ -3852,6 +3873,23 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None: def _send_signal(pid: int, sig: int, server_name: str) -> None: """SIGTERM/SIGKILL via pgroup on POSIX, fall back to pid signal.""" + # PID-reuse guard: only signal if ``pid`` still names the process we + # spawned. Once an MCP child exits and is reaped the kernel may recycle + # its PID/PGID onto an unrelated process group; signalling the stale + # number would kill a stranger (e.g. a recycled PGID landing on a + # desktop browser's session leader). Skip when the leader's start time + # no longer matches. With no recorded start time we fall through to the + # legacy best-effort path (capture raced the child's exit). + expected_start = starts.get(pid) + if expected_start is not None: + from gateway.status import get_process_start_time + if get_process_start_time(pid) != expected_start: + logger.debug( + "Skip signalling MCP pid %d (%s): start-time mismatch — " + "PID was recycled; refusing to kill an unrelated process group.", + pid, server_name, + ) + return pgid = pgids.get(pid) killpg = getattr(os, "killpg", None) if pgid is not None and killpg is not None: