mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(mcp): re-validate PGID ownership before killing orphaned process groups
The stdio-MCP orphan reaper (`_kill_orphaned_mcp_children`) and the local environment's `_kill_process` both signalled a process *group* via `os.killpg(pgid, sig)` using a PGID captured at spawn time. Once the original child exits and is reaped, the kernel can recycle that PID/PGID onto an unrelated process group, so a later sweep would `killpg` a stranger. Observed in the wild: a long-running gateway reaped a recycled PGID that had landed on a desktop browser's session leader, SIGTERM-ing Firefox at irregular intervals (proven with the kernel `signal:signal_generate` tracepoint — `comm=firefox` killed by the hermes gateway via a recycled group id). Fix: record each leader's start time (`/proc/<pid>/stat` field 22, via the existing `gateway.status.get_process_start_time`) alongside the PGID, and re-check it before signalling. A PGID whose leader's start time no longer matches — or whose leader is gone — is never signalled. When no start time is available (e.g. macOS has no /proc) the guard degrades to the prior best-effort behaviour, so non-Linux platforms are unaffected. Adds regression tests covering both the recycled-PID (skip) and matching-PID (signal) paths. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
30412a9771
commit
7bd1f8a2d1
3 changed files with 138 additions and 5 deletions
|
|
@ -170,6 +170,71 @@ class TestStdioPidTracking:
|
|||
mock_kill.assert_any_call(fake_pid, signal.SIGTERM)
|
||||
assert mock_sleep.called
|
||||
|
||||
def test_kill_orphaned_skips_recycled_pid(self):
|
||||
"""PID-reuse guard: a recycled PID (start-time changed) is NOT signalled.
|
||||
|
||||
Regression test: once an MCP child exits and is reaped, the kernel can
|
||||
recycle its PID/PGID onto an unrelated process group. The sweep must
|
||||
not signal the stale number, or it kills a stranger (observed: a desktop
|
||||
browser whose session leader reused a dead MCP child's PID).
|
||||
"""
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pgids,
|
||||
_stdio_starttimes,
|
||||
_lock,
|
||||
)
|
||||
|
||||
fake_pid = 454545
|
||||
with _lock:
|
||||
_orphan_stdio_pids.clear()
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
_stdio_pgids[fake_pid] = fake_pid
|
||||
_stdio_starttimes[fake_pid] = 111111 # recorded at spawn
|
||||
|
||||
# Current start time differs -> PID was recycled -> guard must skip.
|
||||
with patch("gateway.status.get_process_start_time", return_value=222222), \
|
||||
patch("tools.mcp_tool.os.killpg") as mock_killpg, \
|
||||
patch("tools.mcp_tool.os.kill") as mock_kill, \
|
||||
patch("gateway.status._pid_exists", return_value=True), \
|
||||
patch("tools.mcp_tool.time.sleep"):
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
mock_killpg.assert_not_called()
|
||||
mock_kill.assert_not_called()
|
||||
with _lock:
|
||||
_stdio_pgids.pop(fake_pid, None)
|
||||
_stdio_starttimes.pop(fake_pid, None)
|
||||
|
||||
def test_kill_orphaned_signals_when_start_time_matches(self):
|
||||
"""The orphan IS signalled when its leader start-time still matches."""
|
||||
from tools.mcp_tool import (
|
||||
_kill_orphaned_mcp_children,
|
||||
_orphan_stdio_pids,
|
||||
_stdio_pgids,
|
||||
_stdio_starttimes,
|
||||
_lock,
|
||||
)
|
||||
|
||||
fake_pid = 464646
|
||||
with _lock:
|
||||
_orphan_stdio_pids.clear()
|
||||
_orphan_stdio_pids.add(fake_pid)
|
||||
_stdio_pgids[fake_pid] = fake_pid
|
||||
_stdio_starttimes[fake_pid] = 333333
|
||||
|
||||
with patch("gateway.status.get_process_start_time", return_value=333333), \
|
||||
patch("tools.mcp_tool.os.killpg") as mock_killpg, \
|
||||
patch("gateway.status._pid_exists", return_value=False), \
|
||||
patch("tools.mcp_tool.time.sleep"):
|
||||
_kill_orphaned_mcp_children()
|
||||
|
||||
mock_killpg.assert_any_call(fake_pid, signal.SIGTERM)
|
||||
with _lock:
|
||||
_stdio_pgids.pop(fake_pid, None)
|
||||
_stdio_starttimes.pop(fake_pid, None)
|
||||
|
||||
with _lock:
|
||||
assert fake_pid not in _orphan_stdio_pids
|
||||
|
||||
|
|
|
|||
|
|
@ -557,6 +557,10 @@ class LocalEnvironment(BaseEnvironment):
|
|||
if not _IS_WINDOWS:
|
||||
try:
|
||||
proc._hermes_pgid = os.getpgid(proc.pid)
|
||||
# Record the group leader's start time so _kill_process can
|
||||
# detect PID/PGID recycling before signalling the group.
|
||||
from gateway.status import get_process_start_time
|
||||
proc._hermes_pgid_start = get_process_start_time(proc.pid)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
|
||||
|
|
@ -601,12 +605,33 @@ class LocalEnvironment(BaseEnvironment):
|
|||
if _IS_WINDOWS:
|
||||
proc.terminate()
|
||||
else:
|
||||
expected_start = getattr(proc, "_hermes_pgid_start", None)
|
||||
|
||||
def _leader_is_ours(pgid) -> bool:
|
||||
# The setsid group leader's PID == its PGID. Confirm it is
|
||||
# still the process we spawned before signalling the whole
|
||||
# group, so a recycled PID/PGID can never take down an
|
||||
# unrelated process group. When no start-time baseline was
|
||||
# captured (e.g. macOS, which has no /proc), fall back to the
|
||||
# legacy best-effort behaviour rather than refusing to kill.
|
||||
if pgid is None:
|
||||
return False
|
||||
if expected_start is None:
|
||||
return True
|
||||
from gateway.status import get_process_start_time
|
||||
return get_process_start_time(pgid) == expected_start
|
||||
|
||||
try:
|
||||
pgid = os.getpgid(proc.pid)
|
||||
except ProcessLookupError:
|
||||
pgid = getattr(proc, "_hermes_pgid", None)
|
||||
if pgid is None:
|
||||
raise
|
||||
|
||||
if not _leader_is_ours(pgid):
|
||||
# Leader exited and its PID/PGID may have been recycled onto
|
||||
# an unrelated process group; signalling the stale number is
|
||||
# unsafe. Bail out — a rare orphaned grandchild may leak,
|
||||
# which is strictly preferable to killing a stranger.
|
||||
return
|
||||
|
||||
try:
|
||||
os.killpg(pgid, signal.SIGTERM) # windows-footgun: ok — POSIX process-group SIGTERM (guarded by _IS_WINDOWS above)
|
||||
|
|
@ -619,6 +644,11 @@ class LocalEnvironment(BaseEnvironment):
|
|||
if _wait_for_group_exit(pgid, 1.0):
|
||||
return
|
||||
|
||||
if not _leader_is_ours(pgid):
|
||||
# Leader exited during the grace window; do not escalate to
|
||||
# SIGKILL on a possibly-recycled PGID.
|
||||
return
|
||||
|
||||
try:
|
||||
# POSIX-only: _IS_WINDOWS is handled by the outer branch.
|
||||
os.killpg(pgid, signal.SIGKILL) # windows-footgun: ok — POSIX process-group SIGKILL
|
||||
|
|
|
|||
|
|
@ -1415,6 +1415,8 @@ class MCPServerTask:
|
|||
# sweep needs the pgid to reach any reparented descendants
|
||||
# (e.g. ``claude mcp serve`` spawned by a stdio wrapper).
|
||||
new_pgids: Dict[int, int] = {}
|
||||
new_starts: Dict[int, int] = {}
|
||||
from gateway.status import get_process_start_time
|
||||
for _pid in new_pids:
|
||||
try:
|
||||
new_pgids[_pid] = os.getpgid(_pid)
|
||||
|
|
@ -1422,10 +1424,17 @@ class MCPServerTask:
|
|||
# AttributeError: Windows (os.getpgid is POSIX-only)
|
||||
# ProcessLookupError: child raced and already exited
|
||||
pass
|
||||
# Record the leader's start time so a later sweep can
|
||||
# detect PID/PGID recycling before signalling (see
|
||||
# _stdio_starttimes).
|
||||
_start = get_process_start_time(_pid)
|
||||
if _start is not None:
|
||||
new_starts[_pid] = _start
|
||||
with _lock:
|
||||
for _pid in new_pids:
|
||||
_stdio_pids[_pid] = self.name
|
||||
_stdio_pgids.update(new_pgids)
|
||||
_stdio_starttimes.update(new_starts)
|
||||
async with ClientSession(
|
||||
read_stream, write_stream, **sampling_kwargs
|
||||
) as session:
|
||||
|
|
@ -2397,6 +2406,15 @@ _orphan_stdio_pids: set = set()
|
|||
# (``os.getpgid`` is POSIX-only).
|
||||
_stdio_pgids: Dict[int, int] = {} # pid -> pgid
|
||||
|
||||
# Spawn-time start ticks (/proc/<pid>/stat field 22) of each stdio child's
|
||||
# pgroup leader, captured alongside the PGID. PIDs/PGIDs are recycled by the
|
||||
# kernel once the original process exits and is reaped, so a long-lived
|
||||
# tracker holding a bare PGID is unsafe: by the time a sweep runs, that number
|
||||
# may name an unrelated process group (observed in the wild: a desktop browser
|
||||
# whose session leader happened to reuse a dead MCP child's PID). We re-check
|
||||
# the leader's start time before signalling so a recycled PGID is never killed.
|
||||
_stdio_starttimes: Dict[int, int] = {} # pid -> leader start ticks
|
||||
|
||||
|
||||
def _snapshot_child_pids() -> set:
|
||||
"""Return a set of current child process PIDs.
|
||||
|
|
@ -3839,11 +3857,14 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
|||
if include_active:
|
||||
pids.update(dict(_stdio_pids))
|
||||
_stdio_pids.clear()
|
||||
# Snapshot pgids for the pids we're about to kill, then drop the
|
||||
# entries so a future spawn can't collide with stale state.
|
||||
# Snapshot pgids + spawn-time leader start ticks for the pids we're
|
||||
# about to kill, then drop the entries so a future spawn can't collide
|
||||
# with stale state.
|
||||
pgids: Dict[int, int] = {pid: _stdio_pgids[pid] for pid in pids if pid in _stdio_pgids}
|
||||
for pid in pgids:
|
||||
starts: Dict[int, int] = {pid: _stdio_starttimes[pid] for pid in pids if pid in _stdio_starttimes}
|
||||
for pid in pids:
|
||||
_stdio_pgids.pop(pid, None)
|
||||
_stdio_starttimes.pop(pid, None)
|
||||
|
||||
# Fast path: no tracked stdio PIDs to reap. Skip the SIGTERM/sleep/SIGKILL
|
||||
# dance entirely — otherwise every MCP-free shutdown pays a 2s sleep tax.
|
||||
|
|
@ -3852,6 +3873,23 @@ def _kill_orphaned_mcp_children(include_active: bool = False) -> None:
|
|||
|
||||
def _send_signal(pid: int, sig: int, server_name: str) -> None:
|
||||
"""SIGTERM/SIGKILL via pgroup on POSIX, fall back to pid signal."""
|
||||
# PID-reuse guard: only signal if ``pid`` still names the process we
|
||||
# spawned. Once an MCP child exits and is reaped the kernel may recycle
|
||||
# its PID/PGID onto an unrelated process group; signalling the stale
|
||||
# number would kill a stranger (e.g. a recycled PGID landing on a
|
||||
# desktop browser's session leader). Skip when the leader's start time
|
||||
# no longer matches. With no recorded start time we fall through to the
|
||||
# legacy best-effort path (capture raced the child's exit).
|
||||
expected_start = starts.get(pid)
|
||||
if expected_start is not None:
|
||||
from gateway.status import get_process_start_time
|
||||
if get_process_start_time(pid) != expected_start:
|
||||
logger.debug(
|
||||
"Skip signalling MCP pid %d (%s): start-time mismatch — "
|
||||
"PID was recycled; refusing to kill an unrelated process group.",
|
||||
pid, server_name,
|
||||
)
|
||||
return
|
||||
pgid = pgids.get(pid)
|
||||
killpg = getattr(os, "killpg", None)
|
||||
if pgid is not None and killpg is not None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue