mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(process-registry): re-validate PID identity before killing host processes
The background-process registry signalled host PIDs (recovery adoption,
detached-session kill, tree-kill) using a number captured at spawn, guarded
only by a bare liveness check. Once a session's process exits and is reaped the
kernel recycles that PID onto an unrelated process, so an alive-but-different
PID passed the check and got tree-killed.
Observed in the wild: a recycled background-session PID landed on Firefox's
session leader; a later kill/refresh walked its process tree and SIGTERMed
every tab — Firefox "closing" at irregular intervals with no crash/coredump.
This is the same PID/PGID-recycling class fixed for the MCP orphan reaper in
7bd1f8a2d, but the process_registry subsystem was never guarded — so the bug
persisted.
Fix: record each host process's kernel start time (/proc/<pid>/stat field 22)
at spawn, persist it in the checkpoint, and re-validate it before every signal
via `_host_pid_is_ours`. A PID whose start time no longer matches — or that is
gone — is never signalled:
- recover_from_checkpoint: a recycled PID is not adopted as a session.
- _refresh_detached_session: a recycled detached PID is marked exited.
- kill_process / _terminate_host_pid: refuse to tree-kill a stranger.
Legacy checkpoints and platforms without /proc (no baseline) degrade to the
prior best-effort liveness behaviour, so nothing else changes.
Adds TestPidReuseGuard: real-process tests proving a mismatched start time
refuses termination while a matching one still kills, plus recovery/refresh
recycling paths. 74 registry + 22 MCP-stability tests green.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
84e1d31e54
commit
e447723149
2 changed files with 241 additions and 47 deletions
|
|
@ -1318,3 +1318,121 @@ class TestTerminateHostPidPosix:
|
|||
pr.ProcessRegistry._terminate_host_pid(12345)
|
||||
|
||||
assert kill_calls == [(12345, signal.SIGTERM)]
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# PID-reuse guard — a recycled PID/PGID must never be signalled.
|
||||
#
|
||||
# Regression: once a background-session process exits and is reaped, the kernel
|
||||
# can recycle its PID onto an unrelated process (observed in the wild landing on
|
||||
# a desktop browser's session leader, whose whole tree we then SIGTERMed —
|
||||
# Firefox dying at irregular intervals). Identity is re-validated via the
|
||||
# kernel start time captured at spawn before any signal is sent.
|
||||
# =========================================================================
|
||||
|
||||
class TestPidReuseGuard:
|
||||
def test_terminate_refuses_when_start_time_mismatches(self, registry):
|
||||
"""A live PID whose start time changed (recycled) is NOT killed."""
|
||||
proc = _spawn_python_sleep(30)
|
||||
try:
|
||||
real_start = ProcessRegistry._safe_host_start_time(proc.pid)
|
||||
assert real_start is not None, "no /proc start time on this platform?"
|
||||
# Simulate recycling: the recorded baseline no longer matches.
|
||||
registry._terminate_host_pid(proc.pid, expected_start=real_start + 1)
|
||||
# The process must still be alive — the guard refused to signal it.
|
||||
assert not _wait_until(lambda: proc.poll() is not None, timeout=1.0)
|
||||
assert proc.poll() is None
|
||||
finally:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_terminate_kills_when_start_time_matches(self, registry):
|
||||
"""The genuine process (start time matches) IS terminated."""
|
||||
proc = _spawn_python_sleep(30)
|
||||
try:
|
||||
real_start = ProcessRegistry._safe_host_start_time(proc.pid)
|
||||
registry._terminate_host_pid(proc.pid, expected_start=real_start)
|
||||
assert _wait_until(lambda: proc.poll() is not None, timeout=5.0)
|
||||
finally:
|
||||
if proc.poll() is None:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_terminate_without_baseline_is_best_effort(self, registry):
|
||||
"""No baseline (legacy) → degrade to prior unconditional behaviour."""
|
||||
proc = _spawn_python_sleep(30)
|
||||
try:
|
||||
registry._terminate_host_pid(proc.pid) # expected_start=None
|
||||
assert _wait_until(lambda: proc.poll() is not None, timeout=5.0)
|
||||
finally:
|
||||
if proc.poll() is None:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_recover_skips_recycled_pid(self, registry, tmp_path):
|
||||
"""Checkpoint PID is alive but its start time changed → not adopted."""
|
||||
wrong_start = (ProcessRegistry._safe_host_start_time(os.getpid()) or 0) + 999
|
||||
checkpoint = tmp_path / "procs.json"
|
||||
checkpoint.write_text(json.dumps([{
|
||||
"session_id": "proc_recycled",
|
||||
"command": "sleep 999",
|
||||
"pid": os.getpid(), # alive...
|
||||
"pid_scope": "host",
|
||||
"host_start_time": wrong_start, # ...but a different process now
|
||||
"task_id": "t1",
|
||||
}]))
|
||||
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
||||
assert registry.recover_from_checkpoint() == 0
|
||||
assert len(registry._running) == 0
|
||||
|
||||
def test_recover_adopts_when_start_time_matches(self, registry, tmp_path):
|
||||
"""Checkpoint PID alive AND start time matches → adopted as before."""
|
||||
real_start = ProcessRegistry._safe_host_start_time(os.getpid())
|
||||
checkpoint = tmp_path / "procs.json"
|
||||
checkpoint.write_text(json.dumps([{
|
||||
"session_id": "proc_match",
|
||||
"command": "sleep 999",
|
||||
"pid": os.getpid(),
|
||||
"pid_scope": "host",
|
||||
"host_start_time": real_start,
|
||||
"task_id": "t1",
|
||||
}]))
|
||||
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
||||
assert registry.recover_from_checkpoint() == 1
|
||||
|
||||
def test_legacy_checkpoint_without_start_time_still_recovers(self, registry, tmp_path):
|
||||
"""Entries written before host_start_time existed degrade to liveness."""
|
||||
checkpoint = tmp_path / "procs.json"
|
||||
checkpoint.write_text(json.dumps([{
|
||||
"session_id": "proc_legacy",
|
||||
"command": "sleep 999",
|
||||
"pid": os.getpid(),
|
||||
"pid_scope": "host",
|
||||
"task_id": "t1",
|
||||
}]))
|
||||
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
||||
assert registry.recover_from_checkpoint() == 1
|
||||
|
||||
def test_write_checkpoint_backfills_host_start_time(self, registry, tmp_path):
|
||||
"""A host session is checkpointed with a kernel start time recorded."""
|
||||
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"):
|
||||
s = _make_session()
|
||||
s.pid = os.getpid()
|
||||
s.pid_scope = "host"
|
||||
registry._running[s.id] = s
|
||||
registry._write_checkpoint()
|
||||
data = json.loads((tmp_path / "procs.json").read_text())
|
||||
assert data[0]["host_start_time"] is not None
|
||||
|
||||
def test_refresh_detached_marks_recycled_pid_exited(self, registry):
|
||||
"""A detached session whose PID got recycled is moved to finished."""
|
||||
wrong_start = (ProcessRegistry._safe_host_start_time(os.getpid()) or 0) + 999
|
||||
s = _make_session(sid="proc_detached")
|
||||
s.pid = os.getpid() # alive, but...
|
||||
s.pid_scope = "host"
|
||||
s.detached = True
|
||||
s.host_start_time = wrong_start # ...identity no longer matches
|
||||
registry._running[s.id] = s
|
||||
refreshed = registry._refresh_detached_session(s)
|
||||
assert refreshed.exited is True
|
||||
assert s.id in registry._finished
|
||||
|
|
|
|||
|
|
@ -97,7 +97,8 @@ class ProcessSession:
|
|||
process: Optional[subprocess.Popen] = None # Popen handle (local only)
|
||||
env_ref: Any = None # Reference to the environment object
|
||||
cwd: Optional[str] = None # Working directory
|
||||
started_at: float = 0.0 # time.time() of spawn
|
||||
started_at: float = 0.0 # time.time() of spawn (wall clock)
|
||||
host_start_time: Optional[int] = None # kernel start ticks (/proc/<pid>/stat f22) — PID-reuse guard
|
||||
exited: bool = False # Whether the process has finished
|
||||
exit_code: Optional[int] = None # Exit code (None if still running)
|
||||
completion_reason: str = "exited" # exited|killed|lost|failed_start|already_exited
|
||||
|
|
@ -428,12 +429,47 @@ class ProcessRegistry:
|
|||
from gateway.status import _pid_exists
|
||||
return _pid_exists(pid)
|
||||
|
||||
@staticmethod
|
||||
def _safe_host_start_time(pid: Optional[int]) -> Optional[int]:
|
||||
"""Kernel start ticks for a host PID, or None when unavailable."""
|
||||
if not pid:
|
||||
return None
|
||||
try:
|
||||
from gateway.status import get_process_start_time
|
||||
return get_process_start_time(pid)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _host_pid_is_ours(cls, pid: Optional[int], expected_start: Optional[int]) -> bool:
|
||||
"""True only if ``pid`` is alive AND still the process we spawned.
|
||||
|
||||
The kernel recycles PID/PGID numbers once a process exits and is reaped,
|
||||
so a stored PID can later name an *unrelated* process — observed in the
|
||||
wild as a recycled number landing on a desktop browser's session leader,
|
||||
which our tree-kill then SIGTERMs (Firefox dying at irregular intervals).
|
||||
We compare the kernel start time captured at spawn against the live one;
|
||||
a mismatch means the number was recycled and must never be signalled.
|
||||
|
||||
When no baseline was captured (legacy checkpoints, or platforms without
|
||||
``/proc``) we degrade to a bare liveness check rather than refusing to
|
||||
act, preserving prior best-effort behaviour.
|
||||
"""
|
||||
if not cls._is_host_pid_alive(pid):
|
||||
return False
|
||||
if expected_start is None:
|
||||
return True
|
||||
return cls._safe_host_start_time(pid) == expected_start
|
||||
|
||||
def _refresh_detached_session(self, session: Optional[ProcessSession]) -> Optional[ProcessSession]:
|
||||
"""Update recovered host-PID sessions when the underlying process has exited."""
|
||||
if session is None or session.exited or not session.detached or session.pid_scope != "host":
|
||||
return session
|
||||
|
||||
if self._is_host_pid_alive(session.pid):
|
||||
# Identity-aware liveness: a recycled PID (alive but a different process
|
||||
# than we spawned) must be treated as "our process exited", so it is
|
||||
# moved to finished and can never be tree-killed by a later kill().
|
||||
if self._host_pid_is_ours(session.pid, session.host_start_time):
|
||||
return session
|
||||
|
||||
with session._lock:
|
||||
|
|
@ -447,10 +483,16 @@ class ProcessRegistry:
|
|||
self._move_to_finished(session)
|
||||
return session
|
||||
|
||||
@staticmethod
|
||||
def _terminate_host_pid(pid: int) -> None:
|
||||
@classmethod
|
||||
def _terminate_host_pid(cls, pid: int, expected_start: Optional[int] = None) -> None:
|
||||
"""Terminate a host-visible PID and its descendants.
|
||||
|
||||
``expected_start`` is the kernel start time captured when we spawned the
|
||||
process. When provided, it is re-validated against the live PID before
|
||||
any signal is sent; a mismatch (or a dead PID) means the number was
|
||||
recycled onto an unrelated process and we refuse to touch it, so a stale
|
||||
background-session PID can never tree-kill a browser or other stranger.
|
||||
|
||||
POSIX: walks the process tree with ``psutil`` and SIGTERMs
|
||||
children before the parent so subprocess trees (e.g. Chromium
|
||||
renderers/GPU helpers spawned by an ``agent-browser`` daemon)
|
||||
|
|
@ -479,6 +521,15 @@ class ProcessRegistry:
|
|||
POSIX and a missing ``taskkill.exe`` on Windows (effectively
|
||||
unreachable on real Windows installs, but cheap insurance).
|
||||
"""
|
||||
if expected_start is not None and not cls._host_pid_is_ours(pid, expected_start):
|
||||
# PID was recycled (start time changed) or is gone — never signal a
|
||||
# stranger. A leaked orphan is strictly preferable to killing e.g.
|
||||
# a browser whose session leader reused this dead session's PID.
|
||||
logger.warning(
|
||||
"Refusing to terminate host pid %d: start-time mismatch — "
|
||||
"PID was recycled onto an unrelated process.", pid,
|
||||
)
|
||||
return
|
||||
if _IS_WINDOWS:
|
||||
try:
|
||||
subprocess.run(
|
||||
|
|
@ -573,6 +624,7 @@ class ProcessRegistry:
|
|||
dimensions=(30, 120),
|
||||
)
|
||||
session.pid = pty_proc.pid
|
||||
session.host_start_time = self._safe_host_start_time(session.pid)
|
||||
# Store the pty handle on the session for read/write
|
||||
session._pty = pty_proc
|
||||
|
||||
|
|
@ -625,6 +677,7 @@ class ProcessRegistry:
|
|||
|
||||
session.process = proc
|
||||
session.pid = proc.pid
|
||||
session.host_start_time = self._safe_host_start_time(session.pid)
|
||||
|
||||
try:
|
||||
# Start output reader thread
|
||||
|
|
@ -1239,7 +1292,10 @@ class ProcessRegistry:
|
|||
# Non-local -- kill inside sandbox
|
||||
session.env_ref.execute(f"kill {session.pid} 2>/dev/null", timeout=5)
|
||||
elif session.detached and session.pid_scope == "host" and session.pid:
|
||||
if not self._is_host_pid_alive(session.pid):
|
||||
# Identity check, not bare liveness: if the PID is gone OR was
|
||||
# recycled onto an unrelated process, treat our process as
|
||||
# exited and never tree-kill the stranger.
|
||||
if not self._host_pid_is_ours(session.pid, session.host_start_time):
|
||||
with session._lock:
|
||||
session.exited = True
|
||||
session.exit_code = None
|
||||
|
|
@ -1248,7 +1304,7 @@ class ProcessRegistry:
|
|||
"status": "already_exited",
|
||||
"exit_code": session.exit_code,
|
||||
}
|
||||
self._terminate_host_pid(session.pid)
|
||||
self._terminate_host_pid(session.pid, session.host_start_time)
|
||||
else:
|
||||
return {
|
||||
"status": "error",
|
||||
|
|
@ -1461,11 +1517,17 @@ class ProcessRegistry:
|
|||
entries = []
|
||||
for s in self._running.values():
|
||||
if not s.exited:
|
||||
# Lazily backfill the kernel start time for host PIDs so
|
||||
# recovery after restart can detect PID recycling even
|
||||
# for sessions spawned before this field existed.
|
||||
if s.host_start_time is None and s.pid_scope == "host" and s.pid:
|
||||
s.host_start_time = self._safe_host_start_time(s.pid)
|
||||
entries.append({
|
||||
"session_id": s.id,
|
||||
"command": s.command,
|
||||
"pid": s.pid,
|
||||
"pid_scope": s.pid_scope,
|
||||
"host_start_time": s.host_start_time,
|
||||
"cwd": s.cwd,
|
||||
"started_at": s.started_at,
|
||||
"task_id": s.task_id,
|
||||
|
|
@ -1520,49 +1582,63 @@ class ProcessRegistry:
|
|||
)
|
||||
continue
|
||||
|
||||
# Check if PID is still alive
|
||||
alive = self._is_host_pid_alive(pid)
|
||||
# The PID must be alive AND still the same process we spawned. A
|
||||
# bare liveness check is unsafe: across a restart (especially a
|
||||
# reboot or long uptime) the kernel may have recycled this number
|
||||
# onto an unrelated process — adopting it would let a later kill or
|
||||
# watcher tree-kill a stranger (e.g. a browser). Re-validate the
|
||||
# kernel start time recorded in the checkpoint.
|
||||
recorded_start = entry.get("host_start_time")
|
||||
if not self._host_pid_is_ours(pid, recorded_start):
|
||||
if self._is_host_pid_alive(pid):
|
||||
logger.info(
|
||||
"Not recovering session %s: pid %d is alive but its "
|
||||
"start time no longer matches — PID was recycled onto "
|
||||
"an unrelated process; refusing to adopt it.",
|
||||
entry.get("session_id", "?"), pid,
|
||||
)
|
||||
continue
|
||||
|
||||
if alive:
|
||||
session = ProcessSession(
|
||||
id=entry["session_id"],
|
||||
command=entry.get("command", "unknown"),
|
||||
task_id=entry.get("task_id", ""),
|
||||
session_key=entry.get("session_key", ""),
|
||||
pid=pid,
|
||||
pid_scope=pid_scope,
|
||||
cwd=entry.get("cwd"),
|
||||
started_at=entry.get("started_at", time.time()),
|
||||
detached=True, # Can't read output, but can report status + kill
|
||||
watcher_platform=entry.get("watcher_platform", ""),
|
||||
watcher_chat_id=entry.get("watcher_chat_id", ""),
|
||||
watcher_user_id=entry.get("watcher_user_id", ""),
|
||||
watcher_user_name=entry.get("watcher_user_name", ""),
|
||||
watcher_thread_id=entry.get("watcher_thread_id", ""),
|
||||
watcher_message_id=entry.get("watcher_message_id", ""),
|
||||
watcher_interval=entry.get("watcher_interval", 0),
|
||||
notify_on_complete=entry.get("notify_on_complete", False),
|
||||
watch_patterns=entry.get("watch_patterns", []),
|
||||
)
|
||||
with self._lock:
|
||||
self._running[session.id] = session
|
||||
recovered += 1
|
||||
logger.info("Recovered detached process: %s (pid=%d)", session.command[:60], pid)
|
||||
session = ProcessSession(
|
||||
id=entry["session_id"],
|
||||
command=entry.get("command", "unknown"),
|
||||
task_id=entry.get("task_id", ""),
|
||||
session_key=entry.get("session_key", ""),
|
||||
pid=pid,
|
||||
host_start_time=recorded_start,
|
||||
pid_scope=pid_scope,
|
||||
cwd=entry.get("cwd"),
|
||||
started_at=entry.get("started_at", time.time()),
|
||||
detached=True, # Can't read output, but can report status + kill
|
||||
watcher_platform=entry.get("watcher_platform", ""),
|
||||
watcher_chat_id=entry.get("watcher_chat_id", ""),
|
||||
watcher_user_id=entry.get("watcher_user_id", ""),
|
||||
watcher_user_name=entry.get("watcher_user_name", ""),
|
||||
watcher_thread_id=entry.get("watcher_thread_id", ""),
|
||||
watcher_message_id=entry.get("watcher_message_id", ""),
|
||||
watcher_interval=entry.get("watcher_interval", 0),
|
||||
notify_on_complete=entry.get("notify_on_complete", False),
|
||||
watch_patterns=entry.get("watch_patterns", []),
|
||||
)
|
||||
with self._lock:
|
||||
self._running[session.id] = session
|
||||
recovered += 1
|
||||
logger.info("Recovered detached process: %s (pid=%d)", session.command[:60], pid)
|
||||
|
||||
# Re-enqueue watcher so gateway can resume notifications
|
||||
if session.watcher_interval > 0:
|
||||
self.pending_watchers.append({
|
||||
"session_id": session.id,
|
||||
"check_interval": session.watcher_interval,
|
||||
"session_key": session.session_key,
|
||||
"platform": session.watcher_platform,
|
||||
"chat_id": session.watcher_chat_id,
|
||||
"user_id": session.watcher_user_id,
|
||||
"user_name": session.watcher_user_name,
|
||||
"thread_id": session.watcher_thread_id,
|
||||
"message_id": session.watcher_message_id,
|
||||
"notify_on_complete": session.notify_on_complete,
|
||||
})
|
||||
# Re-enqueue watcher so gateway can resume notifications
|
||||
if session.watcher_interval > 0:
|
||||
self.pending_watchers.append({
|
||||
"session_id": session.id,
|
||||
"check_interval": session.watcher_interval,
|
||||
"session_key": session.session_key,
|
||||
"platform": session.watcher_platform,
|
||||
"chat_id": session.watcher_chat_id,
|
||||
"user_id": session.watcher_user_id,
|
||||
"user_name": session.watcher_user_name,
|
||||
"thread_id": session.watcher_thread_id,
|
||||
"message_id": session.watcher_message_id,
|
||||
"notify_on_complete": session.notify_on_complete,
|
||||
})
|
||||
|
||||
self._write_checkpoint()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue