From e44772314915ecf3ada2674c3f8790e4a6fb8f57 Mon Sep 17 00:00:00 2001 From: valentt Date: Thu, 11 Jun 2026 00:54:11 +0200 Subject: [PATCH] fix(process-registry): re-validate PID identity before killing host processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The background-process registry signalled host PIDs (recovery adoption, detached-session kill, tree-kill) using a number captured at spawn, guarded only by a bare liveness check. Once a session's process exits and is reaped the kernel recycles that PID onto an unrelated process, so an alive-but-different PID passed the check and got tree-killed. Observed in the wild: a recycled background-session PID landed on Firefox's session leader; a later kill/refresh walked its process tree and SIGTERMed every tab — Firefox "closing" at irregular intervals with no crash/coredump. This is the same PID/PGID-recycling class fixed for the MCP orphan reaper in 7bd1f8a2d, but the process_registry subsystem was never guarded — so the bug persisted. Fix: record each host process's kernel start time (/proc//stat field 22) at spawn, persist it in the checkpoint, and re-validate it before every signal via `_host_pid_is_ours`. A PID whose start time no longer matches — or that is gone — is never signalled: - recover_from_checkpoint: a recycled PID is not adopted as a session. - _refresh_detached_session: a recycled detached PID is marked exited. - kill_process / _terminate_host_pid: refuse to tree-kill a stranger. Legacy checkpoints and platforms without /proc (no baseline) degrade to the prior best-effort liveness behaviour, so nothing else changes. Adds TestPidReuseGuard: real-process tests proving a mismatched start time refuses termination while a matching one still kills, plus recovery/refresh recycling paths. 74 registry + 22 MCP-stability tests green. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/tools/test_process_registry.py | 118 +++++++++++++++++++ tools/process_registry.py | 170 +++++++++++++++++++-------- 2 files changed, 241 insertions(+), 47 deletions(-) diff --git a/tests/tools/test_process_registry.py b/tests/tools/test_process_registry.py index 967849a194a..524a977b524 100644 --- a/tests/tools/test_process_registry.py +++ b/tests/tools/test_process_registry.py @@ -1318,3 +1318,121 @@ class TestTerminateHostPidPosix: pr.ProcessRegistry._terminate_host_pid(12345) assert kill_calls == [(12345, signal.SIGTERM)] + + +# ========================================================================= +# PID-reuse guard — a recycled PID/PGID must never be signalled. +# +# Regression: once a background-session process exits and is reaped, the kernel +# can recycle its PID onto an unrelated process (observed in the wild landing on +# a desktop browser's session leader, whose whole tree we then SIGTERMed — +# Firefox dying at irregular intervals). Identity is re-validated via the +# kernel start time captured at spawn before any signal is sent. +# ========================================================================= + +class TestPidReuseGuard: + def test_terminate_refuses_when_start_time_mismatches(self, registry): + """A live PID whose start time changed (recycled) is NOT killed.""" + proc = _spawn_python_sleep(30) + try: + real_start = ProcessRegistry._safe_host_start_time(proc.pid) + assert real_start is not None, "no /proc start time on this platform?" + # Simulate recycling: the recorded baseline no longer matches. + registry._terminate_host_pid(proc.pid, expected_start=real_start + 1) + # The process must still be alive — the guard refused to signal it. + assert not _wait_until(lambda: proc.poll() is not None, timeout=1.0) + assert proc.poll() is None + finally: + proc.kill() + proc.wait() + + def test_terminate_kills_when_start_time_matches(self, registry): + """The genuine process (start time matches) IS terminated.""" + proc = _spawn_python_sleep(30) + try: + real_start = ProcessRegistry._safe_host_start_time(proc.pid) + registry._terminate_host_pid(proc.pid, expected_start=real_start) + assert _wait_until(lambda: proc.poll() is not None, timeout=5.0) + finally: + if proc.poll() is None: + proc.kill() + proc.wait() + + def test_terminate_without_baseline_is_best_effort(self, registry): + """No baseline (legacy) → degrade to prior unconditional behaviour.""" + proc = _spawn_python_sleep(30) + try: + registry._terminate_host_pid(proc.pid) # expected_start=None + assert _wait_until(lambda: proc.poll() is not None, timeout=5.0) + finally: + if proc.poll() is None: + proc.kill() + proc.wait() + + def test_recover_skips_recycled_pid(self, registry, tmp_path): + """Checkpoint PID is alive but its start time changed → not adopted.""" + wrong_start = (ProcessRegistry._safe_host_start_time(os.getpid()) or 0) + 999 + checkpoint = tmp_path / "procs.json" + checkpoint.write_text(json.dumps([{ + "session_id": "proc_recycled", + "command": "sleep 999", + "pid": os.getpid(), # alive... + "pid_scope": "host", + "host_start_time": wrong_start, # ...but a different process now + "task_id": "t1", + }])) + with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint): + assert registry.recover_from_checkpoint() == 0 + assert len(registry._running) == 0 + + def test_recover_adopts_when_start_time_matches(self, registry, tmp_path): + """Checkpoint PID alive AND start time matches → adopted as before.""" + real_start = ProcessRegistry._safe_host_start_time(os.getpid()) + checkpoint = tmp_path / "procs.json" + checkpoint.write_text(json.dumps([{ + "session_id": "proc_match", + "command": "sleep 999", + "pid": os.getpid(), + "pid_scope": "host", + "host_start_time": real_start, + "task_id": "t1", + }])) + with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint): + assert registry.recover_from_checkpoint() == 1 + + def test_legacy_checkpoint_without_start_time_still_recovers(self, registry, tmp_path): + """Entries written before host_start_time existed degrade to liveness.""" + checkpoint = tmp_path / "procs.json" + checkpoint.write_text(json.dumps([{ + "session_id": "proc_legacy", + "command": "sleep 999", + "pid": os.getpid(), + "pid_scope": "host", + "task_id": "t1", + }])) + with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint): + assert registry.recover_from_checkpoint() == 1 + + def test_write_checkpoint_backfills_host_start_time(self, registry, tmp_path): + """A host session is checkpointed with a kernel start time recorded.""" + with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"): + s = _make_session() + s.pid = os.getpid() + s.pid_scope = "host" + registry._running[s.id] = s + registry._write_checkpoint() + data = json.loads((tmp_path / "procs.json").read_text()) + assert data[0]["host_start_time"] is not None + + def test_refresh_detached_marks_recycled_pid_exited(self, registry): + """A detached session whose PID got recycled is moved to finished.""" + wrong_start = (ProcessRegistry._safe_host_start_time(os.getpid()) or 0) + 999 + s = _make_session(sid="proc_detached") + s.pid = os.getpid() # alive, but... + s.pid_scope = "host" + s.detached = True + s.host_start_time = wrong_start # ...identity no longer matches + registry._running[s.id] = s + refreshed = registry._refresh_detached_session(s) + assert refreshed.exited is True + assert s.id in registry._finished diff --git a/tools/process_registry.py b/tools/process_registry.py index a8bd30b083b..3d20e02d56f 100644 --- a/tools/process_registry.py +++ b/tools/process_registry.py @@ -97,7 +97,8 @@ class ProcessSession: process: Optional[subprocess.Popen] = None # Popen handle (local only) env_ref: Any = None # Reference to the environment object cwd: Optional[str] = None # Working directory - started_at: float = 0.0 # time.time() of spawn + started_at: float = 0.0 # time.time() of spawn (wall clock) + host_start_time: Optional[int] = None # kernel start ticks (/proc//stat f22) — PID-reuse guard exited: bool = False # Whether the process has finished exit_code: Optional[int] = None # Exit code (None if still running) completion_reason: str = "exited" # exited|killed|lost|failed_start|already_exited @@ -428,12 +429,47 @@ class ProcessRegistry: from gateway.status import _pid_exists return _pid_exists(pid) + @staticmethod + def _safe_host_start_time(pid: Optional[int]) -> Optional[int]: + """Kernel start ticks for a host PID, or None when unavailable.""" + if not pid: + return None + try: + from gateway.status import get_process_start_time + return get_process_start_time(pid) + except Exception: + return None + + @classmethod + def _host_pid_is_ours(cls, pid: Optional[int], expected_start: Optional[int]) -> bool: + """True only if ``pid`` is alive AND still the process we spawned. + + The kernel recycles PID/PGID numbers once a process exits and is reaped, + so a stored PID can later name an *unrelated* process — observed in the + wild as a recycled number landing on a desktop browser's session leader, + which our tree-kill then SIGTERMs (Firefox dying at irregular intervals). + We compare the kernel start time captured at spawn against the live one; + a mismatch means the number was recycled and must never be signalled. + + When no baseline was captured (legacy checkpoints, or platforms without + ``/proc``) we degrade to a bare liveness check rather than refusing to + act, preserving prior best-effort behaviour. + """ + if not cls._is_host_pid_alive(pid): + return False + if expected_start is None: + return True + return cls._safe_host_start_time(pid) == expected_start + def _refresh_detached_session(self, session: Optional[ProcessSession]) -> Optional[ProcessSession]: """Update recovered host-PID sessions when the underlying process has exited.""" if session is None or session.exited or not session.detached or session.pid_scope != "host": return session - if self._is_host_pid_alive(session.pid): + # Identity-aware liveness: a recycled PID (alive but a different process + # than we spawned) must be treated as "our process exited", so it is + # moved to finished and can never be tree-killed by a later kill(). + if self._host_pid_is_ours(session.pid, session.host_start_time): return session with session._lock: @@ -447,10 +483,16 @@ class ProcessRegistry: self._move_to_finished(session) return session - @staticmethod - def _terminate_host_pid(pid: int) -> None: + @classmethod + def _terminate_host_pid(cls, pid: int, expected_start: Optional[int] = None) -> None: """Terminate a host-visible PID and its descendants. + ``expected_start`` is the kernel start time captured when we spawned the + process. When provided, it is re-validated against the live PID before + any signal is sent; a mismatch (or a dead PID) means the number was + recycled onto an unrelated process and we refuse to touch it, so a stale + background-session PID can never tree-kill a browser or other stranger. + POSIX: walks the process tree with ``psutil`` and SIGTERMs children before the parent so subprocess trees (e.g. Chromium renderers/GPU helpers spawned by an ``agent-browser`` daemon) @@ -479,6 +521,15 @@ class ProcessRegistry: POSIX and a missing ``taskkill.exe`` on Windows (effectively unreachable on real Windows installs, but cheap insurance). """ + if expected_start is not None and not cls._host_pid_is_ours(pid, expected_start): + # PID was recycled (start time changed) or is gone — never signal a + # stranger. A leaked orphan is strictly preferable to killing e.g. + # a browser whose session leader reused this dead session's PID. + logger.warning( + "Refusing to terminate host pid %d: start-time mismatch — " + "PID was recycled onto an unrelated process.", pid, + ) + return if _IS_WINDOWS: try: subprocess.run( @@ -573,6 +624,7 @@ class ProcessRegistry: dimensions=(30, 120), ) session.pid = pty_proc.pid + session.host_start_time = self._safe_host_start_time(session.pid) # Store the pty handle on the session for read/write session._pty = pty_proc @@ -625,6 +677,7 @@ class ProcessRegistry: session.process = proc session.pid = proc.pid + session.host_start_time = self._safe_host_start_time(session.pid) try: # Start output reader thread @@ -1239,7 +1292,10 @@ class ProcessRegistry: # Non-local -- kill inside sandbox session.env_ref.execute(f"kill {session.pid} 2>/dev/null", timeout=5) elif session.detached and session.pid_scope == "host" and session.pid: - if not self._is_host_pid_alive(session.pid): + # Identity check, not bare liveness: if the PID is gone OR was + # recycled onto an unrelated process, treat our process as + # exited and never tree-kill the stranger. + if not self._host_pid_is_ours(session.pid, session.host_start_time): with session._lock: session.exited = True session.exit_code = None @@ -1248,7 +1304,7 @@ class ProcessRegistry: "status": "already_exited", "exit_code": session.exit_code, } - self._terminate_host_pid(session.pid) + self._terminate_host_pid(session.pid, session.host_start_time) else: return { "status": "error", @@ -1461,11 +1517,17 @@ class ProcessRegistry: entries = [] for s in self._running.values(): if not s.exited: + # Lazily backfill the kernel start time for host PIDs so + # recovery after restart can detect PID recycling even + # for sessions spawned before this field existed. + if s.host_start_time is None and s.pid_scope == "host" and s.pid: + s.host_start_time = self._safe_host_start_time(s.pid) entries.append({ "session_id": s.id, "command": s.command, "pid": s.pid, "pid_scope": s.pid_scope, + "host_start_time": s.host_start_time, "cwd": s.cwd, "started_at": s.started_at, "task_id": s.task_id, @@ -1520,49 +1582,63 @@ class ProcessRegistry: ) continue - # Check if PID is still alive - alive = self._is_host_pid_alive(pid) + # The PID must be alive AND still the same process we spawned. A + # bare liveness check is unsafe: across a restart (especially a + # reboot or long uptime) the kernel may have recycled this number + # onto an unrelated process — adopting it would let a later kill or + # watcher tree-kill a stranger (e.g. a browser). Re-validate the + # kernel start time recorded in the checkpoint. + recorded_start = entry.get("host_start_time") + if not self._host_pid_is_ours(pid, recorded_start): + if self._is_host_pid_alive(pid): + logger.info( + "Not recovering session %s: pid %d is alive but its " + "start time no longer matches — PID was recycled onto " + "an unrelated process; refusing to adopt it.", + entry.get("session_id", "?"), pid, + ) + continue - if alive: - session = ProcessSession( - id=entry["session_id"], - command=entry.get("command", "unknown"), - task_id=entry.get("task_id", ""), - session_key=entry.get("session_key", ""), - pid=pid, - pid_scope=pid_scope, - cwd=entry.get("cwd"), - started_at=entry.get("started_at", time.time()), - detached=True, # Can't read output, but can report status + kill - watcher_platform=entry.get("watcher_platform", ""), - watcher_chat_id=entry.get("watcher_chat_id", ""), - watcher_user_id=entry.get("watcher_user_id", ""), - watcher_user_name=entry.get("watcher_user_name", ""), - watcher_thread_id=entry.get("watcher_thread_id", ""), - watcher_message_id=entry.get("watcher_message_id", ""), - watcher_interval=entry.get("watcher_interval", 0), - notify_on_complete=entry.get("notify_on_complete", False), - watch_patterns=entry.get("watch_patterns", []), - ) - with self._lock: - self._running[session.id] = session - recovered += 1 - logger.info("Recovered detached process: %s (pid=%d)", session.command[:60], pid) + session = ProcessSession( + id=entry["session_id"], + command=entry.get("command", "unknown"), + task_id=entry.get("task_id", ""), + session_key=entry.get("session_key", ""), + pid=pid, + host_start_time=recorded_start, + pid_scope=pid_scope, + cwd=entry.get("cwd"), + started_at=entry.get("started_at", time.time()), + detached=True, # Can't read output, but can report status + kill + watcher_platform=entry.get("watcher_platform", ""), + watcher_chat_id=entry.get("watcher_chat_id", ""), + watcher_user_id=entry.get("watcher_user_id", ""), + watcher_user_name=entry.get("watcher_user_name", ""), + watcher_thread_id=entry.get("watcher_thread_id", ""), + watcher_message_id=entry.get("watcher_message_id", ""), + watcher_interval=entry.get("watcher_interval", 0), + notify_on_complete=entry.get("notify_on_complete", False), + watch_patterns=entry.get("watch_patterns", []), + ) + with self._lock: + self._running[session.id] = session + recovered += 1 + logger.info("Recovered detached process: %s (pid=%d)", session.command[:60], pid) - # Re-enqueue watcher so gateway can resume notifications - if session.watcher_interval > 0: - self.pending_watchers.append({ - "session_id": session.id, - "check_interval": session.watcher_interval, - "session_key": session.session_key, - "platform": session.watcher_platform, - "chat_id": session.watcher_chat_id, - "user_id": session.watcher_user_id, - "user_name": session.watcher_user_name, - "thread_id": session.watcher_thread_id, - "message_id": session.watcher_message_id, - "notify_on_complete": session.notify_on_complete, - }) + # Re-enqueue watcher so gateway can resume notifications + if session.watcher_interval > 0: + self.pending_watchers.append({ + "session_id": session.id, + "check_interval": session.watcher_interval, + "session_key": session.session_key, + "platform": session.watcher_platform, + "chat_id": session.watcher_chat_id, + "user_id": session.watcher_user_id, + "user_name": session.watcher_user_name, + "thread_id": session.watcher_thread_id, + "message_id": session.watcher_message_id, + "notify_on_complete": session.notify_on_complete, + }) self._write_checkpoint()