fix(browser): verify daemon identity before orphan reaper kills a PID (#14073)

The browser orphan reaper reads a daemon PID from a `.pid` file in a
world-writable, predictably-named temp dir (`/tmp/agent-browser-h_*`) it
does not write itself, then tree-kills that PID via `_terminate_host_pid`
after only a liveness check. A same-user actor could plant a fake socket
dir whose `.pid` points at an arbitrary victim process, and OS PID reuse
after the real daemon exits could land the recorded PID on an unrelated
process — either way an arbitrary same-user process (and its whole tree)
gets SIGTERMed. Local DoS.

Add `_verify_reapable_browser_daemon()`, gated before the kill: via psutil
(a hard dep, fine cross-platform for the same-user processes the reaper can
signal) require both (1) identity — `agent-browser` in the process
name/cmdline — and (2) binding — the live process references *this* session's
socket dir in its cmdline or `AGENT_BROWSER_SOCKET_DIR`. The binding check is
the real spoof defense: a planted/recycled PID won't embed our exact socket
path. Fail-closed on any ambiguity (unreadable cmdline, no match), leaving the
process and its socket dir untouched for a later sweep.

Builds on @sgaofen's fix in #14394 (cmdline identity check); rewritten to use
psutil instead of `/proc`+`ps` (cross-platform, Windows-covered) and to add
the session-socket-dir binding check for recycled-PID / spoof resistance.

Co-authored-by: sgaofen <135070653+sgaofen@users.noreply.github.com>
This commit is contained in:
teknium1 2026-06-21 13:39:41 -07:00 committed by Teknium
parent 4d4ba0831e
commit 624580e836
2 changed files with 229 additions and 0 deletions

View file

@ -85,7 +85,10 @@ class TestReapOrphanedBrowserSessions:
# Post-#21561 the liveness probe goes through
# ``gateway.status._pid_exists`` (which wraps ``psutil.pid_exists``
# so it's safe on Windows — ``os.kill(pid, 0)`` is bpo-14484).
# The identity guard (#14073) is mocked True here — its own behavior
# is covered by TestReaperIdentityGuard below.
with patch("gateway.status._pid_exists", return_value=True), \
patch("tools.browser_tool._verify_reapable_browser_daemon", return_value=True), \
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
@ -136,6 +139,7 @@ class TestReapOrphanedBrowserSessions:
terminate_calls.append(pid)
with patch("gateway.status._pid_exists", return_value=True), \
patch("tools.browser_tool._verify_reapable_browser_daemon", return_value=True), \
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
@ -229,6 +233,7 @@ class TestOwnerPidCrossProcess:
pid_alive = {999999999: False, 12345: True}
with patch("gateway.status._pid_exists",
side_effect=lambda pid: pid_alive.get(int(pid), False)), \
patch("tools.browser_tool._verify_reapable_browser_daemon", return_value=True), \
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
@ -380,6 +385,133 @@ class TestOwnerPidCrossProcess:
assert session_name in socket_dir_arg
class TestReaperIdentityGuard:
"""Tests for _verify_reapable_browser_daemon — the #14073 fix.
The reaper reads daemon PIDs from world-writable, predictably-named temp
dirs. Before tree-killing a live PID it must confirm the process really is
*this* session's agent-browser daemon, defeating planted pid files and
recycled PIDs that would otherwise become an arbitrary same-user DoS.
"""
class _FakeProc:
def __init__(self, name="agent-browser", cmdline=None, environ=None,
raise_environ=False):
self._name = name
self._cmdline = cmdline if cmdline is not None else []
self._environ = environ or {}
self._raise_environ = raise_environ
def name(self):
return self._name
def cmdline(self):
return self._cmdline
def environ(self):
if self._raise_environ:
import psutil
raise psutil.AccessDenied()
return self._environ
def _run(self, fake_proc, socket_dir, session_name="h_sess123456",
daemon_pid=12345, no_such=False, access_denied=False):
import psutil
from tools.browser_tool import _verify_reapable_browser_daemon
def _factory(pid):
if no_such:
raise psutil.NoSuchProcess(pid)
if access_denied:
raise psutil.AccessDenied(pid)
return fake_proc
with patch("psutil.Process", side_effect=_factory):
return _verify_reapable_browser_daemon(
daemon_pid, socket_dir, session_name)
def test_real_daemon_bound_via_cmdline_is_reapable(self):
socket_dir = "/tmp/agent-browser-h_sess123456"
proc = self._FakeProc(
name="agent-browser",
cmdline=["agent-browser", "open", "--session", "h_sess123456",
"--socket-dir", socket_dir],
)
assert self._run(proc, socket_dir) is True
def test_daemon_bound_via_environ_is_reapable(self):
socket_dir = "/tmp/agent-browser-h_sess123456"
proc = self._FakeProc(
name="agent-browser-linux-x64",
cmdline=["agent-browser-linux-x64", "daemon"], # no dir in cmd
environ={"AGENT_BROWSER_SOCKET_DIR": socket_dir},
)
assert self._run(proc, socket_dir) is True
def test_planted_pid_for_non_browser_process_is_refused(self):
"""A planted .pid pointing at e.g. `sleep 600` must NOT be reaped."""
socket_dir = "/tmp/agent-browser-h_sess123456"
proc = self._FakeProc(name="sleep", cmdline=["/bin/sleep", "600"])
assert self._run(proc, socket_dir) is False
def test_recycled_pid_browser_not_bound_to_our_dir_is_refused(self):
"""An agent-browser process for a DIFFERENT session must not be reaped.
Models PID reuse / a concurrent unrelated daemon: it looks like
agent-browser but is bound to another socket dir.
"""
socket_dir = "/tmp/agent-browser-h_sess123456"
proc = self._FakeProc(
name="agent-browser",
cmdline=["agent-browser", "open", "--session", "h_OTHER999",
"--socket-dir", "/tmp/agent-browser-h_OTHER999"],
environ={"AGENT_BROWSER_SOCKET_DIR":
"/tmp/agent-browser-h_OTHER999"},
)
assert self._run(proc, socket_dir) is False
def test_browser_name_but_environ_denied_and_no_cmdline_bind_refused(self):
"""Looks like browser, cmdline doesn't bind, environ() denied -> refuse."""
socket_dir = "/tmp/agent-browser-h_sess123456"
proc = self._FakeProc(
name="agent-browser",
cmdline=["agent-browser", "daemon"], # no dir
raise_environ=True,
)
assert self._run(proc, socket_dir) is False
def test_vanished_process_is_not_reapable(self):
socket_dir = "/tmp/agent-browser-h_sess123456"
assert self._run(None, socket_dir, no_such=True) is False
def test_access_denied_on_identity_read_refuses(self):
socket_dir = "/tmp/agent-browser-h_sess123456"
assert self._run(None, socket_dir, access_denied=True) is False
def test_planted_pid_survives_full_reaper_path(self, fake_tmpdir):
"""End-to-end through the reaper: a planted non-browser PID is spared.
No owner_pid (legacy path), not tracked, PID 'alive' but the live
process is `sleep`, not agent-browser, so it must be left alone and the
socket dir retained.
"""
from tools.browser_tool import _reap_orphaned_browser_sessions
d = _make_socket_dir(fake_tmpdir, "h_planted9999", pid=12345)
terminate_calls = []
proc = self._FakeProc(name="sleep", cmdline=["/bin/sleep", "600"])
with patch("gateway.status._pid_exists", return_value=True), \
patch("psutil.Process", return_value=proc), \
patch("tools.process_registry.ProcessRegistry._terminate_host_pid",
side_effect=lambda pid: terminate_calls.append(pid)):
_reap_orphaned_browser_sessions()
assert terminate_calls == [], "planted non-browser PID must not be killed"
assert d.exists(), "socket dir retained for a later sweep"
class TestEmergencyCleanupRunsReaper:
"""Verify atexit-registered cleanup sweeps orphans even without an active session."""

View file

@ -1320,6 +1320,92 @@ def _write_owner_pid(socket_dir: str, session_name: str) -> None:
session_name, exc)
def _verify_reapable_browser_daemon(daemon_pid: int, socket_dir: str,
session_name: str) -> bool:
"""Confirm a live PID is genuinely *this* session's agent-browser daemon.
The orphan reaper scans world-writable, predictably-named temp paths
(``/tmp/agent-browser-h_*`` etc.) and reads a daemon PID from a ``.pid``
file we do not write ourselves the agent-browser daemon writes it. A
same-user actor can therefore plant a fake socket dir whose ``.pid`` points
at an arbitrary victim process, or a recycled PID can land on an unrelated
process after the real daemon exits. Either way, terminating that PID
(a *tree* kill via ``_terminate_host_pid``) is an arbitrary-process DoS.
Before reaping we require, via ``psutil`` (a hard dependency, cross-platform
for same-user processes the only processes the reaper can signal):
1. **Identity** the process looks like agent-browser: ``agent-browser``
appears in its name or command line.
2. **Binding** the process is bound to *this* session's socket dir: the
socket dir path (or its basename) appears in the command line, or in
``AGENT_BROWSER_SOCKET_DIR`` in the process environment.
Requirement (2) is the real spoof defense: a planted process pointing at a
victim PID will not have the victim's cmdline/environ referencing our
socket dir. An attacker would need a process that genuinely embeds this
exact session path i.e. a real daemon they already own and could signal
directly. Fail-closed: any ambiguity (unreadable cmdline, no match) means
we refuse to reap and leave the process and its socket dir alone.
Returns ``True`` only when both checks pass.
"""
try:
import psutil
except ImportError: # psutil is a hard dep; defensive only
logger.warning(
"Refusing to reap browser daemon PID %d (session %s): "
"psutil unavailable for identity verification",
daemon_pid, session_name)
return False
try:
proc = psutil.Process(daemon_pid)
name = (proc.name() or "").lower()
cmdline = " ".join(proc.cmdline() or []).lower()
except psutil.NoSuchProcess:
# Vanished between the liveness check and now — nothing to reap.
return False
except (psutil.AccessDenied, OSError) as exc:
logger.warning(
"Refusing to reap browser daemon PID %d (session %s): "
"could not read process identity (%s)",
daemon_pid, session_name, exc)
return False
looks_like_browser = "agent-browser" in name or "agent-browser" in cmdline
if not looks_like_browser:
logger.warning(
"Refusing to reap PID %d (session %s): not an agent-browser "
"process (name=%r)", daemon_pid, session_name, name)
return False
# Binding check: the live process must reference *this* socket dir.
socket_dir_l = socket_dir.lower()
socket_base_l = os.path.basename(socket_dir).lower()
bound = socket_dir_l in cmdline or (
socket_base_l and socket_base_l in cmdline)
if not bound:
try:
env_dir = (proc.environ() or {}).get(
"AGENT_BROWSER_SOCKET_DIR", "")
bound = bool(env_dir) and os.path.normpath(env_dir) == \
os.path.normpath(socket_dir)
except (psutil.AccessDenied, psutil.NoSuchProcess, OSError):
# environ() can be denied even same-user on some platforms.
# cmdline already failed to bind — fail closed.
bound = False
if not bound:
logger.warning(
"Refusing to reap agent-browser PID %d: not bound to session "
"socket dir %s (possible recycled PID or planted pid file)",
daemon_pid, socket_dir)
return False
return True
def _reap_orphaned_browser_sessions():
"""Scan for orphaned agent-browser daemon processes from previous runs.
@ -1415,6 +1501,17 @@ def _reap_orphaned_browser_sessions():
shutil.rmtree(socket_dir, ignore_errors=True)
continue
# The PID is live — but the .pid file lives in a world-writable,
# predictably-named temp dir we don't write ourselves, and PIDs get
# recycled after the real daemon exits. Verify the process really is
# *this* session's agent-browser daemon before tree-killing it; refuse
# otherwise (don't touch the process, leave the socket dir for a later
# sweep once the imposter PID is gone). Fixes the arbitrary same-user
# process DoS in issue #14073.
if not _verify_reapable_browser_daemon(
daemon_pid, socket_dir, session_name):
continue
# Daemon is alive and its owner is dead (or legacy + untracked). Reap.
# Use the process-tree termination helper so Chromium children
# (renderer, GPU, etc.) are cleaned up, not just the daemon parent.