fix(browser): use process-tree termination for daemon cleanup

os.kill(pid, SIGTERM) only signals the parent, leaving Chromium child
    processes (renderer, GPU, etc.) orphaned.  Reuse the existing
    ProcessRegistry._terminate_host_pid() helper which walks the process
    tree leaf-up via psutil, terminating children before the parent.
This commit is contained in:
Yuan Li 2026-05-20 19:24:57 +08:00 committed by Teknium
parent 72ff3e909c
commit 22f3f5a75a
2 changed files with 40 additions and 37 deletions

View file

@ -72,7 +72,7 @@ class TestReapOrphanedBrowserSessions:
assert not d.exists()
def test_orphaned_alive_daemon_is_killed(self, fake_tmpdir):
"""Alive daemon not tracked by _active_sessions gets SIGTERM (legacy path).
"""Alive daemon not tracked by _active_sessions is terminated (legacy path).
No owner_pid file => falls back to tracked_names check.
"""
@ -82,18 +82,17 @@ class TestReapOrphanedBrowserSessions:
kill_calls = []
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
# Don't actually kill anything
def mock_terminate(pid):
kill_calls.append(pid)
# Post-#21561 the liveness probe goes through
# ``gateway.status._pid_exists`` (which wraps ``psutil.pid_exists``
# so it's safe on Windows — ``os.kill(pid, 0)`` is bpo-14484).
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
assert (12345, signal.SIGTERM) in kill_calls
assert 12345 in kill_calls
def test_tracked_session_is_not_reaped(self, fake_tmpdir):
"""Sessions tracked in _active_sessions are left alone (legacy path)."""
@ -108,13 +107,13 @@ class TestReapOrphanedBrowserSessions:
kill_calls = []
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
def mock_terminate(pid):
kill_calls.append(pid)
with patch("os.kill", side_effect=mock_kill):
with patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
# Should NOT have tried to kill anything
# Should NOT have tried to terminate anything
assert len(kill_calls) == 0
# Dir should still exist
assert d.exists()
@ -126,23 +125,24 @@ class TestReapOrphanedBrowserSessions:
``gateway.status._pid_exists`` (which wraps ``psutil.pid_exists``
because ``os.kill(pid, 0)`` is a footgun on Windows bpo-14484).
With no owner_pid file and no tracked-name entry, the reaper
SIGTERMs the daemon and removes its socket dir regardless of
whether SIGTERM succeeded (best-effort semantics).
terminates the daemon (and its process tree) and removes its socket
dir regardless of whether termination succeeded (best-effort
semantics).
"""
from tools.browser_tool import _reap_orphaned_browser_sessions
d = _make_socket_dir(fake_tmpdir, "h_perm1234567", pid=12345)
sigterm_calls = []
terminate_calls = []
def mock_kill(pid, sig):
sigterm_calls.append((pid, sig))
def mock_terminate(pid):
terminate_calls.append(pid)
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
assert (12345, signal.SIGTERM) in sigterm_calls
assert 12345 in terminate_calls
assert not d.exists()
def test_cdp_sessions_are_also_reaped(self, fake_tmpdir):
@ -203,15 +203,15 @@ class TestOwnerPidCrossProcess:
kill_calls = []
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
def mock_terminate(pid):
kill_calls.append(pid)
# Owner alive → reaper skips without ever probing the daemon.
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
assert (12345, signal.SIGTERM) not in kill_calls
assert 12345 not in kill_calls
assert d.exists()
def test_dead_owner_triggers_reap(self, fake_tmpdir):
@ -225,17 +225,17 @@ class TestOwnerPidCrossProcess:
kill_calls = []
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
def mock_terminate(pid):
kill_calls.append(pid)
# Owner 999999999 dead, daemon 12345 alive.
pid_alive = {999999999: False, 12345: True}
with patch("gateway.status._pid_exists",
side_effect=lambda pid: pid_alive.get(int(pid), False)), \
patch("os.kill", side_effect=mock_kill):
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
assert (12345, signal.SIGTERM) in kill_calls
assert 12345 in kill_calls
assert not d.exists()
def test_corrupt_owner_pid_falls_back_to_legacy(self, fake_tmpdir):
@ -253,15 +253,15 @@ class TestOwnerPidCrossProcess:
kill_calls = []
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
def mock_terminate(pid):
kill_calls.append(pid)
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
# Legacy path took over → tracked → not reaped
assert (12345, signal.SIGTERM) not in kill_calls
assert 12345 not in kill_calls
assert d.exists()
def test_owner_pid_permission_error_treated_as_alive(self, fake_tmpdir):
@ -280,16 +280,16 @@ class TestOwnerPidCrossProcess:
kill_calls = []
def mock_kill(pid, sig):
kill_calls.append((pid, sig))
def mock_terminate(pid):
kill_calls.append(pid)
# Owner 22222 reported alive (PermissionError collapses to True
# inside _pid_exists). Daemon never probed, never SIGTERMed.
# inside _pid_exists). Daemon never probed, never terminated.
with patch("gateway.status._pid_exists", return_value=True), \
patch("os.kill", side_effect=mock_kill):
patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate):
_reap_orphaned_browser_sessions()
assert (12345, signal.SIGTERM) not in kill_calls
assert 12345 not in kill_calls
assert d.exists()
def test_write_owner_pid_creates_file_with_current_pid(

View file

@ -102,7 +102,6 @@ from plugins.browser.firecrawl.provider import ( # noqa: F401
FirecrawlBrowserProvider as FirecrawlProvider,
)
from tools.tool_backend_helpers import normalize_browser_cloud_provider
# Camofox local anti-detection browser backend (optional).
# When CAMOFOX_URL is set, all browser operations route through the
# camofox REST API instead of the agent-browser CLI.
@ -1386,8 +1385,11 @@ def _reap_orphaned_browser_sessions():
continue
# Daemon is alive and its owner is dead (or legacy + untracked). Reap.
# Use the process-tree termination helper so Chromium children
# (renderer, GPU, etc.) are cleaned up, not just the daemon parent.
try:
os.kill(daemon_pid, signal.SIGTERM)
from tools.process_registry import ProcessRegistry
ProcessRegistry._terminate_host_pid(daemon_pid)
logger.info("Reaped orphaned browser daemon PID %d (session %s)",
daemon_pid, session_name)
reaped += 1
@ -3437,8 +3439,9 @@ def _cleanup_single_browser_session(task_id: str) -> None:
pid_file = os.path.join(socket_dir, f"{session_name}.pid")
if os.path.isfile(pid_file):
try:
from tools.process_registry import ProcessRegistry
daemon_pid = int(Path(pid_file).read_text(encoding="utf-8").strip())
os.kill(daemon_pid, signal.SIGTERM)
ProcessRegistry._terminate_host_pid(daemon_pid)
logger.debug("Killed daemon pid %s for %s", daemon_pid, session_name)
except (ProcessLookupError, ValueError, PermissionError, OSError):
logger.debug("Could not kill daemon pid for %s (already dead or inaccessible)", session_name)