From 22f3f5a75a660ba50ee7bec6a3d2b9eba6613e7e Mon Sep 17 00:00:00 2001 From: Yuan Li Date: Wed, 20 May 2026 19:24:57 +0800 Subject: [PATCH] fix(browser): use process-tree termination for daemon cleanup os.kill(pid, SIGTERM) only signals the parent, leaving Chromium child processes (renderer, GPU, etc.) orphaned. Reuse the existing ProcessRegistry._terminate_host_pid() helper which walks the process tree leaf-up via psutil, terminating children before the parent. --- tests/tools/test_browser_orphan_reaper.py | 68 +++++++++++------------ tools/browser_tool.py | 9 ++- 2 files changed, 40 insertions(+), 37 deletions(-) diff --git a/tests/tools/test_browser_orphan_reaper.py b/tests/tools/test_browser_orphan_reaper.py index 0724cbd6311..edd8bda6c2d 100644 --- a/tests/tools/test_browser_orphan_reaper.py +++ b/tests/tools/test_browser_orphan_reaper.py @@ -72,7 +72,7 @@ class TestReapOrphanedBrowserSessions: assert not d.exists() def test_orphaned_alive_daemon_is_killed(self, fake_tmpdir): - """Alive daemon not tracked by _active_sessions gets SIGTERM (legacy path). + """Alive daemon not tracked by _active_sessions is terminated (legacy path). No owner_pid file => falls back to tracked_names check. """ @@ -82,18 +82,17 @@ class TestReapOrphanedBrowserSessions: kill_calls = [] - def mock_kill(pid, sig): - kill_calls.append((pid, sig)) - # Don't actually kill anything + def mock_terminate(pid): + kill_calls.append(pid) # Post-#21561 the liveness probe goes through # ``gateway.status._pid_exists`` (which wraps ``psutil.pid_exists`` # so it's safe on Windows — ``os.kill(pid, 0)`` is bpo-14484). with patch("gateway.status._pid_exists", return_value=True), \ - patch("os.kill", side_effect=mock_kill): + patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate): _reap_orphaned_browser_sessions() - assert (12345, signal.SIGTERM) in kill_calls + assert 12345 in kill_calls def test_tracked_session_is_not_reaped(self, fake_tmpdir): """Sessions tracked in _active_sessions are left alone (legacy path).""" @@ -108,13 +107,13 @@ class TestReapOrphanedBrowserSessions: kill_calls = [] - def mock_kill(pid, sig): - kill_calls.append((pid, sig)) + def mock_terminate(pid): + kill_calls.append(pid) - with patch("os.kill", side_effect=mock_kill): + with patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate): _reap_orphaned_browser_sessions() - # Should NOT have tried to kill anything + # Should NOT have tried to terminate anything assert len(kill_calls) == 0 # Dir should still exist assert d.exists() @@ -126,23 +125,24 @@ class TestReapOrphanedBrowserSessions: ``gateway.status._pid_exists`` (which wraps ``psutil.pid_exists`` because ``os.kill(pid, 0)`` is a footgun on Windows — bpo-14484). With no owner_pid file and no tracked-name entry, the reaper - SIGTERMs the daemon and removes its socket dir regardless of - whether SIGTERM succeeded (best-effort semantics). + terminates the daemon (and its process tree) and removes its socket + dir regardless of whether termination succeeded (best-effort + semantics). """ from tools.browser_tool import _reap_orphaned_browser_sessions d = _make_socket_dir(fake_tmpdir, "h_perm1234567", pid=12345) - sigterm_calls = [] + terminate_calls = [] - def mock_kill(pid, sig): - sigterm_calls.append((pid, sig)) + def mock_terminate(pid): + terminate_calls.append(pid) with patch("gateway.status._pid_exists", return_value=True), \ - patch("os.kill", side_effect=mock_kill): + patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate): _reap_orphaned_browser_sessions() - assert (12345, signal.SIGTERM) in sigterm_calls + assert 12345 in terminate_calls assert not d.exists() def test_cdp_sessions_are_also_reaped(self, fake_tmpdir): @@ -203,15 +203,15 @@ class TestOwnerPidCrossProcess: kill_calls = [] - def mock_kill(pid, sig): - kill_calls.append((pid, sig)) + def mock_terminate(pid): + kill_calls.append(pid) # Owner alive → reaper skips without ever probing the daemon. with patch("gateway.status._pid_exists", return_value=True), \ - patch("os.kill", side_effect=mock_kill): + patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate): _reap_orphaned_browser_sessions() - assert (12345, signal.SIGTERM) not in kill_calls + assert 12345 not in kill_calls assert d.exists() def test_dead_owner_triggers_reap(self, fake_tmpdir): @@ -225,17 +225,17 @@ class TestOwnerPidCrossProcess: kill_calls = [] - def mock_kill(pid, sig): - kill_calls.append((pid, sig)) + def mock_terminate(pid): + kill_calls.append(pid) # Owner 999999999 dead, daemon 12345 alive. pid_alive = {999999999: False, 12345: True} with patch("gateway.status._pid_exists", side_effect=lambda pid: pid_alive.get(int(pid), False)), \ - patch("os.kill", side_effect=mock_kill): + patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate): _reap_orphaned_browser_sessions() - assert (12345, signal.SIGTERM) in kill_calls + assert 12345 in kill_calls assert not d.exists() def test_corrupt_owner_pid_falls_back_to_legacy(self, fake_tmpdir): @@ -253,15 +253,15 @@ class TestOwnerPidCrossProcess: kill_calls = [] - def mock_kill(pid, sig): - kill_calls.append((pid, sig)) + def mock_terminate(pid): + kill_calls.append(pid) with patch("gateway.status._pid_exists", return_value=True), \ - patch("os.kill", side_effect=mock_kill): + patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate): _reap_orphaned_browser_sessions() # Legacy path took over → tracked → not reaped - assert (12345, signal.SIGTERM) not in kill_calls + assert 12345 not in kill_calls assert d.exists() def test_owner_pid_permission_error_treated_as_alive(self, fake_tmpdir): @@ -280,16 +280,16 @@ class TestOwnerPidCrossProcess: kill_calls = [] - def mock_kill(pid, sig): - kill_calls.append((pid, sig)) + def mock_terminate(pid): + kill_calls.append(pid) # Owner 22222 reported alive (PermissionError collapses to True - # inside _pid_exists). Daemon never probed, never SIGTERMed. + # inside _pid_exists). Daemon never probed, never terminated. with patch("gateway.status._pid_exists", return_value=True), \ - patch("os.kill", side_effect=mock_kill): + patch("tools.process_registry.ProcessRegistry._terminate_host_pid", side_effect=mock_terminate): _reap_orphaned_browser_sessions() - assert (12345, signal.SIGTERM) not in kill_calls + assert 12345 not in kill_calls assert d.exists() def test_write_owner_pid_creates_file_with_current_pid( diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 447f6500714..f252544edc1 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -102,7 +102,6 @@ from plugins.browser.firecrawl.provider import ( # noqa: F401 FirecrawlBrowserProvider as FirecrawlProvider, ) from tools.tool_backend_helpers import normalize_browser_cloud_provider - # Camofox local anti-detection browser backend (optional). # When CAMOFOX_URL is set, all browser operations route through the # camofox REST API instead of the agent-browser CLI. @@ -1386,8 +1385,11 @@ def _reap_orphaned_browser_sessions(): continue # Daemon is alive and its owner is dead (or legacy + untracked). Reap. + # Use the process-tree termination helper so Chromium children + # (renderer, GPU, etc.) are cleaned up, not just the daemon parent. try: - os.kill(daemon_pid, signal.SIGTERM) + from tools.process_registry import ProcessRegistry + ProcessRegistry._terminate_host_pid(daemon_pid) logger.info("Reaped orphaned browser daemon PID %d (session %s)", daemon_pid, session_name) reaped += 1 @@ -3437,8 +3439,9 @@ def _cleanup_single_browser_session(task_id: str) -> None: pid_file = os.path.join(socket_dir, f"{session_name}.pid") if os.path.isfile(pid_file): try: + from tools.process_registry import ProcessRegistry daemon_pid = int(Path(pid_file).read_text(encoding="utf-8").strip()) - os.kill(daemon_pid, signal.SIGTERM) + ProcessRegistry._terminate_host_pid(daemon_pid) logger.debug("Killed daemon pid %s for %s", daemon_pid, session_name) except (ProcessLookupError, ValueError, PermissionError, OSError): logger.debug("Could not kill daemon pid for %s (already dead or inaccessible)", session_name)