From 75380de4301a74f96e74ee8f68a572b97b42d908 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 11 Apr 2026 14:02:46 -0700 Subject: [PATCH] fix: reap orphaned browser sessions on startup (#7931) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a Python process exits uncleanly (SIGKILL, crash, gateway restart via hermes update), in-memory _active_sessions tracking is lost but the agent-browser node daemons and their Chromium child processes keep running indefinitely. On a long-running system this causes unbounded memory growth — 24 orphaned sessions consumed 7.6 GB on a production machine over 9 days. Add _reap_orphaned_browser_sessions() which scans the tmp directory for agent-browser-{h_*,cdp_*} socket dirs on cleanup thread startup. For each dir not tracked by the current process, reads the daemon PID file and sends SIGTERM if the daemon is still alive. Handles edge cases: dead PIDs, corrupt PID files, permission errors, foreign processes. The reaper runs once on thread startup (not every 30s) to avoid races with sessions being actively created by concurrent agents. --- tests/tools/test_browser_orphan_reaper.py | 158 ++++++++++++++++++++++ tools/browser_tool.py | 91 +++++++++++++ 2 files changed, 249 insertions(+) create mode 100644 tests/tools/test_browser_orphan_reaper.py diff --git a/tests/tools/test_browser_orphan_reaper.py b/tests/tools/test_browser_orphan_reaper.py new file mode 100644 index 0000000000..254dad7db7 --- /dev/null +++ b/tests/tools/test_browser_orphan_reaper.py @@ -0,0 +1,158 @@ +"""Tests for _reap_orphaned_browser_sessions() — kills orphaned agent-browser +daemons whose Python parent exited without cleaning up.""" + +import os +import signal +import textwrap +from pathlib import Path +from unittest.mock import patch, MagicMock + +import pytest + + +@pytest.fixture +def fake_tmpdir(tmp_path): + """Patch _socket_safe_tmpdir to return a temp dir we control.""" + with patch("tools.browser_tool._socket_safe_tmpdir", return_value=str(tmp_path)): + yield tmp_path + + +@pytest.fixture(autouse=True) +def _isolate_sessions(): + """Ensure _active_sessions is empty for each test.""" + import tools.browser_tool as bt + orig = bt._active_sessions.copy() + bt._active_sessions.clear() + yield + bt._active_sessions.clear() + bt._active_sessions.update(orig) + + +def _make_socket_dir(tmpdir, session_name, pid=None): + """Create a fake agent-browser socket directory with optional PID file.""" + d = tmpdir / f"agent-browser-{session_name}" + d.mkdir() + if pid is not None: + (d / f"{session_name}.pid").write_text(str(pid)) + return d + + +class TestReapOrphanedBrowserSessions: + """Tests for the orphan reaper function.""" + + def test_no_socket_dirs_is_noop(self, fake_tmpdir): + """No socket dirs => nothing happens, no errors.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + _reap_orphaned_browser_sessions() # should not raise + + def test_stale_dir_without_pid_file_is_removed(self, fake_tmpdir): + """Socket dir with no PID file is cleaned up.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + d = _make_socket_dir(fake_tmpdir, "h_abc1234567") + assert d.exists() + _reap_orphaned_browser_sessions() + assert not d.exists() + + def test_stale_dir_with_dead_pid_is_removed(self, fake_tmpdir): + """Socket dir whose daemon PID is dead gets cleaned up.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + d = _make_socket_dir(fake_tmpdir, "h_dead123456", pid=999999999) + assert d.exists() + _reap_orphaned_browser_sessions() + assert not d.exists() + + def test_orphaned_alive_daemon_is_killed(self, fake_tmpdir): + """Alive daemon not tracked by _active_sessions gets SIGTERM.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + + d = _make_socket_dir(fake_tmpdir, "h_orphan12345", pid=12345) + + kill_calls = [] + original_kill = os.kill + + def mock_kill(pid, sig): + kill_calls.append((pid, sig)) + if sig == 0: + return # pretend process exists + # Don't actually kill anything + + with patch("os.kill", side_effect=mock_kill): + _reap_orphaned_browser_sessions() + + # Should have checked existence (sig 0) then killed (SIGTERM) + assert (12345, 0) in kill_calls + assert (12345, signal.SIGTERM) in kill_calls + + def test_tracked_session_is_not_reaped(self, fake_tmpdir): + """Sessions tracked in _active_sessions are left alone.""" + import tools.browser_tool as bt + from tools.browser_tool import _reap_orphaned_browser_sessions + + session_name = "h_tracked1234" + d = _make_socket_dir(fake_tmpdir, session_name, pid=12345) + + # Register the session as actively tracked + bt._active_sessions["some_task"] = {"session_name": session_name} + + kill_calls = [] + + def mock_kill(pid, sig): + kill_calls.append((pid, sig)) + + with patch("os.kill", side_effect=mock_kill): + _reap_orphaned_browser_sessions() + + # Should NOT have tried to kill anything + assert len(kill_calls) == 0 + # Dir should still exist + assert d.exists() + + def test_permission_error_on_kill_check_skips(self, fake_tmpdir): + """If we can't check the PID (PermissionError), skip it.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + + d = _make_socket_dir(fake_tmpdir, "h_perm1234567", pid=12345) + + def mock_kill(pid, sig): + if sig == 0: + raise PermissionError("not our process") + + with patch("os.kill", side_effect=mock_kill): + _reap_orphaned_browser_sessions() + + # Dir should still exist (we didn't touch someone else's process) + assert d.exists() + + def test_cdp_sessions_are_also_reaped(self, fake_tmpdir): + """CDP sessions (cdp_ prefix) are also scanned.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + + d = _make_socket_dir(fake_tmpdir, "cdp_abc1234567") + assert d.exists() + _reap_orphaned_browser_sessions() + # No PID file → cleaned up + assert not d.exists() + + def test_non_hermes_dirs_are_ignored(self, fake_tmpdir): + """Socket dirs that don't match our naming pattern are left alone.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + + # Create a dir that doesn't match h_* or cdp_* pattern + d = fake_tmpdir / "agent-browser-other_session" + d.mkdir() + (d / "other_session.pid").write_text("12345") + + _reap_orphaned_browser_sessions() + + # Should NOT be touched + assert d.exists() + + def test_corrupt_pid_file_is_cleaned(self, fake_tmpdir): + """PID file with non-integer content is cleaned up.""" + from tools.browser_tool import _reap_orphaned_browser_sessions + + d = _make_socket_dir(fake_tmpdir, "h_corrupt1234") + (d / "h_corrupt1234.pid").write_text("not-a-number") + + _reap_orphaned_browser_sessions() + assert not d.exists() diff --git a/tools/browser_tool.py b/tools/browser_tool.py index ed3cfbb9bb..bb24866066 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -473,13 +473,104 @@ def _cleanup_inactive_browser_sessions(): logger.warning("Error cleaning up inactive session %s: %s", task_id, e) +def _reap_orphaned_browser_sessions(): + """Scan for orphaned agent-browser daemon processes from previous runs. + + When the Python process that created a browser session exits uncleanly + (SIGKILL, crash, gateway restart), the in-memory ``_active_sessions`` + tracking is lost but the node + Chromium processes keep running. + + This function scans the tmp directory for ``agent-browser-*`` socket dirs + left behind by previous runs, reads the daemon PID files, and kills any + daemons that are still alive but not tracked by the current process. + + Called once on cleanup-thread startup — not every 30 seconds — to avoid + races with sessions being actively created. + """ + import glob + + tmpdir = _socket_safe_tmpdir() + pattern = os.path.join(tmpdir, "agent-browser-h_*") + socket_dirs = glob.glob(pattern) + # Also pick up CDP sessions + socket_dirs += glob.glob(os.path.join(tmpdir, "agent-browser-cdp_*")) + + if not socket_dirs: + return + + # Build set of session_names currently tracked by this process + with _cleanup_lock: + tracked_names = { + info.get("session_name") + for info in _active_sessions.values() + if info.get("session_name") + } + + reaped = 0 + for socket_dir in socket_dirs: + dir_name = os.path.basename(socket_dir) + # dir_name is "agent-browser-{session_name}" + session_name = dir_name.removeprefix("agent-browser-") + if not session_name: + continue + + # Skip sessions that we are actively tracking + if session_name in tracked_names: + continue + + pid_file = os.path.join(socket_dir, f"{session_name}.pid") + if not os.path.isfile(pid_file): + # No PID file — just a stale dir, remove it + shutil.rmtree(socket_dir, ignore_errors=True) + continue + + try: + daemon_pid = int(Path(pid_file).read_text().strip()) + except (ValueError, OSError): + shutil.rmtree(socket_dir, ignore_errors=True) + continue + + # Check if the daemon is still alive + try: + os.kill(daemon_pid, 0) # signal 0 = existence check + except ProcessLookupError: + # Already dead, just clean up the dir + shutil.rmtree(socket_dir, ignore_errors=True) + continue + except PermissionError: + # Alive but owned by someone else — leave it alone + continue + + # Daemon is alive and not tracked — orphan. Kill it. + try: + os.kill(daemon_pid, signal.SIGTERM) + logger.info("Reaped orphaned browser daemon PID %d (session %s)", + daemon_pid, session_name) + reaped += 1 + except (ProcessLookupError, PermissionError, OSError): + pass + + # Clean up the socket directory + shutil.rmtree(socket_dir, ignore_errors=True) + + if reaped: + logger.info("Reaped %d orphaned browser session(s) from previous run(s)", reaped) + + def _browser_cleanup_thread_worker(): """ Background thread that periodically cleans up inactive browser sessions. Runs every 30 seconds and checks for sessions that haven't been used within the BROWSER_SESSION_INACTIVITY_TIMEOUT period. + On first run, also reaps orphaned sessions from previous process lifetimes. """ + # One-time orphan reap on startup + try: + _reap_orphaned_browser_sessions() + except Exception as e: + logger.warning("Orphan reap error: %s", e) + while _cleanup_running: try: _cleanup_inactive_browser_sessions()