mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-26 01:01:40 +00:00
fix: two process leaks (agent-browser daemons, paste.rs sleepers) (#11843)
Both fixes close process leaks observed in production (18+ orphaned
agent-browser node daemons, 15+ orphaned paste.rs sleep interpreters
accumulated over ~3 days, ~2.7 GB RSS).
## agent-browser daemon leak
Previously the orphan reaper (_reap_orphaned_browser_sessions) only ran
from _start_browser_cleanup_thread, which is only invoked on the first
browser tool call in a process. Hermes sessions that never used the
browser never swept orphans, and the cross-process orphan detection
relied on in-process _active_sessions, which doesn't see other hermes
PIDs' sessions (race risk).
- Write <session>.owner_pid alongside the socket dir recording the
hermes PID that owns the daemon (extracted into _write_owner_pid for
direct testability).
- Reaper prefers owner_pid liveness over in-process _active_sessions.
Cross-process safe: concurrent hermes instances won't reap each
other's daemons. Legacy tracked_names fallback kept for daemons
that predate owner_pid.
- atexit handler (_emergency_cleanup_all_sessions) now always runs
the reaper, not just when this process had active sessions —
every clean hermes exit sweeps accumulated orphans.
## paste.rs auto-delete leak
_schedule_auto_delete spawned a detached Python subprocess per call
that slept 6 hours then issued DELETE requests. No dedup, no tracking —
every 'hermes debug share' invocation added ~20 MB of resident Python
interpreters that stuck around until the sleep finished.
- Replaced the spawn with ~/.hermes/pastes/pending.json: records
{url, expire_at} entries.
- _sweep_expired_pastes() synchronously DELETEs past-due entries on
every 'hermes debug' invocation (run_debug() dispatcher).
- Network failures stay in pending.json for up to 24h, then give up
(paste.rs's own retention handles the 'user never runs hermes again'
edge case).
- Zero subprocesses; regression test asserts subprocess/Popen/time.sleep
never appear in the function source (skipping docstrings via AST).
## Validation
| | Before | After |
|------------------------------|---------------|--------------|
| Orphan agent-browser daemons | 18 accumulated| 2 (live) |
| paste.rs sleep interpreters | 15 accumulated| 0 |
| RSS reclaimed | - | ~2.7 GB |
| Targeted tests | - | 2253 pass |
E2E verified: alive-owner daemons NOT reaped; dead-owner daemons
SIGTERM'd and socket dirs cleaned; pending.json sweep deletes expired
entries without spawning subprocesses.
This commit is contained in:
parent
64b354719f
commit
304fb921bf
4 changed files with 736 additions and 80 deletions
|
|
@ -28,12 +28,22 @@ def _isolate_sessions():
|
|||
bt._active_sessions.update(orig)
|
||||
|
||||
|
||||
def _make_socket_dir(tmpdir, session_name, pid=None):
|
||||
"""Create a fake agent-browser socket directory with optional PID file."""
|
||||
def _make_socket_dir(tmpdir, session_name, pid=None, owner_pid=None):
|
||||
"""Create a fake agent-browser socket directory with optional PID files.
|
||||
|
||||
Args:
|
||||
tmpdir: base temp directory
|
||||
session_name: name like "h_abc1234567" or "cdp_abc1234567"
|
||||
pid: daemon PID to write to <session>.pid (None = no file)
|
||||
owner_pid: owning hermes PID to write to <session>.owner_pid
|
||||
(None = no file; tests the legacy path)
|
||||
"""
|
||||
d = tmpdir / f"agent-browser-{session_name}"
|
||||
d.mkdir()
|
||||
if pid is not None:
|
||||
(d / f"{session_name}.pid").write_text(str(pid))
|
||||
if owner_pid is not None:
|
||||
(d / f"{session_name}.owner_pid").write_text(str(owner_pid))
|
||||
return d
|
||||
|
||||
|
||||
|
|
@ -62,7 +72,10 @@ class TestReapOrphanedBrowserSessions:
|
|||
assert not d.exists()
|
||||
|
||||
def test_orphaned_alive_daemon_is_killed(self, fake_tmpdir):
|
||||
"""Alive daemon not tracked by _active_sessions gets SIGTERM."""
|
||||
"""Alive daemon not tracked by _active_sessions gets SIGTERM (legacy path).
|
||||
|
||||
No owner_pid file => falls back to tracked_names check.
|
||||
"""
|
||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||
|
||||
d = _make_socket_dir(fake_tmpdir, "h_orphan12345", pid=12345)
|
||||
|
|
@ -84,7 +97,7 @@ class TestReapOrphanedBrowserSessions:
|
|||
assert (12345, signal.SIGTERM) in kill_calls
|
||||
|
||||
def test_tracked_session_is_not_reaped(self, fake_tmpdir):
|
||||
"""Sessions tracked in _active_sessions are left alone."""
|
||||
"""Sessions tracked in _active_sessions are left alone (legacy path)."""
|
||||
import tools.browser_tool as bt
|
||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||
|
||||
|
|
@ -156,3 +169,240 @@ class TestReapOrphanedBrowserSessions:
|
|||
|
||||
_reap_orphaned_browser_sessions()
|
||||
assert not d.exists()
|
||||
|
||||
|
||||
class TestOwnerPidCrossProcess:
|
||||
"""Tests for owner_pid-based cross-process safe reaping.
|
||||
|
||||
The owner_pid file records which hermes process owns a daemon so that
|
||||
concurrent hermes processes don't reap each other's active browser
|
||||
sessions. Added to fix orphan accumulation from crashed processes.
|
||||
"""
|
||||
|
||||
def test_alive_owner_is_not_reaped_even_when_untracked(self, fake_tmpdir):
|
||||
"""Daemon with alive owner_pid is NOT reaped, even if not in our _active_sessions.
|
||||
|
||||
This is the core cross-process safety check: Process B scanning while
|
||||
Process A is using a browser must not kill A's daemon.
|
||||
"""
|
||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||
|
||||
# Use our own PID as the "owner" — guaranteed alive
|
||||
d = _make_socket_dir(
|
||||
fake_tmpdir, "h_alive_owner", pid=12345, owner_pid=os.getpid()
|
||||
)
|
||||
|
||||
kill_calls = []
|
||||
|
||||
def mock_kill(pid, sig):
|
||||
kill_calls.append((pid, sig))
|
||||
if pid == os.getpid() and sig == 0:
|
||||
return # real existence check: owner alive
|
||||
if sig == 0:
|
||||
return # pretend daemon exists too
|
||||
# Don't actually kill anything
|
||||
|
||||
with patch("os.kill", side_effect=mock_kill):
|
||||
_reap_orphaned_browser_sessions()
|
||||
|
||||
# We should have checked the owner (sig 0) but never tried to kill
|
||||
# the daemon.
|
||||
assert (12345, signal.SIGTERM) not in kill_calls
|
||||
# Dir should still exist
|
||||
assert d.exists()
|
||||
|
||||
def test_dead_owner_triggers_reap(self, fake_tmpdir):
|
||||
"""Daemon whose owner_pid is dead gets reaped."""
|
||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||
|
||||
# PID 999999999 almost certainly doesn't exist
|
||||
d = _make_socket_dir(
|
||||
fake_tmpdir, "h_dead_owner1", pid=12345, owner_pid=999999999
|
||||
)
|
||||
|
||||
kill_calls = []
|
||||
|
||||
def mock_kill(pid, sig):
|
||||
kill_calls.append((pid, sig))
|
||||
if pid == 999999999 and sig == 0:
|
||||
raise ProcessLookupError # owner dead
|
||||
if pid == 12345 and sig == 0:
|
||||
return # daemon still alive
|
||||
# SIGTERM to daemon — noop in test
|
||||
|
||||
with patch("os.kill", side_effect=mock_kill):
|
||||
_reap_orphaned_browser_sessions()
|
||||
|
||||
# Owner checked (returned dead), daemon checked (alive), daemon killed
|
||||
assert (999999999, 0) in kill_calls
|
||||
assert (12345, 0) in kill_calls
|
||||
assert (12345, signal.SIGTERM) in kill_calls
|
||||
# Dir cleaned up
|
||||
assert not d.exists()
|
||||
|
||||
def test_corrupt_owner_pid_falls_back_to_legacy(self, fake_tmpdir):
|
||||
"""Corrupt owner_pid file → fall back to tracked_names check."""
|
||||
import tools.browser_tool as bt
|
||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||
|
||||
session_name = "h_corrupt_own"
|
||||
d = _make_socket_dir(fake_tmpdir, session_name, pid=12345)
|
||||
# Write garbage to owner_pid file
|
||||
(d / f"{session_name}.owner_pid").write_text("not-a-pid")
|
||||
|
||||
# Register session so legacy fallback leaves it alone
|
||||
bt._active_sessions["task"] = {"session_name": session_name}
|
||||
|
||||
kill_calls = []
|
||||
|
||||
def mock_kill(pid, sig):
|
||||
kill_calls.append((pid, sig))
|
||||
|
||||
with patch("os.kill", side_effect=mock_kill):
|
||||
_reap_orphaned_browser_sessions()
|
||||
|
||||
# Legacy path took over → tracked → not reaped
|
||||
assert (12345, signal.SIGTERM) not in kill_calls
|
||||
assert d.exists()
|
||||
|
||||
def test_owner_pid_permission_error_treated_as_alive(self, fake_tmpdir):
|
||||
"""If os.kill(owner, 0) raises PermissionError, treat owner as alive.
|
||||
|
||||
PermissionError means the PID exists but is owned by a different user —
|
||||
we must not assume the owner is dead (could kill someone else's daemon).
|
||||
"""
|
||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||
|
||||
d = _make_socket_dir(
|
||||
fake_tmpdir, "h_perm_owner1", pid=12345, owner_pid=22222
|
||||
)
|
||||
|
||||
kill_calls = []
|
||||
|
||||
def mock_kill(pid, sig):
|
||||
kill_calls.append((pid, sig))
|
||||
if pid == 22222 and sig == 0:
|
||||
raise PermissionError("not our user")
|
||||
|
||||
with patch("os.kill", side_effect=mock_kill):
|
||||
_reap_orphaned_browser_sessions()
|
||||
|
||||
# Must NOT have tried to kill the daemon
|
||||
assert (12345, signal.SIGTERM) not in kill_calls
|
||||
assert d.exists()
|
||||
|
||||
def test_write_owner_pid_creates_file_with_current_pid(
|
||||
self, fake_tmpdir, monkeypatch
|
||||
):
|
||||
"""_write_owner_pid(dir, session) writes <session>.owner_pid with os.getpid()."""
|
||||
import tools.browser_tool as bt
|
||||
|
||||
session_name = "h_ownertest01"
|
||||
socket_dir = fake_tmpdir / f"agent-browser-{session_name}"
|
||||
socket_dir.mkdir()
|
||||
|
||||
bt._write_owner_pid(str(socket_dir), session_name)
|
||||
|
||||
owner_pid_file = socket_dir / f"{session_name}.owner_pid"
|
||||
assert owner_pid_file.exists()
|
||||
assert owner_pid_file.read_text().strip() == str(os.getpid())
|
||||
|
||||
def test_write_owner_pid_is_idempotent(self, fake_tmpdir):
|
||||
"""Calling _write_owner_pid twice leaves a single owner_pid file."""
|
||||
import tools.browser_tool as bt
|
||||
|
||||
session_name = "h_idempot1234"
|
||||
socket_dir = fake_tmpdir / f"agent-browser-{session_name}"
|
||||
socket_dir.mkdir()
|
||||
|
||||
bt._write_owner_pid(str(socket_dir), session_name)
|
||||
bt._write_owner_pid(str(socket_dir), session_name)
|
||||
|
||||
files = list(socket_dir.glob("*.owner_pid"))
|
||||
assert len(files) == 1
|
||||
assert files[0].read_text().strip() == str(os.getpid())
|
||||
|
||||
def test_write_owner_pid_swallows_oserror(self, fake_tmpdir, monkeypatch):
|
||||
"""OSError (e.g. permission denied) doesn't propagate — the reaper
|
||||
falls back to the legacy tracked_names heuristic in that case.
|
||||
"""
|
||||
import tools.browser_tool as bt
|
||||
|
||||
def raise_oserror(*a, **kw):
|
||||
raise OSError("permission denied")
|
||||
|
||||
monkeypatch.setattr("builtins.open", raise_oserror)
|
||||
|
||||
# Must not raise
|
||||
bt._write_owner_pid(str(fake_tmpdir), "h_readonly123")
|
||||
|
||||
def test_run_browser_command_calls_write_owner_pid(
|
||||
self, fake_tmpdir, monkeypatch
|
||||
):
|
||||
"""_run_browser_command wires _write_owner_pid after mkdir."""
|
||||
import tools.browser_tool as bt
|
||||
|
||||
session_name = "h_wiringtest1"
|
||||
|
||||
# Short-circuit Popen so we exit after the owner_pid write
|
||||
class _FakePopen:
|
||||
def __init__(self, *a, **kw):
|
||||
raise RuntimeError("short-circuit after owner_pid")
|
||||
|
||||
monkeypatch.setattr(bt.subprocess, "Popen", _FakePopen)
|
||||
monkeypatch.setattr(bt, "_find_agent_browser", lambda: "/bin/true")
|
||||
monkeypatch.setattr(
|
||||
bt, "_requires_real_termux_browser_install", lambda *a: False
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
bt, "_get_session_info",
|
||||
lambda task_id: {"session_name": session_name},
|
||||
)
|
||||
|
||||
calls = []
|
||||
orig_write = bt._write_owner_pid
|
||||
|
||||
def _spy(*a, **kw):
|
||||
calls.append(a)
|
||||
orig_write(*a, **kw)
|
||||
|
||||
monkeypatch.setattr(bt, "_write_owner_pid", _spy)
|
||||
|
||||
with patch("tools.browser_tool._socket_safe_tmpdir", return_value=str(fake_tmpdir)):
|
||||
try:
|
||||
bt._run_browser_command(task_id="test_task", command="goto", args=[])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
assert calls, "_run_browser_command must call _write_owner_pid"
|
||||
# First positional arg is the socket_dir, second is the session_name
|
||||
socket_dir_arg, session_name_arg = calls[0][0], calls[0][1]
|
||||
assert session_name_arg == session_name
|
||||
assert session_name in socket_dir_arg
|
||||
|
||||
|
||||
class TestEmergencyCleanupRunsReaper:
|
||||
"""Verify atexit-registered cleanup sweeps orphans even without an active session."""
|
||||
|
||||
def test_emergency_cleanup_calls_reaper(self, fake_tmpdir, monkeypatch):
|
||||
"""_emergency_cleanup_all_sessions must call _reap_orphaned_browser_sessions."""
|
||||
import tools.browser_tool as bt
|
||||
|
||||
# Reset the _cleanup_done flag so the cleanup actually runs
|
||||
monkeypatch.setattr(bt, "_cleanup_done", False)
|
||||
|
||||
reaper_called = []
|
||||
orig_reaper = bt._reap_orphaned_browser_sessions
|
||||
|
||||
def _spy_reaper():
|
||||
reaper_called.append(True)
|
||||
orig_reaper()
|
||||
|
||||
monkeypatch.setattr(bt, "_reap_orphaned_browser_sessions", _spy_reaper)
|
||||
|
||||
# No active sessions — reaper should still run
|
||||
bt._emergency_cleanup_all_sessions()
|
||||
|
||||
assert reaper_called, (
|
||||
"Reaper must run on exit even with no active sessions"
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue