mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: two process leaks (agent-browser daemons, paste.rs sleepers) (#11843)
Both fixes close process leaks observed in production (18+ orphaned
agent-browser node daemons, 15+ orphaned paste.rs sleep interpreters
accumulated over ~3 days, ~2.7 GB RSS).
## agent-browser daemon leak
Previously the orphan reaper (_reap_orphaned_browser_sessions) only ran
from _start_browser_cleanup_thread, which is only invoked on the first
browser tool call in a process. Hermes sessions that never used the
browser therefore never swept orphans. Cross-process orphan detection
also relied on the in-process _active_sessions, which cannot see
sessions owned by other hermes PIDs, so one instance risked reaping a
daemon that another live instance still owned.
- Write <session>.owner_pid alongside the socket dir recording the
hermes PID that owns the daemon (extracted into _write_owner_pid for
direct testability).
- Reaper prefers owner_pid liveness over in-process _active_sessions.
Cross-process safe: concurrent hermes instances won't reap each
other's daemons. Legacy tracked_names fallback kept for daemons
that predate owner_pid.
- atexit handler (_emergency_cleanup_all_sessions) now always runs
the reaper, not just when this process had active sessions —
every clean hermes exit sweeps accumulated orphans.
## paste.rs auto-delete leak
_schedule_auto_delete spawned a detached Python subprocess per call
that slept 6 hours then issued DELETE requests. No dedup, no tracking —
every 'hermes debug share' invocation added ~20 MB of resident Python
interpreters that stuck around until the sleep finished.
- Replaced the spawn with ~/.hermes/pastes/pending.json: records
{url, expire_at} entries.
- _sweep_expired_pastes() synchronously DELETEs past-due entries on
every 'hermes debug' invocation (run_debug() dispatcher).
- Entries whose DELETE fails (e.g. on a network error) stay in
pending.json for up to 24h past their expiry and are then dropped
(paste.rs's own retention handles the 'user never runs hermes again'
edge case).
- Zero subprocesses; regression test asserts subprocess/Popen/time.sleep
never appear in the function source (skipping docstrings via AST).
## Validation
| Metric                       | Before         | After     |
|------------------------------|----------------|-----------|
| Orphan agent-browser daemons | 18 accumulated | 2 (live)  |
| paste.rs sleep interpreters  | 15 accumulated | 0         |
| RSS reclaimed                | -              | ~2.7 GB   |
| Targeted tests               | -              | 2253 pass |
E2E verified: alive-owner daemons NOT reaped; dead-owner daemons
SIGTERM'd and socket dirs cleaned; pending.json sweep deletes expired
entries without spawning subprocesses.
This commit is contained in:
parent
64b354719f
commit
304fb921bf
4 changed files with 736 additions and 80 deletions
|
|
@ -501,40 +501,272 @@ class TestDeletePaste:
|
|||
|
||||
|
||||
class TestScheduleAutoDelete:
|
||||
def test_spawns_detached_process(self):
|
||||
"""``_schedule_auto_delete`` used to spawn a detached Python subprocess
|
||||
per call (one per paste URL batch). Those subprocesses slept 6 hours
|
||||
and accumulated forever under repeated use — 15+ orphaned interpreters
|
||||
were observed in production.
|
||||
|
||||
The new implementation is stateless: it records pending deletions to
|
||||
``~/.hermes/pastes/pending.json`` and lets ``_sweep_expired_pastes``
|
||||
handle the DELETE requests synchronously on the next ``hermes debug``
|
||||
invocation.
|
||||
"""
|
||||
|
||||
def test_does_not_spawn_subprocess(self, hermes_home):
|
||||
"""Regression guard: _schedule_auto_delete must NEVER spawn subprocesses.
|
||||
|
||||
We assert this structurally rather than by mocking Popen: the new
|
||||
implementation doesn't even import ``subprocess`` at module scope,
|
||||
so a mock patch wouldn't find it.
|
||||
"""
|
||||
import ast
|
||||
import inspect
|
||||
from hermes_cli.debug import _schedule_auto_delete
|
||||
|
||||
with patch("subprocess.Popen") as mock_popen:
|
||||
_schedule_auto_delete(
|
||||
["https://paste.rs/abc", "https://paste.rs/def"],
|
||||
delay_seconds=10,
|
||||
)
|
||||
# Strip the docstring before scanning so the regression-rationale
|
||||
# prose inside it doesn't trigger our banned-word checks.
|
||||
source = inspect.getsource(_schedule_auto_delete)
|
||||
tree = ast.parse(source)
|
||||
func_node = tree.body[0]
|
||||
if (
|
||||
func_node.body
|
||||
and isinstance(func_node.body[0], ast.Expr)
|
||||
and isinstance(func_node.body[0].value, ast.Constant)
|
||||
and isinstance(func_node.body[0].value.value, str)
|
||||
):
|
||||
func_node.body = func_node.body[1:]
|
||||
code_only = ast.unparse(func_node)
|
||||
|
||||
mock_popen.assert_called_once()
|
||||
call_args = mock_popen.call_args
|
||||
# Verify detached
|
||||
assert call_args[1]["start_new_session"] is True
|
||||
# Verify the script references both URLs
|
||||
script = call_args[0][0][2] # [python, -c, script]
|
||||
assert "paste.rs/abc" in script
|
||||
assert "paste.rs/def" in script
|
||||
assert "time.sleep(10)" in script
|
||||
assert "Popen" not in code_only, (
|
||||
"_schedule_auto_delete must not spawn subprocesses — "
|
||||
"use pending.json + _sweep_expired_pastes instead"
|
||||
)
|
||||
assert "subprocess" not in code_only, (
|
||||
"_schedule_auto_delete must not reference subprocess at all"
|
||||
)
|
||||
assert "time.sleep" not in code_only, (
|
||||
"Regression: sleeping in _schedule_auto_delete is the bug being fixed"
|
||||
)
|
||||
|
||||
def test_skips_non_paste_rs_urls(self):
|
||||
from hermes_cli.debug import _schedule_auto_delete
|
||||
# And verify that calling it doesn't produce any orphaned children
|
||||
# (it should just write pending.json synchronously).
|
||||
import os as _os
|
||||
before = set(_os.listdir("/proc")) if _os.path.exists("/proc") else None
|
||||
_schedule_auto_delete(
|
||||
["https://paste.rs/abc", "https://paste.rs/def"],
|
||||
delay_seconds=10,
|
||||
)
|
||||
if before is not None:
|
||||
after = set(_os.listdir("/proc"))
|
||||
new = after - before
|
||||
# Filter to only integer-named entries (process PIDs)
|
||||
new_pids = [p for p in new if p.isdigit()]
|
||||
# It's fine if unrelated processes appeared — we just need to make
|
||||
# sure we didn't spawn a long-sleeping one. The old bug spawned
|
||||
# a python interpreter whose cmdline contained "time.sleep".
|
||||
for pid in new_pids:
|
||||
try:
|
||||
with open(f"/proc/{pid}/cmdline", "rb") as f:
|
||||
cmdline = f.read().decode("utf-8", errors="replace")
|
||||
assert "time.sleep" not in cmdline, (
|
||||
f"Leaked sleeper subprocess PID {pid}: {cmdline}"
|
||||
)
|
||||
except OSError:
|
||||
pass # process exited already
|
||||
|
||||
with patch("subprocess.Popen") as mock_popen:
|
||||
_schedule_auto_delete(["https://dpaste.com/something"])
|
||||
def test_records_pending_to_json(self, hermes_home):
|
||||
"""Scheduled URLs are persisted to pending.json with expiration."""
|
||||
from hermes_cli.debug import _schedule_auto_delete, _pending_file
|
||||
import json
|
||||
|
||||
mock_popen.assert_not_called()
|
||||
_schedule_auto_delete(
|
||||
["https://paste.rs/abc", "https://paste.rs/def"],
|
||||
delay_seconds=10,
|
||||
)
|
||||
|
||||
def test_handles_popen_failure_gracefully(self):
|
||||
from hermes_cli.debug import _schedule_auto_delete
|
||||
pending_path = _pending_file()
|
||||
assert pending_path.exists()
|
||||
|
||||
with patch("subprocess.Popen",
|
||||
side_effect=OSError("no such file")):
|
||||
# Should not raise
|
||||
_schedule_auto_delete(["https://paste.rs/abc"])
|
||||
entries = json.loads(pending_path.read_text())
|
||||
assert len(entries) == 2
|
||||
urls = {e["url"] for e in entries}
|
||||
assert urls == {"https://paste.rs/abc", "https://paste.rs/def"}
|
||||
|
||||
# expire_at is ~now + delay_seconds
|
||||
import time
|
||||
for e in entries:
|
||||
assert e["expire_at"] > time.time()
|
||||
assert e["expire_at"] <= time.time() + 15
|
||||
|
||||
def test_skips_non_paste_rs_urls(self, hermes_home):
    """Only paste.rs URLs are tracked; dpaste.com pastes auto-expire."""
    from hermes_cli.debug import _pending_file, _schedule_auto_delete

    foreign_urls = ["https://dpaste.com/something"]
    _schedule_auto_delete(foreign_urls)

    # Nothing was eligible for tracking, so pending.json must not exist.
    pending_path = _pending_file()
    assert not pending_path.exists()
|
||||
|
||||
def test_merges_with_existing_pending(self, hermes_home):
    """A second scheduling call adds to pending.json rather than replacing it."""
    from hermes_cli.debug import _load_pending, _schedule_auto_delete

    # Two separate calls, one URL each, in the same order as before.
    for paste_url in ("https://paste.rs/first", "https://paste.rs/second"):
        _schedule_auto_delete([paste_url], delay_seconds=10)

    recorded_urls = {entry["url"] for entry in _load_pending()}
    assert recorded_urls == {"https://paste.rs/first", "https://paste.rs/second"}
|
||||
|
||||
def test_dedupes_same_url(self, hermes_home):
    """Same URL recorded twice → one entry with the later expire_at.

    The URL is scheduled first with a 10 s delay and then again with a
    100 s delay.  Exactly one entry must survive, and it must carry the
    deadline of the second call, not the first.
    """
    import time

    from hermes_cli.debug import _schedule_auto_delete, _load_pending

    _schedule_auto_delete(["https://paste.rs/dup"], delay_seconds=10)
    # Lower bound for "now" as seen by the second call below.
    before_second_call = time.time()
    _schedule_auto_delete(["https://paste.rs/dup"], delay_seconds=100)

    entries = _load_pending()
    assert len(entries) == 1
    assert entries[0]["url"] == "https://paste.rs/dup"
    # Previously only the entry count was checked, so an implementation that
    # kept the first (10 s) deadline would have passed.  The first deadline
    # is at most ``before_second_call + 10``, which cannot satisfy this bound.
    assert entries[0]["expire_at"] >= before_second_call + 100
|
||||
|
||||
|
||||
class TestSweepExpiredPastes:
    """Exercise ``_sweep_expired_pastes``, the sweep that replaces the sleeping subprocess."""

    def test_sweep_empty_is_noop(self, hermes_home):
        """Sweeping without seeding any entries reports zero deleted, zero remaining."""
        from hermes_cli.debug import _sweep_expired_pastes

        n_deleted, n_remaining = _sweep_expired_pastes()

        assert n_deleted == 0
        assert n_remaining == 0

    def test_sweep_deletes_expired_entries(self, hermes_home):
        """Only the past-due entry is DELETEd; the future one stays pending."""
        import time

        from hermes_cli.debug import (
            _load_pending,
            _save_pending,
            _sweep_expired_pastes,
        )

        # One entry already past its deadline, one still an hour away.
        now = time.time()
        seeded = [
            {"url": "https://paste.rs/expired", "expire_at": now - 100},
            {"url": "https://paste.rs/future", "expire_at": now + 3600},
        ]
        _save_pending(seeded)

        requested = []

        def record_delete(url):
            # Stand-in for delete_paste: remember the URL and report success.
            requested.append(url)
            return True

        with patch("hermes_cli.debug.delete_paste", side_effect=record_delete):
            n_deleted, n_remaining = _sweep_expired_pastes()

        assert requested == ["https://paste.rs/expired"]
        assert n_deleted == 1
        assert n_remaining == 1

        surviving_urls = {entry["url"] for entry in _load_pending()}
        assert surviving_urls == {"https://paste.rs/future"}

    def test_sweep_leaves_future_entries_alone(self, hermes_home):
        """Entries whose deadline has not passed are neither DELETEd nor dropped."""
        import time

        from hermes_cli.debug import _save_pending, _sweep_expired_pastes

        now = time.time()
        seeded = [
            {"url": "https://paste.rs/future1", "expire_at": now + 3600},
            {"url": "https://paste.rs/future2", "expire_at": now + 7200},
        ]
        _save_pending(seeded)

        with patch("hermes_cli.debug.delete_paste") as mock_delete:
            n_deleted, n_remaining = _sweep_expired_pastes()

        mock_delete.assert_not_called()
        assert n_deleted == 0
        assert n_remaining == 2

    def test_sweep_survives_network_failure(self, hermes_home):
        """Failed DELETEs stay in pending.json until the 24h grace window."""
        import time

        from hermes_cli.debug import (
            _load_pending,
            _save_pending,
            _sweep_expired_pastes,
        )

        flaky_entry = {"url": "https://paste.rs/flaky", "expire_at": time.time() - 100}
        _save_pending([flaky_entry])

        failing_delete = patch(
            "hermes_cli.debug.delete_paste",
            side_effect=Exception("network down"),
        )
        with failing_delete:
            n_deleted, n_remaining = _sweep_expired_pastes()

        # Expired only 100 s ago: still inside the grace window, kept for retry.
        assert n_deleted == 0
        assert n_remaining == 1
        assert len(_load_pending()) == 1

    def test_sweep_drops_entries_past_grace_window(self, hermes_home):
        """After 24h past expiration, give up even on network failures."""
        import time

        from hermes_cli.debug import (
            _load_pending,
            _save_pending,
            _sweep_expired_pastes,
        )

        # Deadline passed 25 hours ago, i.e. beyond the 24h grace window.
        long_expired = time.time() - (25 * 3600)
        ancient_entry = {"url": "https://paste.rs/ancient", "expire_at": long_expired}
        _save_pending([ancient_entry])

        failing_delete = patch(
            "hermes_cli.debug.delete_paste",
            side_effect=Exception("network down"),
        )
        with failing_delete:
            n_deleted, n_remaining = _sweep_expired_pastes()

        assert n_deleted == 1
        assert n_remaining == 0
        assert _load_pending() == []
|
||||
|
||||
|
||||
class TestRunDebugSweepsOnInvocation:
    """Every ``run_debug`` call must trigger the expired-paste sweep."""

    def test_run_debug_calls_sweep(self, hermes_home):
        """The sweep runs exactly once for a single ``run_debug`` call."""
        from hermes_cli.debug import run_debug

        # debug_command=None selects the default branch, which prints help.
        cli_args = MagicMock(debug_command=None)

        with patch("hermes_cli.debug._sweep_expired_pastes") as mock_sweep:
            run_debug(cli_args)

        mock_sweep.assert_called_once()

    def test_run_debug_survives_sweep_failure(self, hermes_home, capsys):
        """An exception from the sweep must not stop the subcommand from running."""
        from hermes_cli.debug import run_debug

        cli_args = MagicMock(debug_command=None)

        exploding_sweep = patch(
            "hermes_cli.debug._sweep_expired_pastes",
            side_effect=RuntimeError("boom"),
        )
        with exploding_sweep:
            run_debug(cli_args)  # must not raise

        # The default subcommand still got to print its help text.
        captured = capsys.readouterr()
        assert "Usage: hermes debug" in captured.out
|
||||
|
||||
|
||||
class TestRunDebugDelete:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue