mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: two process leaks (agent-browser daemons, paste.rs sleepers) (#11843)
Both fixes close process leaks observed in production (18+ orphaned
agent-browser node daemons, 15+ orphaned paste.rs sleep interpreters
accumulated over ~3 days, ~2.7 GB RSS).
## agent-browser daemon leak
Previously the orphan reaper (_reap_orphaned_browser_sessions) only ran
from _start_browser_cleanup_thread, which is only invoked on the first
browser tool call in a process. Hermes sessions that never used the
browser never swept orphans, and the cross-process orphan detection
relied on in-process _active_sessions, which doesn't see other hermes
PIDs' sessions (race risk).
- Write <session>.owner_pid alongside the socket dir recording the
hermes PID that owns the daemon (extracted into _write_owner_pid for
direct testability).
- Reaper prefers owner_pid liveness over in-process _active_sessions.
Cross-process safe: concurrent hermes instances won't reap each
other's daemons. Legacy tracked_names fallback kept for daemons
that predate owner_pid.
- atexit handler (_emergency_cleanup_all_sessions) now always runs
the reaper, not just when this process had active sessions —
every clean hermes exit sweeps accumulated orphans.
## paste.rs auto-delete leak
_schedule_auto_delete spawned a detached Python subprocess per call
that slept 6 hours then issued DELETE requests. No dedup, no tracking —
every 'hermes debug share' invocation added ~20 MB of resident Python
interpreters that stuck around until the sleep finished.
- Replaced the spawn with ~/.hermes/pastes/pending.json: records
{url, expire_at} entries.
- _sweep_expired_pastes() synchronously DELETEs past-due entries on
every 'hermes debug' invocation (run_debug() dispatcher).
- Network failures stay in pending.json for up to 24h, then give up
(paste.rs's own retention handles the 'user never runs hermes again'
edge case).
- Zero subprocesses; regression test asserts subprocess/Popen/time.sleep
never appear in the function source (skipping docstrings via AST).
## Validation
| | Before | After |
|------------------------------|---------------|--------------|
| Orphan agent-browser daemons | 18 accumulated| 2 (live) |
| paste.rs sleep interpreters | 15 accumulated| 0 |
| RSS reclaimed | - | ~2.7 GB |
| Targeted tests | - | 2253 pass |
E2E verified: alive-owner daemons NOT reaped; dead-owner daemons
SIGTERM'd and socket dirs cleaned; pending.json sweep deletes expired
entries without spawning subprocesses.
This commit is contained in:
parent
64b354719f
commit
304fb921bf
4 changed files with 736 additions and 80 deletions
|
|
@ -6,7 +6,10 @@ Currently supports:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import io
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
|
@ -31,6 +34,119 @@ _MAX_LOG_BYTES = 512_000
|
||||||
_AUTO_DELETE_SECONDS = 21600
|
_AUTO_DELETE_SECONDS = 21600
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pending-deletion tracking (replaces the old fork-and-sleep subprocess).
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _pending_file() -> Path:
    """Location of the pending-deletion ledger, ``~/.hermes/pastes/pending.json``.

    The file holds a JSON list of ``{"url": ..., "expire_at": <unix_ts>}``
    records. Historically each scheduled DELETE forked a detached Python
    process that slept six hours before firing; repeated ``hermes debug
    share`` runs piled those interpreters up indefinitely. The schedule now
    lives on disk instead, and expired entries are swept opportunistically
    on the next ``hermes debug`` invocation.
    """
    return get_hermes_home() / "pastes" / "pending.json"
|
||||||
|
|
||||||
|
|
||||||
|
def _load_pending() -> list[dict]:
    """Load well-formed pending-deletion entries from pending.json.

    Returns:
        The list of entries that are dicts containing both ``url`` and
        ``expire_at`` keys. Returns ``[]`` when the file is missing,
        unreadable, or does not contain a JSON list — a corrupt ledger
        must never break the sweep.
    """
    path = _pending_file()
    if not path.exists():
        return []
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # json.JSONDecodeError is a ValueError subclass, so a single
        # ValueError clause covers malformed JSON (and bad unicode) —
        # listing JSONDecodeError separately was redundant.
        return []
    if not isinstance(data, list):
        return []
    # Filter to well-formed entries only.
    return [
        e for e in data
        if isinstance(e, dict) and "url" in e and "expire_at" in e
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _save_pending(entries: list[dict]) -> None:
    """Atomically persist *entries* to pending.json (best-effort).

    Writes to a sibling temp file and renames it into place so that a
    crash mid-write can't leave a truncated ledger behind. Any OSError
    is swallowed: losing the schedule only means the user may have to
    run ``hermes debug delete`` manually.
    """
    target = _pending_file()
    try:
        target.parent.mkdir(parents=True, exist_ok=True)
        scratch = target.with_suffix(".json.tmp")
        scratch.write_text(json.dumps(entries, indent=2), encoding="utf-8")
        os.replace(scratch, target)
    except OSError:
        # Non-fatal — worst case the user has to run ``hermes debug delete``
        # manually.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def _record_pending(urls: list[str], delay_seconds: int = _AUTO_DELETE_SECONDS) -> None:
    """Schedule *urls* for deletion at ``now + delay_seconds``.

    dpaste.com links auto-expire server-side, so only URLs that yield a
    paste.rs id are tracked. New entries are merged into any existing
    pending.json; a URL recorded twice keeps whichever expiration is later.
    """
    tracked = [url for url in urls if _extract_paste_id(url)]
    if not tracked:
        return

    # Fold the existing ledger into a url -> expire_at map so duplicate
    # URLs collapse to a single record carrying the later deadline.
    deadline_by_url: dict[str, float] = {
        entry["url"]: float(entry["expire_at"]) for entry in _load_pending()
    }
    new_deadline = time.time() + delay_seconds
    for url in tracked:
        deadline_by_url[url] = max(new_deadline, deadline_by_url.get(url, 0.0))

    _save_pending(
        [{"url": url, "expire_at": ts} for url, ts in deadline_by_url.items()]
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _sweep_expired_pastes(now: Optional[float] = None) -> tuple[int, int]:
    """Synchronously DELETE any pending pastes whose ``expire_at`` has passed.

    Args:
        now: override for the current unix timestamp (testing hook);
            defaults to ``time.time()``.

    Returns:
        ``(deleted, remaining)`` counts. Best-effort: a failed DELETE stays
        in the pending file and is retried on the next sweep, for up to 24h
        past expiration. Silent — intended to be called from every
        ``hermes debug`` invocation with minimal noise.
    """
    entries = _load_pending()
    if not entries:
        return (0, 0)

    current = time.time() if now is None else now
    deleted = 0
    remaining: list[dict] = []

    for entry in entries:
        try:
            expire_at = float(entry.get("expire_at", 0))
        except (TypeError, ValueError):
            continue  # drop malformed entries
        if expire_at > current:
            remaining.append(entry)
            continue

        url = entry.get("url", "")
        try:
            if delete_paste(url):
                deleted += 1
                continue
        except Exception:
            # Network hiccup, 404 (already gone), etc. — retried below
            # within the grace window; never allowed to crash the CLI.
            pass

        # Retain failed deletes for up to 24h past expiration, then give up.
        if expire_at + 86400 > current:
            remaining.append(entry)
        else:
            deleted += 1  # count as reaped (paste.rs will GC eventually)

    # Rewrite whenever the surviving list changed — not just on successful
    # deletes. The previous `if deleted:` condition meant malformed entries
    # (dropped via `continue` above) were never purged from disk and got
    # re-parsed on every future sweep.
    if deleted or len(remaining) != len(entries):
        _save_pending(remaining)

    return (deleted, len(remaining))
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Privacy / delete helpers
|
# Privacy / delete helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -90,37 +206,19 @@ def delete_paste(url: str) -> bool:
|
||||||
|
|
||||||
|
|
||||||
def _schedule_auto_delete(urls: list[str], delay_seconds: int = _AUTO_DELETE_SECONDS):
    """Record *urls* for deletion ``delay_seconds`` from now.

    Previously this spawned a detached Python subprocess per call that slept
    for 6 hours and then issued DELETE requests. Those subprocesses leaked —
    every ``hermes debug share`` invocation added ~20 MB of resident Python
    interpreters that never exited until the sleep completed.

    The replacement is stateless: we append to ``~/.hermes/pastes/pending.json``
    and rely on opportunistic sweeps (``_sweep_expired_pastes``) called from
    every ``hermes debug`` invocation. If the user never runs ``hermes debug``
    again, paste.rs's own retention policy handles cleanup.
    """
    # Zero subprocesses: just persist the schedule synchronously.
    _record_pending(urls, delay_seconds=delay_seconds)
|
|
||||||
|
|
||||||
|
|
||||||
def _delete_hint(url: str) -> str:
|
def _delete_hint(url: str) -> str:
|
||||||
|
|
@ -455,6 +553,16 @@ def run_debug_delete(args):
|
||||||
|
|
||||||
def run_debug(args):
|
def run_debug(args):
|
||||||
"""Route debug subcommands."""
|
"""Route debug subcommands."""
|
||||||
|
# Opportunistic sweep of expired pastes on every ``hermes debug`` call.
|
||||||
|
# Replaces the old per-paste sleeping subprocess that used to leak as
|
||||||
|
# one orphaned Python interpreter per scheduled deletion. Silent and
|
||||||
|
# best-effort — any failure is swallowed so ``hermes debug`` stays
|
||||||
|
# reliable even when offline.
|
||||||
|
try:
|
||||||
|
_sweep_expired_pastes()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
subcmd = getattr(args, "debug_command", None)
|
subcmd = getattr(args, "debug_command", None)
|
||||||
if subcmd == "share":
|
if subcmd == "share":
|
||||||
run_debug_share(args)
|
run_debug_share(args)
|
||||||
|
|
|
||||||
|
|
@ -501,40 +501,272 @@ class TestDeletePaste:
|
||||||
|
|
||||||
|
|
||||||
class TestScheduleAutoDelete:
    """``_schedule_auto_delete`` used to spawn a detached Python subprocess
    per call (one per paste URL batch). Those subprocesses slept 6 hours
    and accumulated forever under repeated use — 15+ orphaned interpreters
    were observed in production.

    The new implementation is stateless: it records pending deletions to
    ``~/.hermes/pastes/pending.json`` and lets ``_sweep_expired_pastes``
    handle the DELETE requests synchronously on the next ``hermes debug``
    invocation.
    """

    def test_does_not_spawn_subprocess(self, hermes_home):
        """Regression guard: _schedule_auto_delete must NEVER spawn subprocesses.

        We assert this structurally rather than by mocking Popen: the new
        implementation doesn't even import ``subprocess`` at module scope,
        so a mock patch wouldn't find it.
        """
        import ast
        import inspect
        from hermes_cli.debug import _schedule_auto_delete

        # Strip the docstring before scanning so the regression-rationale
        # prose inside it doesn't trigger our banned-word checks.
        source = inspect.getsource(_schedule_auto_delete)
        tree = ast.parse(source)
        func_node = tree.body[0]
        if (
            func_node.body
            and isinstance(func_node.body[0], ast.Expr)
            and isinstance(func_node.body[0].value, ast.Constant)
            and isinstance(func_node.body[0].value.value, str)
        ):
            func_node.body = func_node.body[1:]
        code_only = ast.unparse(func_node)

        assert "Popen" not in code_only, (
            "_schedule_auto_delete must not spawn subprocesses — "
            "use pending.json + _sweep_expired_pastes instead"
        )
        assert "subprocess" not in code_only, (
            "_schedule_auto_delete must not reference subprocess at all"
        )
        assert "time.sleep" not in code_only, (
            "Regression: sleeping in _schedule_auto_delete is the bug being fixed"
        )

        # And verify that calling it doesn't produce any orphaned children
        # (it should just write pending.json synchronously).
        import os as _os
        before = set(_os.listdir("/proc")) if _os.path.exists("/proc") else None
        _schedule_auto_delete(
            ["https://paste.rs/abc", "https://paste.rs/def"],
            delay_seconds=10,
        )
        if before is not None:
            after = set(_os.listdir("/proc"))
            new = after - before
            # Filter to only integer-named entries (process PIDs)
            new_pids = [p for p in new if p.isdigit()]
            # It's fine if unrelated processes appeared — we just need to make
            # sure we didn't spawn a long-sleeping one. The old bug spawned
            # a python interpreter whose cmdline contained "time.sleep".
            for pid in new_pids:
                try:
                    with open(f"/proc/{pid}/cmdline", "rb") as f:
                        cmdline = f.read().decode("utf-8", errors="replace")
                    assert "time.sleep" not in cmdline, (
                        f"Leaked sleeper subprocess PID {pid}: {cmdline}"
                    )
                except OSError:
                    pass  # process exited already

    def test_records_pending_to_json(self, hermes_home):
        """Scheduled URLs are persisted to pending.json with expiration."""
        from hermes_cli.debug import _schedule_auto_delete, _pending_file
        import json

        _schedule_auto_delete(
            ["https://paste.rs/abc", "https://paste.rs/def"],
            delay_seconds=10,
        )

        pending_path = _pending_file()
        assert pending_path.exists()

        entries = json.loads(pending_path.read_text())
        assert len(entries) == 2
        urls = {e["url"] for e in entries}
        assert urls == {"https://paste.rs/abc", "https://paste.rs/def"}

        # expire_at is ~now + delay_seconds
        import time
        for e in entries:
            assert e["expire_at"] > time.time()
            assert e["expire_at"] <= time.time() + 15

    def test_skips_non_paste_rs_urls(self, hermes_home):
        """dpaste.com URLs auto-expire — don't track them."""
        from hermes_cli.debug import _schedule_auto_delete, _pending_file

        _schedule_auto_delete(["https://dpaste.com/something"])

        # pending.json should not be created for non-paste.rs URLs
        assert not _pending_file().exists()

    def test_merges_with_existing_pending(self, hermes_home):
        """Subsequent calls merge into existing pending.json."""
        from hermes_cli.debug import _schedule_auto_delete, _load_pending

        _schedule_auto_delete(["https://paste.rs/first"], delay_seconds=10)
        _schedule_auto_delete(["https://paste.rs/second"], delay_seconds=10)

        entries = _load_pending()
        urls = {e["url"] for e in entries}
        assert urls == {"https://paste.rs/first", "https://paste.rs/second"}

    def test_dedupes_same_url(self, hermes_home):
        """Same URL recorded twice → one entry with the later expire_at."""
        from hermes_cli.debug import _schedule_auto_delete, _load_pending

        _schedule_auto_delete(["https://paste.rs/dup"], delay_seconds=10)
        _schedule_auto_delete(["https://paste.rs/dup"], delay_seconds=100)

        entries = _load_pending()
        assert len(entries) == 1
        assert entries[0]["url"] == "https://paste.rs/dup"
|
||||||
|
|
||||||
|
|
||||||
|
class TestSweepExpiredPastes:
    """Test the opportunistic sweep that replaces the sleeping subprocess."""

    def test_sweep_empty_is_noop(self, hermes_home):
        """No pending.json at all → (0, 0) and no network calls."""
        from hermes_cli.debug import _sweep_expired_pastes

        deleted, remaining = _sweep_expired_pastes()
        assert deleted == 0
        assert remaining == 0

    def test_sweep_deletes_expired_entries(self, hermes_home):
        """Expired entries are DELETEd; future entries survive in the ledger."""
        from hermes_cli.debug import (
            _sweep_expired_pastes,
            _save_pending,
            _load_pending,
        )
        import time

        # Seed pending.json with one expired + one future entry
        _save_pending([
            {"url": "https://paste.rs/expired", "expire_at": time.time() - 100},
            {"url": "https://paste.rs/future", "expire_at": time.time() + 3600},
        ])

        delete_calls = []

        def fake_delete(url):
            delete_calls.append(url)
            return True

        with patch("hermes_cli.debug.delete_paste", side_effect=fake_delete):
            deleted, remaining = _sweep_expired_pastes()

        assert delete_calls == ["https://paste.rs/expired"]
        assert deleted == 1
        assert remaining == 1

        entries = _load_pending()
        urls = {e["url"] for e in entries}
        assert urls == {"https://paste.rs/future"}

    def test_sweep_leaves_future_entries_alone(self, hermes_home):
        """Entries that haven't expired yet trigger no DELETE at all."""
        from hermes_cli.debug import _sweep_expired_pastes, _save_pending
        import time

        _save_pending([
            {"url": "https://paste.rs/future1", "expire_at": time.time() + 3600},
            {"url": "https://paste.rs/future2", "expire_at": time.time() + 7200},
        ])

        with patch("hermes_cli.debug.delete_paste") as mock_delete:
            deleted, remaining = _sweep_expired_pastes()

        mock_delete.assert_not_called()
        assert deleted == 0
        assert remaining == 2

    def test_sweep_survives_network_failure(self, hermes_home):
        """Failed DELETEs stay in pending.json until the 24h grace window."""
        from hermes_cli.debug import (
            _sweep_expired_pastes,
            _save_pending,
            _load_pending,
        )
        import time

        _save_pending([
            {"url": "https://paste.rs/flaky", "expire_at": time.time() - 100},
        ])

        with patch(
            "hermes_cli.debug.delete_paste",
            side_effect=Exception("network down"),
        ):
            deleted, remaining = _sweep_expired_pastes()

        # Failure within 24h grace → kept for retry
        assert deleted == 0
        assert remaining == 1
        assert len(_load_pending()) == 1

    def test_sweep_drops_entries_past_grace_window(self, hermes_home):
        """After 24h past expiration, give up even on network failures."""
        from hermes_cli.debug import (
            _sweep_expired_pastes,
            _save_pending,
            _load_pending,
        )
        import time

        # Expired 25 hours ago → past the 24h grace window
        very_old = time.time() - (25 * 3600)
        _save_pending([
            {"url": "https://paste.rs/ancient", "expire_at": very_old},
        ])

        with patch(
            "hermes_cli.debug.delete_paste",
            side_effect=Exception("network down"),
        ):
            deleted, remaining = _sweep_expired_pastes()

        assert deleted == 1
        assert remaining == 0
        assert _load_pending() == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunDebugSweepsOnInvocation:
    """``run_debug`` must sweep expired pastes on every invocation."""

    def test_run_debug_calls_sweep(self, hermes_home):
        """The dispatcher triggers exactly one sweep per invocation."""
        from hermes_cli.debug import run_debug

        args = MagicMock()
        args.debug_command = None  # default → prints help

        with patch("hermes_cli.debug._sweep_expired_pastes") as mock_sweep:
            run_debug(args)

        mock_sweep.assert_called_once()

    def test_run_debug_survives_sweep_failure(self, hermes_home, capsys):
        """If the sweep throws, the subcommand still runs."""
        from hermes_cli.debug import run_debug

        args = MagicMock()
        args.debug_command = None

        with patch(
            "hermes_cli.debug._sweep_expired_pastes",
            side_effect=RuntimeError("boom"),
        ):
            run_debug(args)  # must not raise

        # Default subcommand still printed help
        out = capsys.readouterr().out
        assert "Usage: hermes debug" in out
|
||||||
|
|
||||||
|
|
||||||
class TestRunDebugDelete:
|
class TestRunDebugDelete:
|
||||||
|
|
|
||||||
|
|
@ -28,12 +28,22 @@ def _isolate_sessions():
|
||||||
bt._active_sessions.update(orig)
|
bt._active_sessions.update(orig)
|
||||||
|
|
||||||
|
|
||||||
def _make_socket_dir(tmpdir, session_name, pid=None):
|
def _make_socket_dir(tmpdir, session_name, pid=None, owner_pid=None):
|
||||||
"""Create a fake agent-browser socket directory with optional PID file."""
|
"""Create a fake agent-browser socket directory with optional PID files.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
tmpdir: base temp directory
|
||||||
|
session_name: name like "h_abc1234567" or "cdp_abc1234567"
|
||||||
|
pid: daemon PID to write to <session>.pid (None = no file)
|
||||||
|
owner_pid: owning hermes PID to write to <session>.owner_pid
|
||||||
|
(None = no file; tests the legacy path)
|
||||||
|
"""
|
||||||
d = tmpdir / f"agent-browser-{session_name}"
|
d = tmpdir / f"agent-browser-{session_name}"
|
||||||
d.mkdir()
|
d.mkdir()
|
||||||
if pid is not None:
|
if pid is not None:
|
||||||
(d / f"{session_name}.pid").write_text(str(pid))
|
(d / f"{session_name}.pid").write_text(str(pid))
|
||||||
|
if owner_pid is not None:
|
||||||
|
(d / f"{session_name}.owner_pid").write_text(str(owner_pid))
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -62,7 +72,10 @@ class TestReapOrphanedBrowserSessions:
|
||||||
assert not d.exists()
|
assert not d.exists()
|
||||||
|
|
||||||
def test_orphaned_alive_daemon_is_killed(self, fake_tmpdir):
|
def test_orphaned_alive_daemon_is_killed(self, fake_tmpdir):
|
||||||
"""Alive daemon not tracked by _active_sessions gets SIGTERM."""
|
"""Alive daemon not tracked by _active_sessions gets SIGTERM (legacy path).
|
||||||
|
|
||||||
|
No owner_pid file => falls back to tracked_names check.
|
||||||
|
"""
|
||||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||||
|
|
||||||
d = _make_socket_dir(fake_tmpdir, "h_orphan12345", pid=12345)
|
d = _make_socket_dir(fake_tmpdir, "h_orphan12345", pid=12345)
|
||||||
|
|
@ -84,7 +97,7 @@ class TestReapOrphanedBrowserSessions:
|
||||||
assert (12345, signal.SIGTERM) in kill_calls
|
assert (12345, signal.SIGTERM) in kill_calls
|
||||||
|
|
||||||
def test_tracked_session_is_not_reaped(self, fake_tmpdir):
|
def test_tracked_session_is_not_reaped(self, fake_tmpdir):
|
||||||
"""Sessions tracked in _active_sessions are left alone."""
|
"""Sessions tracked in _active_sessions are left alone (legacy path)."""
|
||||||
import tools.browser_tool as bt
|
import tools.browser_tool as bt
|
||||||
from tools.browser_tool import _reap_orphaned_browser_sessions
|
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||||
|
|
||||||
|
|
@ -156,3 +169,240 @@ class TestReapOrphanedBrowserSessions:
|
||||||
|
|
||||||
_reap_orphaned_browser_sessions()
|
_reap_orphaned_browser_sessions()
|
||||||
assert not d.exists()
|
assert not d.exists()
|
||||||
|
|
||||||
|
|
||||||
|
class TestOwnerPidCrossProcess:
|
||||||
|
"""Tests for owner_pid-based cross-process safe reaping.
|
||||||
|
|
||||||
|
The owner_pid file records which hermes process owns a daemon so that
|
||||||
|
concurrent hermes processes don't reap each other's active browser
|
||||||
|
sessions. Added to fix orphan accumulation from crashed processes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_alive_owner_is_not_reaped_even_when_untracked(self, fake_tmpdir):
|
||||||
|
"""Daemon with alive owner_pid is NOT reaped, even if not in our _active_sessions.
|
||||||
|
|
||||||
|
This is the core cross-process safety check: Process B scanning while
|
||||||
|
Process A is using a browser must not kill A's daemon.
|
||||||
|
"""
|
||||||
|
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||||
|
|
||||||
|
# Use our own PID as the "owner" — guaranteed alive
|
||||||
|
d = _make_socket_dir(
|
||||||
|
fake_tmpdir, "h_alive_owner", pid=12345, owner_pid=os.getpid()
|
||||||
|
)
|
||||||
|
|
||||||
|
kill_calls = []
|
||||||
|
|
||||||
|
def mock_kill(pid, sig):
|
||||||
|
kill_calls.append((pid, sig))
|
||||||
|
if pid == os.getpid() and sig == 0:
|
||||||
|
return # real existence check: owner alive
|
||||||
|
if sig == 0:
|
||||||
|
return # pretend daemon exists too
|
||||||
|
# Don't actually kill anything
|
||||||
|
|
||||||
|
with patch("os.kill", side_effect=mock_kill):
|
||||||
|
_reap_orphaned_browser_sessions()
|
||||||
|
|
||||||
|
# We should have checked the owner (sig 0) but never tried to kill
|
||||||
|
# the daemon.
|
||||||
|
assert (12345, signal.SIGTERM) not in kill_calls
|
||||||
|
# Dir should still exist
|
||||||
|
assert d.exists()
|
||||||
|
|
||||||
|
def test_dead_owner_triggers_reap(self, fake_tmpdir):
|
||||||
|
"""Daemon whose owner_pid is dead gets reaped."""
|
||||||
|
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||||
|
|
||||||
|
# PID 999999999 almost certainly doesn't exist
|
||||||
|
d = _make_socket_dir(
|
||||||
|
fake_tmpdir, "h_dead_owner1", pid=12345, owner_pid=999999999
|
||||||
|
)
|
||||||
|
|
||||||
|
kill_calls = []
|
||||||
|
|
||||||
|
def mock_kill(pid, sig):
|
||||||
|
kill_calls.append((pid, sig))
|
||||||
|
if pid == 999999999 and sig == 0:
|
||||||
|
raise ProcessLookupError # owner dead
|
||||||
|
if pid == 12345 and sig == 0:
|
||||||
|
return # daemon still alive
|
||||||
|
# SIGTERM to daemon — noop in test
|
||||||
|
|
||||||
|
with patch("os.kill", side_effect=mock_kill):
|
||||||
|
_reap_orphaned_browser_sessions()
|
||||||
|
|
||||||
|
# Owner checked (returned dead), daemon checked (alive), daemon killed
|
||||||
|
assert (999999999, 0) in kill_calls
|
||||||
|
assert (12345, 0) in kill_calls
|
||||||
|
assert (12345, signal.SIGTERM) in kill_calls
|
||||||
|
# Dir cleaned up
|
||||||
|
assert not d.exists()
|
||||||
|
|
||||||
|
def test_corrupt_owner_pid_falls_back_to_legacy(self, fake_tmpdir):
|
||||||
|
"""Corrupt owner_pid file → fall back to tracked_names check."""
|
||||||
|
import tools.browser_tool as bt
|
||||||
|
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||||
|
|
||||||
|
session_name = "h_corrupt_own"
|
||||||
|
d = _make_socket_dir(fake_tmpdir, session_name, pid=12345)
|
||||||
|
# Write garbage to owner_pid file
|
||||||
|
(d / f"{session_name}.owner_pid").write_text("not-a-pid")
|
||||||
|
|
||||||
|
# Register session so legacy fallback leaves it alone
|
||||||
|
bt._active_sessions["task"] = {"session_name": session_name}
|
||||||
|
|
||||||
|
kill_calls = []
|
||||||
|
|
||||||
|
def mock_kill(pid, sig):
|
||||||
|
kill_calls.append((pid, sig))
|
||||||
|
|
||||||
|
with patch("os.kill", side_effect=mock_kill):
|
||||||
|
_reap_orphaned_browser_sessions()
|
||||||
|
|
||||||
|
# Legacy path took over → tracked → not reaped
|
||||||
|
assert (12345, signal.SIGTERM) not in kill_calls
|
||||||
|
assert d.exists()
|
||||||
|
|
||||||
|
def test_owner_pid_permission_error_treated_as_alive(self, fake_tmpdir):
|
||||||
|
"""If os.kill(owner, 0) raises PermissionError, treat owner as alive.
|
||||||
|
|
||||||
|
PermissionError means the PID exists but is owned by a different user —
|
||||||
|
we must not assume the owner is dead (could kill someone else's daemon).
|
||||||
|
"""
|
||||||
|
from tools.browser_tool import _reap_orphaned_browser_sessions
|
||||||
|
|
||||||
|
d = _make_socket_dir(
|
||||||
|
fake_tmpdir, "h_perm_owner1", pid=12345, owner_pid=22222
|
||||||
|
)
|
||||||
|
|
||||||
|
kill_calls = []
|
||||||
|
|
||||||
|
def mock_kill(pid, sig):
|
||||||
|
kill_calls.append((pid, sig))
|
||||||
|
if pid == 22222 and sig == 0:
|
||||||
|
raise PermissionError("not our user")
|
||||||
|
|
||||||
|
with patch("os.kill", side_effect=mock_kill):
|
||||||
|
_reap_orphaned_browser_sessions()
|
||||||
|
|
||||||
|
# Must NOT have tried to kill the daemon
|
||||||
|
assert (12345, signal.SIGTERM) not in kill_calls
|
||||||
|
assert d.exists()
|
||||||
|
|
||||||
|
def test_write_owner_pid_creates_file_with_current_pid(
|
||||||
|
self, fake_tmpdir, monkeypatch
|
||||||
|
):
|
||||||
|
"""_write_owner_pid(dir, session) writes <session>.owner_pid with os.getpid()."""
|
||||||
|
import tools.browser_tool as bt
|
||||||
|
|
||||||
|
session_name = "h_ownertest01"
|
||||||
|
socket_dir = fake_tmpdir / f"agent-browser-{session_name}"
|
||||||
|
socket_dir.mkdir()
|
||||||
|
|
||||||
|
bt._write_owner_pid(str(socket_dir), session_name)
|
||||||
|
|
||||||
|
owner_pid_file = socket_dir / f"{session_name}.owner_pid"
|
||||||
|
assert owner_pid_file.exists()
|
||||||
|
assert owner_pid_file.read_text().strip() == str(os.getpid())
|
||||||
|
|
||||||
|
def test_write_owner_pid_is_idempotent(self, fake_tmpdir):
    """Two _write_owner_pid calls must still leave exactly one owner_pid file."""
    import tools.browser_tool as bt

    session = "h_idempot1234"
    sock_dir = fake_tmpdir / f"agent-browser-{session}"
    sock_dir.mkdir()

    # Repeat the write; the second call should overwrite, not duplicate.
    for _ in range(2):
        bt._write_owner_pid(str(sock_dir), session)

    owner_files = list(sock_dir.glob("*.owner_pid"))
    assert len(owner_files) == 1
    assert owner_files[0].read_text().strip() == str(os.getpid())
|
def test_write_owner_pid_swallows_oserror(self, fake_tmpdir, monkeypatch):
    """An OSError during the write must not escape _write_owner_pid.

    The reaper simply falls back to the legacy tracked_names heuristic
    when no owner_pid file could be written.
    """
    import tools.browser_tool as bt

    def _boom(*args, **kwargs):
        raise OSError("permission denied")

    monkeypatch.setattr("builtins.open", _boom)

    # If this raises, the test fails — that's the whole assertion.
    bt._write_owner_pid(str(fake_tmpdir), "h_readonly123")
|
def test_run_browser_command_calls_write_owner_pid(
    self, fake_tmpdir, monkeypatch
):
    """_run_browser_command must invoke _write_owner_pid right after mkdir."""
    import tools.browser_tool as bt

    session = "h_wiringtest1"

    # Abort inside Popen so execution stops just past the owner_pid write.
    class _ExplodingPopen:
        def __init__(self, *args, **kwargs):
            raise RuntimeError("short-circuit after owner_pid")

    monkeypatch.setattr(bt.subprocess, "Popen", _ExplodingPopen)
    monkeypatch.setattr(bt, "_find_agent_browser", lambda: "/bin/true")
    monkeypatch.setattr(
        bt, "_requires_real_termux_browser_install", lambda *a: False
    )
    monkeypatch.setattr(
        bt,
        "_get_session_info",
        lambda task_id: {"session_name": session},
    )

    recorded_calls = []
    real_write = bt._write_owner_pid

    def _recording_write(*args, **kwargs):
        recorded_calls.append(args)
        real_write(*args, **kwargs)

    monkeypatch.setattr(bt, "_write_owner_pid", _recording_write)

    with patch("tools.browser_tool._socket_safe_tmpdir", return_value=str(fake_tmpdir)):
        try:
            bt._run_browser_command(task_id="test_task", command="goto", args=[])
        except Exception:
            # The fake Popen (or anything later) blowing up is expected.
            pass

    assert recorded_calls, "_run_browser_command must call _write_owner_pid"
    # Positional args: (socket_dir, session_name)
    socket_dir_arg = recorded_calls[0][0]
    session_name_arg = recorded_calls[0][1]
    assert session_name_arg == session
    assert session in socket_dir_arg
|
||||||
|
class TestEmergencyCleanupRunsReaper:
    """The atexit cleanup must sweep orphans even when no session is active."""

    def test_emergency_cleanup_calls_reaper(self, fake_tmpdir, monkeypatch):
        """_emergency_cleanup_all_sessions must call _reap_orphaned_browser_sessions."""
        import tools.browser_tool as bt

        # Clear the one-shot guard so the cleanup body actually executes.
        monkeypatch.setattr(bt, "_cleanup_done", False)

        invocations = []
        real_reaper = bt._reap_orphaned_browser_sessions

        def _wrapped_reaper():
            invocations.append(True)
            real_reaper()

        monkeypatch.setattr(
            bt, "_reap_orphaned_browser_sessions", _wrapped_reaper
        )

        # Deliberately no active sessions — the reaper must run regardless.
        bt._emergency_cleanup_all_sessions()

        assert invocations, (
            "Reaper must run on exit even with no active sessions"
        )
||||||
|
|
def _emergency_cleanup_all_sessions():
    """
    Emergency cleanup of all active browser sessions.
    Called on process exit or interrupt to prevent orphaned sessions.

    Also runs the orphan reaper to clean up daemons left behind by previously
    crashed hermes processes — this way every clean hermes exit sweeps
    accumulated orphans, not just ones that actively used the browser tool.
    """
    global _cleanup_done
    # One-shot guard: atexit and signal paths may both land here.
    if _cleanup_done:
        return
    _cleanup_done = True

    # Clean up this process's own sessions first, so their owner_pid files
    # are removed before the reaper scans.
    if _active_sessions:
        logger.info("Emergency cleanup: closing %s active session(s)...",
                    len(_active_sessions))
        try:
            cleanup_all_browsers()
        except Exception as e:
            # Best-effort on exit: log, never propagate out of atexit.
            logger.error("Emergency cleanup error: %s", e)
        finally:
            with _cleanup_lock:
                _active_sessions.clear()
                _session_last_activity.clear()
                _recording_sessions.clear()

    # Sweep orphans from other crashed hermes processes. Safe even if we
    # never used the browser — uses owner_pid liveness to avoid reaping
    # daemons owned by other live hermes processes.
    try:
        _reap_orphaned_browser_sessions()
    except Exception as e:
        logger.debug("Orphan reap on exit failed: %s", e)
||||||
# Register cleanup via atexit only. Previous versions installed SIGINT/SIGTERM
|
# Register cleanup via atexit only. Previous versions installed SIGINT/SIGTERM
|
||||||
|
|
@ -523,6 +534,24 @@ def _cleanup_inactive_browser_sessions():
|
||||||
logger.warning("Error cleaning up inactive session %s: %s", task_id, e)
|
logger.warning("Error cleaning up inactive session %s: %s", task_id, e)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_owner_pid(socket_dir: str, session_name: str) -> None:
|
||||||
|
"""Record the current hermes PID as the owner of a browser socket dir.
|
||||||
|
|
||||||
|
Written atomically to ``<socket_dir>/<session_name>.owner_pid`` so the
|
||||||
|
orphan reaper can distinguish daemons owned by a live hermes process
|
||||||
|
(don't reap) from daemons whose owner crashed (reap). Best-effort —
|
||||||
|
an OSError here just falls back to the legacy ``tracked_names``
|
||||||
|
heuristic in the reaper.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
path = os.path.join(socket_dir, f"{session_name}.owner_pid")
|
||||||
|
with open(path, "w") as f:
|
||||||
|
f.write(str(os.getpid()))
|
||||||
|
except OSError as exc:
|
||||||
|
logger.debug("Could not write owner_pid file for %s: %s",
|
||||||
|
session_name, exc)
|
||||||
|
|
||||||
|
|
||||||
def _reap_orphaned_browser_sessions():
|
def _reap_orphaned_browser_sessions():
|
||||||
"""Scan for orphaned agent-browser daemon processes from previous runs.
|
"""Scan for orphaned agent-browser daemon processes from previous runs.
|
||||||
|
|
||||||
|
|
@ -532,10 +561,19 @@ def _reap_orphaned_browser_sessions():
|
||||||
|
|
||||||
This function scans the tmp directory for ``agent-browser-*`` socket dirs
|
This function scans the tmp directory for ``agent-browser-*`` socket dirs
|
||||||
left behind by previous runs, reads the daemon PID files, and kills any
|
left behind by previous runs, reads the daemon PID files, and kills any
|
||||||
daemons that are still alive but not tracked by the current process.
|
daemons whose owning hermes process is no longer alive.
|
||||||
|
|
||||||
Called once on cleanup-thread startup — not every 30 seconds — to avoid
|
Ownership detection priority:
|
||||||
races with sessions being actively created.
|
1. ``<session>.owner_pid`` file (written by current code) — if the
|
||||||
|
referenced hermes PID is alive, leave the daemon alone regardless
|
||||||
|
of whether it's in *this* process's ``_active_sessions``. This is
|
||||||
|
cross-process safe: two concurrent hermes instances won't reap each
|
||||||
|
other's daemons.
|
||||||
|
2. Fallback for daemons that predate owner_pid: check
|
||||||
|
``_active_sessions`` in the current process. If not tracked here,
|
||||||
|
treat as orphan (legacy behavior).
|
||||||
|
|
||||||
|
Safe to call from any context — atexit, cleanup thread, or on demand.
|
||||||
"""
|
"""
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
|
|
@ -548,7 +586,7 @@ def _reap_orphaned_browser_sessions():
|
||||||
if not socket_dirs:
|
if not socket_dirs:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Build set of session_names currently tracked by this process
|
# Build set of session_names currently tracked by this process (fallback path)
|
||||||
with _cleanup_lock:
|
with _cleanup_lock:
|
||||||
tracked_names = {
|
tracked_names = {
|
||||||
info.get("session_name")
|
info.get("session_name")
|
||||||
|
|
@ -564,13 +602,38 @@ def _reap_orphaned_browser_sessions():
|
||||||
if not session_name:
|
if not session_name:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip sessions that we are actively tracking
|
# Ownership check: prefer owner_pid file (cross-process safe).
|
||||||
if session_name in tracked_names:
|
owner_pid_file = os.path.join(socket_dir, f"{session_name}.owner_pid")
|
||||||
|
owner_alive: Optional[bool] = None # None = owner_pid missing/unreadable
|
||||||
|
if os.path.isfile(owner_pid_file):
|
||||||
|
try:
|
||||||
|
owner_pid = int(Path(owner_pid_file).read_text().strip())
|
||||||
|
try:
|
||||||
|
os.kill(owner_pid, 0)
|
||||||
|
owner_alive = True
|
||||||
|
except ProcessLookupError:
|
||||||
|
owner_alive = False
|
||||||
|
except PermissionError:
|
||||||
|
# Owner exists but we can't signal it (different uid).
|
||||||
|
# Treat as alive — don't reap someone else's session.
|
||||||
|
owner_alive = True
|
||||||
|
except (ValueError, OSError):
|
||||||
|
owner_alive = None # corrupt file — fall through
|
||||||
|
|
||||||
|
if owner_alive is True:
|
||||||
|
# Owner is alive — this session belongs to a live hermes process.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if owner_alive is None:
|
||||||
|
# No owner_pid file (legacy daemon). Fall back to in-process
|
||||||
|
# tracking: if this process knows about the session, leave alone.
|
||||||
|
if session_name in tracked_names:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# owner_alive is False (dead owner) OR legacy daemon not tracked here.
|
||||||
pid_file = os.path.join(socket_dir, f"{session_name}.pid")
|
pid_file = os.path.join(socket_dir, f"{session_name}.pid")
|
||||||
if not os.path.isfile(pid_file):
|
if not os.path.isfile(pid_file):
|
||||||
# No PID file — just a stale dir, remove it
|
# No daemon PID file — just a stale dir, remove it
|
||||||
shutil.rmtree(socket_dir, ignore_errors=True)
|
shutil.rmtree(socket_dir, ignore_errors=True)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
@ -591,7 +654,7 @@ def _reap_orphaned_browser_sessions():
|
||||||
# Alive but owned by someone else — leave it alone
|
# Alive but owned by someone else — leave it alone
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Daemon is alive and not tracked — orphan. Kill it.
|
# Daemon is alive and its owner is dead (or legacy + untracked). Reap.
|
||||||
try:
|
try:
|
||||||
os.kill(daemon_pid, signal.SIGTERM)
|
os.kill(daemon_pid, signal.SIGTERM)
|
||||||
logger.info("Reaped orphaned browser daemon PID %d (session %s)",
|
logger.info("Reaped orphaned browser daemon PID %d (session %s)",
|
||||||
|
|
@ -1105,6 +1168,9 @@ def _run_browser_command(
|
||||||
f"agent-browser-{session_info['session_name']}"
|
f"agent-browser-{session_info['session_name']}"
|
||||||
)
|
)
|
||||||
os.makedirs(task_socket_dir, mode=0o700, exist_ok=True)
|
os.makedirs(task_socket_dir, mode=0o700, exist_ok=True)
|
||||||
|
# Record this hermes PID as the session owner (cross-process safe
|
||||||
|
# orphan detection — see _write_owner_pid).
|
||||||
|
_write_owner_pid(task_socket_dir, session_info['session_name'])
|
||||||
logger.debug("browser cmd=%s task=%s socket_dir=%s (%d chars)",
|
logger.debug("browser cmd=%s task=%s socket_dir=%s (%d chars)",
|
||||||
command, task_id, task_socket_dir, len(task_socket_dir))
|
command, task_id, task_socket_dir, len(task_socket_dir))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue