mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
feat(process): escalate SIGTERM->SIGKILL on host-pid termination after grace
A daemon that ignores or stalls in its SIGTERM handler currently survives the process-registry reap and leaks until reboot (observed as agent-browser daemons accumulating to EMFILE on long-running gateways). _terminate_host_pid now snapshots the tree, SIGTERMs it, waits a bounded grace window (terminal.daemon_term_grace_seconds, default 2.0s, 0 disables), then SIGKILLs any survivor. The recycled-PID identity guard still gates the whole path, so escalation never reaches a stranger; Windows is unchanged (taskkill /F is already a hard kill). Config lives in config.yaml (terminal.daemon_term_grace_seconds), NOT an env var, per the .env-secrets-only policy. Implements the SIGKILL-escalation idea from @tkwong's #15008, reworked onto the current _terminate_host_pid tree-kill path (the original predated it) and config-gated instead of env-var-gated. Co-authored-by: Benjamin Wong <tkwong@inspiresynergy.com>
This commit is contained in:
parent
41fe086eb6
commit
8cecaf0b29
3 changed files with 172 additions and 10 deletions
|
|
@ -1021,6 +1021,12 @@ DEFAULT_CONFIG = {
|
|||
"modal_mode": "auto",
|
||||
"cwd": ".", # Use current directory
|
||||
"timeout": 180,
|
||||
# Bounded grace period (seconds) between SIGTERM and an escalated
|
||||
# SIGKILL when terminating a host process tree (browser daemons, etc.).
|
||||
# A daemon that stalls in its SIGTERM handler is force-killed after this
|
||||
# window so it can't leak indefinitely. 0 disables escalation (SIGTERM
|
||||
# only — the historical behavior). Floored internally at 0.
|
||||
"daemon_term_grace_seconds": 2.0,
|
||||
# Environment variables to pass through to sandboxed execution
|
||||
# (terminal and execute_code). Skill-declared required_environment_variables
|
||||
# are passed through automatically; this list is for non-skill use cases.
|
||||
|
|
|
|||
|
|
@ -964,8 +964,12 @@ class TestKillProcess:
|
|||
# ``ProcessRegistry._is_host_pid_alive`` (→
|
||||
# ``gateway.status._pid_exists``), and the actual kill on POSIX
|
||||
# routes through ``psutil.Process(pid).terminate()``. Neither
|
||||
# touches ``os.kill`` directly. Mock both seams.
|
||||
# touches ``os.kill`` directly. Mock both seams. Disable the
|
||||
# SIGKILL-escalation step (grace=0) so it doesn't call
|
||||
# ``psutil.wait_procs`` on the FakeProcess.
|
||||
with patch("gateway.status._pid_exists", return_value=True), \
|
||||
patch.object(ProcessRegistry, "_daemon_term_grace_seconds",
|
||||
staticmethod(lambda: 0.0)), \
|
||||
patch.object(_psutil, "Process", side_effect=lambda pid: FakeProcess(pid)):
|
||||
result = registry.kill_process(s.id)
|
||||
|
||||
|
|
@ -1279,6 +1283,11 @@ class TestTerminateHostPidPosix:
|
|||
|
||||
monkeypatch.setattr(pr, "_IS_WINDOWS", False)
|
||||
monkeypatch.setattr(psutil, "Process", _FakeParent)
|
||||
# This test covers only the SIGTERM tree-walk ordering; disable the
|
||||
# SIGKILL-escalation step (which would call psutil.wait_procs on the
|
||||
# fakes) by setting the grace to 0.
|
||||
monkeypatch.setattr(pr.ProcessRegistry, "_daemon_term_grace_seconds",
|
||||
staticmethod(lambda: 0.0))
|
||||
|
||||
pr.ProcessRegistry._terminate_host_pid(12345)
|
||||
|
||||
|
|
@ -1436,3 +1445,95 @@ class TestPidReuseGuard:
|
|||
refreshed = registry._refresh_detached_session(s)
|
||||
assert refreshed.exited is True
|
||||
assert s.id in registry._finished
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32",
|
||||
reason="POSIX SIGTERM→SIGKILL escalation; Windows uses taskkill /F")
|
||||
class TestSigkillEscalation:
|
||||
"""Bounded SIGTERM→SIGKILL escalation in _terminate_host_pid.
|
||||
|
||||
A daemon that ignores/stalls on SIGTERM must be force-killed after the
|
||||
configured grace window so it can't leak indefinitely — while well-behaved
|
||||
processes still exit cleanly on SIGTERM and the recycled-PID guard is never
|
||||
bypassed.
|
||||
"""
|
||||
|
||||
# A process that traps SIGTERM (ignores it): only SIGKILL stops it.
|
||||
# It prints "ready" AFTER installing the handler so the parent never
|
||||
# signals it during the startup window (before SIG_IGN is in place).
|
||||
_TRAP = (
|
||||
"import signal, sys, time;"
|
||||
"signal.signal(signal.SIGTERM, signal.SIG_IGN);"
|
||||
"sys.stdout.write('ready\\n'); sys.stdout.flush();"
|
||||
"[time.sleep(0.2) for _ in iter(int, 1)]"
|
||||
)
|
||||
|
||||
def _spawn_trap(self):
|
||||
proc = subprocess.Popen(
|
||||
[sys.executable, "-c", self._TRAP],
|
||||
stdout=subprocess.PIPE, text=True,
|
||||
)
|
||||
# Wait until the handler is installed before returning.
|
||||
line = proc.stdout.readline()
|
||||
assert line.strip() == "ready", "trap process failed to start"
|
||||
return proc
|
||||
|
||||
def test_sigterm_ignoring_daemon_is_sigkilled(self, monkeypatch):
|
||||
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
|
||||
staticmethod(lambda: 1.0))
|
||||
proc = self._spawn_trap()
|
||||
try:
|
||||
ProcessRegistry._terminate_host_pid(proc.pid)
|
||||
assert _wait_until(lambda: proc.poll() is not None, timeout=4.0), \
|
||||
"SIGTERM-ignoring daemon should be SIGKILLed after grace"
|
||||
finally:
|
||||
if proc.poll() is None:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_grace_zero_disables_escalation(self, monkeypatch):
|
||||
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
|
||||
staticmethod(lambda: 0.0))
|
||||
proc = self._spawn_trap()
|
||||
try:
|
||||
ProcessRegistry._terminate_host_pid(proc.pid)
|
||||
# No escalation → the SIGTERM-ignoring process survives.
|
||||
assert not _wait_until(lambda: proc.poll() is not None, timeout=1.0)
|
||||
assert proc.poll() is None
|
||||
finally:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_well_behaved_process_dies_on_sigterm(self, monkeypatch):
|
||||
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
|
||||
staticmethod(lambda: 2.0))
|
||||
proc = _spawn_python_sleep(60)
|
||||
try:
|
||||
ProcessRegistry._terminate_host_pid(proc.pid)
|
||||
assert _wait_until(lambda: proc.poll() is not None, timeout=3.0)
|
||||
finally:
|
||||
if proc.poll() is None:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_escalation_does_not_bypass_recycled_pid_guard(self, monkeypatch):
|
||||
"""A start-time mismatch must still spare the PID — no SIGTERM, no SIGKILL."""
|
||||
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
|
||||
staticmethod(lambda: 1.0))
|
||||
proc = self._spawn_trap()
|
||||
try:
|
||||
real_start = ProcessRegistry._safe_host_start_time(proc.pid)
|
||||
ProcessRegistry._terminate_host_pid(
|
||||
proc.pid, expected_start=(real_start or 0) + 1)
|
||||
assert not _wait_until(lambda: proc.poll() is not None, timeout=1.5)
|
||||
assert proc.poll() is None
|
||||
finally:
|
||||
proc.kill()
|
||||
proc.wait()
|
||||
|
||||
def test_grace_reader_floors_at_zero(self, monkeypatch):
|
||||
"""A negative configured grace is clamped to 0 (no escalation)."""
|
||||
import hermes_cli.config as cfg_mod
|
||||
monkeypatch.setattr(cfg_mod, "read_raw_config",
|
||||
lambda: {"terminal": {"daemon_term_grace_seconds": -5}})
|
||||
assert ProcessRegistry._daemon_term_grace_seconds() == 0.0
|
||||
|
|
|
|||
|
|
@ -483,6 +483,24 @@ class ProcessRegistry:
|
|||
self._move_to_finished(session)
|
||||
return session
|
||||
|
||||
@staticmethod
|
||||
def _daemon_term_grace_seconds() -> float:
|
||||
"""Grace window (s) between SIGTERM and escalated SIGKILL.
|
||||
|
||||
Read from ``terminal.daemon_term_grace_seconds`` in config.yaml; floored
|
||||
at 0 (0 disables escalation). Falls back to the DEFAULT_CONFIG value if
|
||||
config is unreadable, so callers always get a sane number.
|
||||
"""
|
||||
try:
|
||||
from hermes_cli.config import read_raw_config, cfg_get, DEFAULT_CONFIG
|
||||
cfg = read_raw_config()
|
||||
val = cfg_get(cfg, "terminal", "daemon_term_grace_seconds")
|
||||
if val is None:
|
||||
val = DEFAULT_CONFIG["terminal"]["daemon_term_grace_seconds"]
|
||||
return max(float(val), 0.0)
|
||||
except Exception:
|
||||
return 2.0
|
||||
|
||||
@classmethod
|
||||
def _terminate_host_pid(cls, pid: int, expected_start: Optional[int] = None) -> None:
|
||||
"""Terminate a host-visible PID and its descendants.
|
||||
|
|
@ -496,12 +514,17 @@ class ProcessRegistry:
|
|||
POSIX: walks the process tree with ``psutil`` and SIGTERMs
|
||||
children before the parent so subprocess trees (e.g. Chromium
|
||||
renderers/GPU helpers spawned by an ``agent-browser`` daemon)
|
||||
don't get reparented to init and survive cleanup.
|
||||
don't get reparented to init and survive cleanup. After a bounded
|
||||
grace window (``terminal.daemon_term_grace_seconds``) any tree member
|
||||
that ignored SIGTERM — a daemon stalled in its signal handler — is
|
||||
escalated to SIGKILL so it can't leak indefinitely. Set the grace to
|
||||
0 to disable escalation (SIGTERM only).
|
||||
|
||||
Windows: shells out to ``taskkill /PID <pid> /T /F``. This is
|
||||
the documented Microsoft primitive for tree-kill and matches the
|
||||
existing convention in ``gateway.status.terminate_pid``. We can't
|
||||
reuse the POSIX psutil path on Windows because:
|
||||
existing convention in ``gateway.status.terminate_pid``. ``/F`` is
|
||||
already a hard kill, so no separate escalation step is needed. We
|
||||
can't reuse the POSIX psutil path on Windows because:
|
||||
|
||||
1. Windows doesn't maintain a Unix-style process tree —
|
||||
``psutil.Process.children(recursive=True)`` walks PPID
|
||||
|
|
@ -550,12 +573,6 @@ class ProcessRegistry:
|
|||
import psutil
|
||||
try:
|
||||
parent = psutil.Process(pid)
|
||||
for child in parent.children(recursive=True):
|
||||
try:
|
||||
child.terminate()
|
||||
except psutil.NoSuchProcess:
|
||||
pass
|
||||
parent.terminate()
|
||||
except psutil.NoSuchProcess:
|
||||
return
|
||||
except (OSError, PermissionError):
|
||||
|
|
@ -563,6 +580,44 @@ class ProcessRegistry:
|
|||
os.kill(pid, signal.SIGTERM)
|
||||
except (OSError, ProcessLookupError, PermissionError):
|
||||
pass
|
||||
return
|
||||
|
||||
# Snapshot the whole tree (children before parent) and SIGTERM each.
|
||||
try:
|
||||
targets = parent.children(recursive=True)
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied, OSError):
|
||||
targets = []
|
||||
targets.append(parent)
|
||||
|
||||
for proc in targets:
|
||||
try:
|
||||
proc.terminate()
|
||||
except psutil.NoSuchProcess:
|
||||
pass
|
||||
except (psutil.AccessDenied, OSError):
|
||||
pass
|
||||
|
||||
# Escalate to SIGKILL for anything that ignored SIGTERM within the
|
||||
# grace window — a daemon stalled in its signal handler would otherwise
|
||||
# leak indefinitely.
|
||||
grace = cls._daemon_term_grace_seconds()
|
||||
if grace <= 0:
|
||||
return
|
||||
try:
|
||||
_gone, alive = psutil.wait_procs(targets, timeout=grace)
|
||||
except (psutil.Error, OSError):
|
||||
alive = []
|
||||
for proc in alive:
|
||||
try:
|
||||
proc.kill() # SIGKILL on POSIX
|
||||
logger.info(
|
||||
"Escalated to SIGKILL for pid %d (ignored SIGTERM within "
|
||||
"%.1fs grace)", proc.pid, grace,
|
||||
)
|
||||
except psutil.NoSuchProcess:
|
||||
pass
|
||||
except (psutil.AccessDenied, OSError):
|
||||
pass
|
||||
|
||||
# ----- Spawn -----
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue