feat(process): escalate SIGTERM->SIGKILL on host-pid termination after grace

A daemon that ignores or stalls in its SIGTERM handler currently survives the
process-registry reap and leaks until reboot (observed as agent-browser
daemons accumulating to EMFILE on long-running gateways). _terminate_host_pid
now snapshots the tree, SIGTERMs it, waits a bounded grace window
(terminal.daemon_term_grace_seconds, default 2.0s, 0 disables), then SIGKILLs
any survivor. The recycled-PID identity guard still gates the whole path, so
escalation never reaches a stranger; Windows is unchanged (taskkill /F is
already a hard kill).

Config lives in config.yaml (terminal.daemon_term_grace_seconds), NOT an env
var, per the .env-secrets-only policy.

Implements the SIGKILL-escalation idea from @tkwong's #15008, reworked onto the
current _terminate_host_pid tree-kill path (the original predated it) and
config-gated instead of env-var-gated.

Co-authored-by: Benjamin Wong <tkwong@inspiresynergy.com>
This commit is contained in:
teknium1 2026-06-21 17:52:23 -07:00 committed by Teknium
parent 41fe086eb6
commit 8cecaf0b29
3 changed files with 172 additions and 10 deletions

View file

@ -1021,6 +1021,12 @@ DEFAULT_CONFIG = {
"modal_mode": "auto",
"cwd": ".", # Use current directory
"timeout": 180,
# Bounded grace period (seconds) between SIGTERM and an escalated
# SIGKILL when terminating a host process tree (browser daemons, etc.).
# A daemon that stalls in its SIGTERM handler is force-killed after this
# window so it can't leak indefinitely. 0 disables escalation (SIGTERM
# only — the historical behavior). Floored internally at 0.
"daemon_term_grace_seconds": 2.0,
# Environment variables to pass through to sandboxed execution
# (terminal and execute_code). Skill-declared required_environment_variables
# are passed through automatically; this list is for non-skill use cases.

View file

@ -964,8 +964,12 @@ class TestKillProcess:
# ``ProcessRegistry._is_host_pid_alive`` (→
# ``gateway.status._pid_exists``), and the actual kill on POSIX
# routes through ``psutil.Process(pid).terminate()``. Neither
# touches ``os.kill`` directly. Mock both seams.
# touches ``os.kill`` directly. Mock both seams. Disable the
# SIGKILL-escalation step (grace=0) so it doesn't call
# ``psutil.wait_procs`` on the FakeProcess.
with patch("gateway.status._pid_exists", return_value=True), \
patch.object(ProcessRegistry, "_daemon_term_grace_seconds",
staticmethod(lambda: 0.0)), \
patch.object(_psutil, "Process", side_effect=lambda pid: FakeProcess(pid)):
result = registry.kill_process(s.id)
@ -1279,6 +1283,11 @@ class TestTerminateHostPidPosix:
monkeypatch.setattr(pr, "_IS_WINDOWS", False)
monkeypatch.setattr(psutil, "Process", _FakeParent)
# This test covers only the SIGTERM tree-walk ordering; disable the
# SIGKILL-escalation step (which would call psutil.wait_procs on the
# fakes) by setting the grace to 0.
monkeypatch.setattr(pr.ProcessRegistry, "_daemon_term_grace_seconds",
staticmethod(lambda: 0.0))
pr.ProcessRegistry._terminate_host_pid(12345)
@ -1436,3 +1445,95 @@ class TestPidReuseGuard:
refreshed = registry._refresh_detached_session(s)
assert refreshed.exited is True
assert s.id in registry._finished
@pytest.mark.skipif(sys.platform == "win32",
reason="POSIX SIGTERM→SIGKILL escalation; Windows uses taskkill /F")
class TestSigkillEscalation:
"""Bounded SIGTERM→SIGKILL escalation in _terminate_host_pid.
A daemon that ignores/stalls on SIGTERM must be force-killed after the
configured grace window so it can't leak indefinitely — while well-behaved
processes still exit cleanly on SIGTERM and the recycled-PID guard is never
bypassed.
"""
# A process that traps SIGTERM (ignores it): only SIGKILL stops it.
# It prints "ready" AFTER installing the handler so the parent never
# signals it during the startup window (before SIG_IGN is in place).
_TRAP = (
"import signal, sys, time;"
"signal.signal(signal.SIGTERM, signal.SIG_IGN);"
"sys.stdout.write('ready\\n'); sys.stdout.flush();"
"[time.sleep(0.2) for _ in iter(int, 1)]"
)
def _spawn_trap(self):
proc = subprocess.Popen(
[sys.executable, "-c", self._TRAP],
stdout=subprocess.PIPE, text=True,
)
# Wait until the handler is installed before returning.
line = proc.stdout.readline()
assert line.strip() == "ready", "trap process failed to start"
return proc
def test_sigterm_ignoring_daemon_is_sigkilled(self, monkeypatch):
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
staticmethod(lambda: 1.0))
proc = self._spawn_trap()
try:
ProcessRegistry._terminate_host_pid(proc.pid)
assert _wait_until(lambda: proc.poll() is not None, timeout=4.0), \
"SIGTERM-ignoring daemon should be SIGKILLed after grace"
finally:
if proc.poll() is None:
proc.kill()
proc.wait()
def test_grace_zero_disables_escalation(self, monkeypatch):
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
staticmethod(lambda: 0.0))
proc = self._spawn_trap()
try:
ProcessRegistry._terminate_host_pid(proc.pid)
# No escalation → the SIGTERM-ignoring process survives.
assert not _wait_until(lambda: proc.poll() is not None, timeout=1.0)
assert proc.poll() is None
finally:
proc.kill()
proc.wait()
def test_well_behaved_process_dies_on_sigterm(self, monkeypatch):
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
staticmethod(lambda: 2.0))
proc = _spawn_python_sleep(60)
try:
ProcessRegistry._terminate_host_pid(proc.pid)
assert _wait_until(lambda: proc.poll() is not None, timeout=3.0)
finally:
if proc.poll() is None:
proc.kill()
proc.wait()
def test_escalation_does_not_bypass_recycled_pid_guard(self, monkeypatch):
"""A start-time mismatch must still spare the PID — no SIGTERM, no SIGKILL."""
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
staticmethod(lambda: 1.0))
proc = self._spawn_trap()
try:
real_start = ProcessRegistry._safe_host_start_time(proc.pid)
ProcessRegistry._terminate_host_pid(
proc.pid, expected_start=(real_start or 0) + 1)
assert not _wait_until(lambda: proc.poll() is not None, timeout=1.5)
assert proc.poll() is None
finally:
proc.kill()
proc.wait()
def test_grace_reader_floors_at_zero(self, monkeypatch):
"""A negative configured grace is clamped to 0 (no escalation)."""
import hermes_cli.config as cfg_mod
monkeypatch.setattr(cfg_mod, "read_raw_config",
lambda: {"terminal": {"daemon_term_grace_seconds": -5}})
assert ProcessRegistry._daemon_term_grace_seconds() == 0.0

View file

@ -483,6 +483,24 @@ class ProcessRegistry:
self._move_to_finished(session)
return session
@staticmethod
def _daemon_term_grace_seconds() -> float:
"""Grace window (s) between SIGTERM and escalated SIGKILL.
Read from ``terminal.daemon_term_grace_seconds`` in config.yaml; floored
at 0 (0 disables escalation). Falls back to the DEFAULT_CONFIG value if
config is unreadable, so callers always get a sane number.
"""
try:
from hermes_cli.config import read_raw_config, cfg_get, DEFAULT_CONFIG
cfg = read_raw_config()
val = cfg_get(cfg, "terminal", "daemon_term_grace_seconds")
if val is None:
val = DEFAULT_CONFIG["terminal"]["daemon_term_grace_seconds"]
return max(float(val), 0.0)
except Exception:
return 2.0
@classmethod
def _terminate_host_pid(cls, pid: int, expected_start: Optional[int] = None) -> None:
"""Terminate a host-visible PID and its descendants.
@ -496,12 +514,17 @@ class ProcessRegistry:
POSIX: walks the process tree with ``psutil`` and SIGTERMs
children before the parent so subprocess trees (e.g. Chromium
renderers/GPU helpers spawned by an ``agent-browser`` daemon)
don't get reparented to init and survive cleanup.
don't get reparented to init and survive cleanup. After a bounded
grace window (``terminal.daemon_term_grace_seconds``) any tree member
that ignored SIGTERM a daemon stalled in its signal handler is
escalated to SIGKILL so it can't leak indefinitely. Set the grace to
0 to disable escalation (SIGTERM only).
Windows: shells out to ``taskkill /PID <pid> /T /F``. This is
the documented Microsoft primitive for tree-kill and matches the
existing convention in ``gateway.status.terminate_pid``. We can't
reuse the POSIX psutil path on Windows because:
existing convention in ``gateway.status.terminate_pid``. ``/F`` is
already a hard kill, so no separate escalation step is needed. We
can't reuse the POSIX psutil path on Windows because:
1. Windows doesn't maintain a Unix-style process tree —
``psutil.Process.children(recursive=True)`` walks PPID
@ -550,12 +573,6 @@ class ProcessRegistry:
import psutil
try:
parent = psutil.Process(pid)
for child in parent.children(recursive=True):
try:
child.terminate()
except psutil.NoSuchProcess:
pass
parent.terminate()
except psutil.NoSuchProcess:
return
except (OSError, PermissionError):
@ -563,6 +580,44 @@ class ProcessRegistry:
os.kill(pid, signal.SIGTERM)
except (OSError, ProcessLookupError, PermissionError):
pass
return
# Snapshot the whole tree (children before parent) and SIGTERM each.
try:
targets = parent.children(recursive=True)
except (psutil.NoSuchProcess, psutil.AccessDenied, OSError):
targets = []
targets.append(parent)
for proc in targets:
try:
proc.terminate()
except psutil.NoSuchProcess:
pass
except (psutil.AccessDenied, OSError):
pass
# Escalate to SIGKILL for anything that ignored SIGTERM within the
# grace window — a daemon stalled in its signal handler would otherwise
# leak indefinitely.
grace = cls._daemon_term_grace_seconds()
if grace <= 0:
return
try:
_gone, alive = psutil.wait_procs(targets, timeout=grace)
except (psutil.Error, OSError):
alive = []
for proc in alive:
try:
proc.kill() # SIGKILL on POSIX
logger.info(
"Escalated to SIGKILL for pid %d (ignored SIGTERM within "
"%.1fs grace)", proc.pid, grace,
)
except psutil.NoSuchProcess:
pass
except (psutil.AccessDenied, OSError):
pass
# ----- Spawn -----