From 8cecaf0b29bf0f3d468271a7d8b495393c43af11 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sun, 21 Jun 2026 17:52:23 -0700 Subject: [PATCH] feat(process): escalate SIGTERM->SIGKILL on host-pid termination after grace A daemon that ignores or stalls in its SIGTERM handler currently survives the process-registry reap and leaks until reboot (observed as agent-browser daemons accumulating to EMFILE on long-running gateways). _terminate_host_pid now snapshots the tree, SIGTERMs it, waits a bounded grace window (terminal.daemon_term_grace_seconds, default 2.0s, 0 disables), then SIGKILLs any survivor. The recycled-PID identity guard still gates the whole path, so escalation never reaches a stranger; Windows is unchanged (taskkill /F is already a hard kill). Config lives in config.yaml (terminal.daemon_term_grace_seconds), NOT an env var, per the .env-secrets-only policy. Implements the SIGKILL-escalation idea from @tkwong's #15008, reworked onto the current _terminate_host_pid tree-kill path (the original predated it) and config-gated instead of env-var-gated. Co-authored-by: Benjamin Wong --- hermes_cli/config.py | 6 ++ tests/tools/test_process_registry.py | 103 ++++++++++++++++++++++++++- tools/process_registry.py | 73 ++++++++++++++++--- 3 files changed, 172 insertions(+), 10 deletions(-) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index ec928d3aff6..173f04ec5dd 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1021,6 +1021,12 @@ DEFAULT_CONFIG = { "modal_mode": "auto", "cwd": ".", # Use current directory "timeout": 180, + # Bounded grace period (seconds) between SIGTERM and an escalated + # SIGKILL when terminating a host process tree (browser daemons, etc.). + # A daemon that stalls in its SIGTERM handler is force-killed after this + # window so it can't leak indefinitely. 0 disables escalation (SIGTERM + # only — the historical behavior). Floored internally at 0. + "daemon_term_grace_seconds": 2.0, # Environment variables to pass through to sandboxed execution # (terminal and execute_code). Skill-declared required_environment_variables # are passed through automatically; this list is for non-skill use cases. diff --git a/tests/tools/test_process_registry.py b/tests/tools/test_process_registry.py index 524a977b524..e2cc6545a30 100644 --- a/tests/tools/test_process_registry.py +++ b/tests/tools/test_process_registry.py @@ -964,8 +964,12 @@ class TestKillProcess: # ``ProcessRegistry._is_host_pid_alive`` (→ # ``gateway.status._pid_exists``), and the actual kill on POSIX # routes through ``psutil.Process(pid).terminate()``. Neither - # touches ``os.kill`` directly. Mock both seams. + # touches ``os.kill`` directly. Mock both seams. Disable the + # SIGKILL-escalation step (grace=0) so it doesn't call + # ``psutil.wait_procs`` on the FakeProcess. with patch("gateway.status._pid_exists", return_value=True), \ + patch.object(ProcessRegistry, "_daemon_term_grace_seconds", + staticmethod(lambda: 0.0)), \ patch.object(_psutil, "Process", side_effect=lambda pid: FakeProcess(pid)): result = registry.kill_process(s.id) @@ -1279,6 +1283,11 @@ class TestTerminateHostPidPosix: monkeypatch.setattr(pr, "_IS_WINDOWS", False) monkeypatch.setattr(psutil, "Process", _FakeParent) + # This test covers only the SIGTERM tree-walk ordering; disable the + # SIGKILL-escalation step (which would call psutil.wait_procs on the + # fakes) by setting the grace to 0. + monkeypatch.setattr(pr.ProcessRegistry, "_daemon_term_grace_seconds", + staticmethod(lambda: 0.0)) pr.ProcessRegistry._terminate_host_pid(12345) @@ -1436,3 +1445,95 @@ class TestPidReuseGuard: refreshed = registry._refresh_detached_session(s) assert refreshed.exited is True assert s.id in registry._finished + + +@pytest.mark.skipif(sys.platform == "win32", + reason="POSIX SIGTERM→SIGKILL escalation; Windows uses taskkill /F") +class TestSigkillEscalation: + """Bounded SIGTERM→SIGKILL escalation in _terminate_host_pid. + + A daemon that ignores/stalls on SIGTERM must be force-killed after the + configured grace window so it can't leak indefinitely — while well-behaved + processes still exit cleanly on SIGTERM and the recycled-PID guard is never + bypassed. + """ + + # A process that traps SIGTERM (ignores it): only SIGKILL stops it. + # It prints "ready" AFTER installing the handler so the parent never + # signals it during the startup window (before SIG_IGN is in place). + _TRAP = ( + "import signal, sys, time;" + "signal.signal(signal.SIGTERM, signal.SIG_IGN);" + "sys.stdout.write('ready\\n'); sys.stdout.flush();" + "[time.sleep(0.2) for _ in iter(int, 1)]" + ) + + def _spawn_trap(self): + proc = subprocess.Popen( + [sys.executable, "-c", self._TRAP], + stdout=subprocess.PIPE, text=True, + ) + # Wait until the handler is installed before returning. + line = proc.stdout.readline() + assert line.strip() == "ready", "trap process failed to start" + return proc + + def test_sigterm_ignoring_daemon_is_sigkilled(self, monkeypatch): + monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds", + staticmethod(lambda: 1.0)) + proc = self._spawn_trap() + try: + ProcessRegistry._terminate_host_pid(proc.pid) + assert _wait_until(lambda: proc.poll() is not None, timeout=4.0), \ + "SIGTERM-ignoring daemon should be SIGKILLed after grace" + finally: + if proc.poll() is None: + proc.kill() + proc.wait() + + def test_grace_zero_disables_escalation(self, monkeypatch): + monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds", + staticmethod(lambda: 0.0)) + proc = self._spawn_trap() + try: + ProcessRegistry._terminate_host_pid(proc.pid) + # No escalation → the SIGTERM-ignoring process survives. + assert not _wait_until(lambda: proc.poll() is not None, timeout=1.0) + assert proc.poll() is None + finally: + proc.kill() + proc.wait() + + def test_well_behaved_process_dies_on_sigterm(self, monkeypatch): + monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds", + staticmethod(lambda: 2.0)) + proc = _spawn_python_sleep(60) + try: + ProcessRegistry._terminate_host_pid(proc.pid) + assert _wait_until(lambda: proc.poll() is not None, timeout=3.0) + finally: + if proc.poll() is None: + proc.kill() + proc.wait() + + def test_escalation_does_not_bypass_recycled_pid_guard(self, monkeypatch): + """A start-time mismatch must still spare the PID — no SIGTERM, no SIGKILL.""" + monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds", + staticmethod(lambda: 1.0)) + proc = self._spawn_trap() + try: + real_start = ProcessRegistry._safe_host_start_time(proc.pid) + ProcessRegistry._terminate_host_pid( + proc.pid, expected_start=(real_start or 0) + 1) + assert not _wait_until(lambda: proc.poll() is not None, timeout=1.5) + assert proc.poll() is None + finally: + proc.kill() + proc.wait() + + def test_grace_reader_floors_at_zero(self, monkeypatch): + """A negative configured grace is clamped to 0 (no escalation).""" + import hermes_cli.config as cfg_mod + monkeypatch.setattr(cfg_mod, "read_raw_config", + lambda: {"terminal": {"daemon_term_grace_seconds": -5}}) + assert ProcessRegistry._daemon_term_grace_seconds() == 0.0 diff --git a/tools/process_registry.py b/tools/process_registry.py index 3d20e02d56f..91e24884174 100644 --- a/tools/process_registry.py +++ b/tools/process_registry.py @@ -483,6 +483,24 @@ class ProcessRegistry: self._move_to_finished(session) return session + @staticmethod + def _daemon_term_grace_seconds() -> float: + """Grace window (s) between SIGTERM and escalated SIGKILL. + + Read from ``terminal.daemon_term_grace_seconds`` in config.yaml; floored + at 0 (0 disables escalation). Falls back to the DEFAULT_CONFIG value if + config is unreadable, so callers always get a sane number. + """ + try: + from hermes_cli.config import read_raw_config, cfg_get, DEFAULT_CONFIG + cfg = read_raw_config() + val = cfg_get(cfg, "terminal", "daemon_term_grace_seconds") + if val is None: + val = DEFAULT_CONFIG["terminal"]["daemon_term_grace_seconds"] + return max(float(val), 0.0) + except Exception: + return 2.0 + @classmethod def _terminate_host_pid(cls, pid: int, expected_start: Optional[int] = None) -> None: """Terminate a host-visible PID and its descendants. @@ -496,12 +514,17 @@ class ProcessRegistry: POSIX: walks the process tree with ``psutil`` and SIGTERMs children before the parent so subprocess trees (e.g. Chromium renderers/GPU helpers spawned by an ``agent-browser`` daemon) - don't get reparented to init and survive cleanup. + don't get reparented to init and survive cleanup. After a bounded + grace window (``terminal.daemon_term_grace_seconds``) any tree member + that ignored SIGTERM — a daemon stalled in its signal handler — is + escalated to SIGKILL so it can't leak indefinitely. Set the grace to + 0 to disable escalation (SIGTERM only). Windows: shells out to ``taskkill /PID /T /F``. This is the documented Microsoft primitive for tree-kill and matches the - existing convention in ``gateway.status.terminate_pid``. We can't - reuse the POSIX psutil path on Windows because: + existing convention in ``gateway.status.terminate_pid``. ``/F`` is + already a hard kill, so no separate escalation step is needed. We + can't reuse the POSIX psutil path on Windows because: 1. Windows doesn't maintain a Unix-style process tree — ``psutil.Process.children(recursive=True)`` walks PPID @@ -550,12 +573,6 @@ class ProcessRegistry: import psutil try: parent = psutil.Process(pid) - for child in parent.children(recursive=True): - try: - child.terminate() - except psutil.NoSuchProcess: - pass - parent.terminate() except psutil.NoSuchProcess: return except (OSError, PermissionError): @@ -563,6 +580,44 @@ class ProcessRegistry: os.kill(pid, signal.SIGTERM) except (OSError, ProcessLookupError, PermissionError): pass + return + + # Snapshot the whole tree (children before parent) and SIGTERM each. + try: + targets = parent.children(recursive=True) + except (psutil.NoSuchProcess, psutil.AccessDenied, OSError): + targets = [] + targets.append(parent) + + for proc in targets: + try: + proc.terminate() + except psutil.NoSuchProcess: + pass + except (psutil.AccessDenied, OSError): + pass + + # Escalate to SIGKILL for anything that ignored SIGTERM within the + # grace window — a daemon stalled in its signal handler would otherwise + # leak indefinitely. + grace = cls._daemon_term_grace_seconds() + if grace <= 0: + return + try: + _gone, alive = psutil.wait_procs(targets, timeout=grace) + except (psutil.Error, OSError): + alive = [] + for proc in alive: + try: + proc.kill() # SIGKILL on POSIX + logger.info( + "Escalated to SIGKILL for pid %d (ignored SIGTERM within " + "%.1fs grace)", proc.pid, grace, + ) + except psutil.NoSuchProcess: + pass + except (psutil.AccessDenied, OSError): + pass # ----- Spawn -----