mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(kanban): SIGTERM on worker must terminate the process (#28181)
The single-query signal handler in cli.py raises KeyboardInterrupt on SIGTERM/SIGHUP. For interactive 'hermes chat -q' that unwinds the main thread cleanly. For kanban workers spawned by the dispatcher, the worker process is likely to have a non-daemon thread alive (terminal _wait_for_process, custom plugins, etc.). With KeyboardInterrupt only the main thread unwinds; the non-daemon thread keeps the process alive, the gateway has already restarted, and the dispatcher's _pid_alive check returns True forever — task stuck in 'running' indefinitely. When HERMES_KANBAN_TASK is set (dispatcher-spawned worker), flush logging + stdout/stderr, then os._exit(0) instead of raising KeyboardInterrupt. The kernel reclaims the PID immediately, and the existing zombie-state detection in _pid_alive flips the task to crashed on the next dispatcher tick. detect_crashed_workers then re-spawns it on the following tick — no manual recovery needed. A SIGALRM(2s) deadman is armed before the flush so a pathological blocking-I/O flush can't wedge the worker forever. In practice the reporter measured flush in <1ms; the alarm is a failsafe, never the common path. Interactive (non-kanban) chat -q is unchanged — the env-gated branch only fires for dispatcher-spawned workers. Live verification on this machine: - Without HERMES_KANBAN_TASK + non-daemon thread alive: process hangs alive 4+ seconds after SIGTERM. Dispatcher's _pid_alive returns True → task stuck. - With HERMES_KANBAN_TASK + same non-daemon thread: process exits in 0.10s via os._exit(0). Dispatcher reclaims on next tick. Tests: - tests/hermes_cli/test_signal_handler_kanban_worker.py (3 cases): end-to-end subprocess test with a non-daemon thread, HERMES_KANBAN_TASK env, SIGTERM, dispatcher-style _pid_alive check. Plus a source-level invariant test catching future refactors that drop the env-gated exit. - 452/452 kanban tests pass. Co-authored-by: andrewhosf <andrewho.sf@gmail.com>
This commit is contained in:
parent
3a9bc9d88a
commit
f30db14ced
2 changed files with 263 additions and 0 deletions
33
cli.py
33
cli.py
|
|
@ -14980,6 +14980,39 @@ def main(
|
|||
time.sleep(_grace)
|
||||
except Exception:
|
||||
pass # never block signal handling
|
||||
# Kanban worker exit path (#28181): SIGTERM hits a dispatcher-spawned
|
||||
# worker that's likely in a non-daemon thread waiting on a child
|
||||
# subprocess in _wait_for_process. Raising KeyboardInterrupt only
|
||||
# unwinds the main thread; the worker thread keeps running, the
|
||||
# process gets reparented to init, and the dispatcher's _pid_alive
|
||||
# check returns True forever — task stuck in 'running' indefinitely.
|
||||
# Skip the controlled-unwind dance and call os._exit(0) so the kernel
|
||||
# reclaims the PID immediately and detect_crashed_workers can reclaim
|
||||
# the stale claim on the next tick. Flush logging + stdout/stderr
|
||||
# first so the final debug trace isn't lost; SIGALRM deadman guards
|
||||
# the flush against any rare blocking-I/O case (the reporter measured
|
||||
# flush in <1ms; the alarm is a failsafe, not the common path).
|
||||
if os.environ.get("HERMES_KANBAN_TASK"):
|
||||
try:
|
||||
import signal as _sig_mod
|
||||
if hasattr(_sig_mod, "SIGALRM"):
|
||||
# Cancel any pre-existing alarm to avoid colliding with
|
||||
# caller-installed timers.
|
||||
_sig_mod.signal(_sig_mod.SIGALRM, lambda *_: os._exit(0))
|
||||
_sig_mod.alarm(2)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
import logging as _lg
|
||||
_lg.shutdown()
|
||||
except Exception:
|
||||
pass
|
||||
for _stream in (sys.stdout, sys.stderr):
|
||||
try:
|
||||
_stream.flush()
|
||||
except Exception:
|
||||
pass
|
||||
os._exit(0)
|
||||
raise KeyboardInterrupt()
|
||||
try:
|
||||
import signal as _signal
|
||||
|
|
|
|||
230
tests/hermes_cli/test_signal_handler_kanban_worker.py
Normal file
230
tests/hermes_cli/test_signal_handler_kanban_worker.py
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
"""Regression test for #28181 — kanban worker SIGTERM must terminate the process.
|
||||
|
||||
The single-query signal handler in cli.py (``_signal_handler_q``) raises
|
||||
``KeyboardInterrupt`` to unwind the main thread on SIGTERM/SIGHUP. That works
|
||||
for interactive ``hermes chat -q`` invocations, but kanban workers spawned by
|
||||
the dispatcher are likely to have a non-daemon thread alive (terminal_tool's
|
||||
``_wait_for_process``, custom plugin background workers, etc.). With
|
||||
``KeyboardInterrupt`` only the main thread unwinds; the non-daemon thread
|
||||
keeps the process alive after the gateway has already restarted, the kanban
|
||||
dispatcher's ``_pid_alive`` check returns True forever, and the task stays
|
||||
``running`` indefinitely.
|
||||
|
||||
The fix: when the process is a dispatcher-spawned worker (``HERMES_KANBAN_TASK``
|
||||
env var set), flush logging + stdout/stderr and call ``os._exit(0)`` instead.
|
||||
The kernel reclaims the PID immediately, and ``detect_crashed_workers``
|
||||
reclaims the stale claim on the next dispatcher tick.
|
||||
|
||||
These tests use a synthetic Python script that mirrors the cli.py signal
|
||||
handler shape so we can exercise the exit-path contract without booting the
|
||||
full CLI (which needs a real provider config).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _synthetic_worker_script() -> str:
|
||||
"""A standalone script that mirrors cli.py's single-query SIGTERM handler.
|
||||
|
||||
Keeping the synthetic copy here means the test exercises the exact handler
|
||||
shape without needing the full hermes_cli boot path (config, providers,
|
||||
skills, etc.). If the production handler in cli.py drifts, the test
|
||||
that loads the real handler (test_real_handler_uses_os_exit) will catch it.
|
||||
"""
|
||||
return textwrap.dedent(
|
||||
"""
|
||||
import os, signal, sys, threading, time
|
||||
|
||||
# Non-daemon thread that blocks forever — simulates the worker
|
||||
# thread that would prevent orderly Python shutdown after
|
||||
# KeyboardInterrupt unwinds main.
|
||||
stuck = threading.Event()
|
||||
threading.Thread(target=stuck.wait, daemon=False).start()
|
||||
|
||||
def handler(signum, frame):
|
||||
# Mirrors cli.py:_signal_handler_q. Real handler sleeps 1.5s; the
|
||||
# test uses a short grace so it runs fast.
|
||||
try:
|
||||
time.sleep(0.05)
|
||||
except Exception:
|
||||
pass
|
||||
if os.environ.get("HERMES_KANBAN_TASK"):
|
||||
try:
|
||||
if hasattr(signal, "SIGALRM"):
|
||||
signal.signal(signal.SIGALRM, lambda *_: os._exit(0))
|
||||
signal.alarm(2)
|
||||
except Exception:
|
||||
pass
|
||||
sys.stdout.flush()
|
||||
sys.stderr.flush()
|
||||
os._exit(0)
|
||||
raise KeyboardInterrupt()
|
||||
|
||||
signal.signal(signal.SIGTERM, handler)
|
||||
print("READY", flush=True)
|
||||
try:
|
||||
threading.Event().wait()
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def _is_alive_like_dispatcher(pid: int) -> bool:
|
||||
"""Mirrors hermes_cli/kanban_db.py:_pid_alive on Linux.
|
||||
|
||||
A zombie is treated as dead — the dispatcher's _pid_alive checks
|
||||
/proc/<pid>/status for State: Z. We replicate that here so a clean
|
||||
os._exit followed by zombie-state is correctly counted as dead.
|
||||
"""
|
||||
if pid <= 0:
|
||||
return False
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except PermissionError:
|
||||
return True
|
||||
if sys.platform == "linux":
|
||||
try:
|
||||
with open(f"/proc/{pid}/status") as f:
|
||||
for line in f:
|
||||
if line.startswith("State:"):
|
||||
if "Z" in line.split(":", 1)[1]:
|
||||
return False
|
||||
break
|
||||
except (FileNotFoundError, PermissionError, OSError):
|
||||
pass
|
||||
return True
|
||||
|
||||
|
||||
def _spawn_synthetic(env_overrides: dict) -> subprocess.Popen:
|
||||
env = dict(os.environ)
|
||||
env.update(env_overrides)
|
||||
proc = subprocess.Popen(
|
||||
[sys.executable, "-u", "-c", _synthetic_worker_script()],
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
start_new_session=True,
|
||||
)
|
||||
# Wait for "READY" so we know the signal handler is installed.
|
||||
assert proc.stdout is not None
|
||||
deadline = time.time() + 5.0
|
||||
while time.time() < deadline:
|
||||
line = proc.stdout.readline()
|
||||
if line and line.startswith(b"READY"):
|
||||
return proc
|
||||
proc.kill()
|
||||
raise RuntimeError("synthetic worker never signalled READY")
|
||||
|
||||
|
||||
def _cleanup(proc: subprocess.Popen) -> None:
|
||||
try:
|
||||
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
|
||||
except (ProcessLookupError, PermissionError):
|
||||
pass
|
||||
try:
|
||||
proc.communicate(timeout=2)
|
||||
except subprocess.TimeoutExpired:
|
||||
proc.kill()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform == "win32",
|
||||
reason="SIGTERM semantics differ on Windows; kanban dispatcher is POSIX-only",
|
||||
)
|
||||
def test_sigterm_with_kanban_task_env_terminates_quickly():
|
||||
"""With HERMES_KANBAN_TASK set, SIGTERM should kill the process in <2s
|
||||
even when a non-daemon thread is still alive."""
|
||||
proc = _spawn_synthetic({"HERMES_KANBAN_TASK": "t_test_28181"})
|
||||
try:
|
||||
t0 = time.time()
|
||||
os.kill(proc.pid, signal.SIGTERM)
|
||||
|
||||
# Should die in <2s. The handler sleeps ~50ms, then os._exit(0)
|
||||
# is immediate. Give generous headroom for slow CI runners.
|
||||
deadline = t0 + 2.0
|
||||
while time.time() < deadline:
|
||||
if not _is_alive_like_dispatcher(proc.pid):
|
||||
elapsed = time.time() - t0
|
||||
assert elapsed < 2.0
|
||||
return
|
||||
time.sleep(0.02)
|
||||
pytest.fail(
|
||||
f"process still alive 2s after SIGTERM with HERMES_KANBAN_TASK set "
|
||||
f"(dispatcher would keep extending claim) — fix regressed"
|
||||
)
|
||||
finally:
|
||||
_cleanup(proc)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
sys.platform == "win32",
|
||||
reason="SIGTERM semantics differ on Windows; kanban dispatcher is POSIX-only",
|
||||
)
|
||||
def test_sigterm_without_kanban_task_env_uses_keyboard_interrupt_path():
|
||||
"""Without HERMES_KANBAN_TASK, the original KeyboardInterrupt path runs.
|
||||
|
||||
This is the contrast case proving the fix is gated on the env var: in
|
||||
interactive ``hermes chat -q`` (no env var), behavior is unchanged. The
|
||||
process MAY hang under non-daemon threads, but that's not a kanban-worker
|
||||
concern. We just verify the handler logs the KeyboardInterrupt branch
|
||||
rather than os._exit'ing.
|
||||
"""
|
||||
proc = _spawn_synthetic({})
|
||||
try:
|
||||
os.kill(proc.pid, signal.SIGTERM)
|
||||
# Wait a moment for the handler to react.
|
||||
time.sleep(0.5)
|
||||
# The process may or may not be dead depending on whether the
|
||||
# KeyboardInterrupt unwinds cleanly. The behavioral guarantee is
|
||||
# only that the env-gated path didn't fire.
|
||||
try:
|
||||
# Drain stdout up to whatever's available.
|
||||
if proc.stdout is not None:
|
||||
proc.stdout.close()
|
||||
if proc.stderr is not None:
|
||||
proc.stderr.close()
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
_cleanup(proc)
|
||||
|
||||
|
||||
def test_real_handler_uses_os_exit_for_kanban_workers():
|
||||
"""Source-level invariant: cli.py's _signal_handler_q must call
|
||||
os._exit(0) when HERMES_KANBAN_TASK is set.
|
||||
|
||||
Catches the case where someone refactors the handler and accidentally
|
||||
drops the env-gated exit, restoring the bug. Reading cli.py directly is
|
||||
cheap and avoids the heavy CLI import.
|
||||
"""
|
||||
import pathlib
|
||||
|
||||
cli_path = (
|
||||
pathlib.Path(__file__).resolve().parent.parent.parent / "cli.py"
|
||||
)
|
||||
src = cli_path.read_text()
|
||||
# Locate the handler body.
|
||||
start = src.find("def _signal_handler_q(signum, frame):")
|
||||
assert start != -1, "cli.py is missing _signal_handler_q"
|
||||
# Look ahead for the env-gated os._exit call within ~80 lines.
|
||||
body = src[start : start + 4000]
|
||||
assert "HERMES_KANBAN_TASK" in body, (
|
||||
"_signal_handler_q must gate its kanban-worker exit path on "
|
||||
"HERMES_KANBAN_TASK — see #28181"
|
||||
)
|
||||
assert "os._exit(0)" in body, (
|
||||
"_signal_handler_q must call os._exit(0) for kanban workers — "
|
||||
"raising KeyboardInterrupt orphans the process when non-daemon "
|
||||
"threads are alive (see #28181)"
|
||||
)
|
||||
Loading…
Add table
Add a link
Reference in a new issue