mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
* fix(interrupt): propagate to concurrent-tool workers + opt-in debug trace
interrupt() previously only flagged the agent's _execution_thread_id.
Tools running inside _execute_tool_calls_concurrent execute on
ThreadPoolExecutor worker threads whose tids are distinct from the
agent's, so is_interrupted() inside those tools returned False no matter
how many times the gateway called .interrupt() — hung ssh / curl / long
make-builds ran to their own timeout.
Changes:
- run_agent.py: track concurrent-tool worker tids in a per-agent set,
fan interrupt()/clear_interrupt() out to them, and handle the
register-after-interrupt race at _run_tool entry. getattr fallback
for the tracker so test stubs built via object.__new__ keep working.
- tools/environments/base.py: opt-in _wait_for_process trace (ENTER,
per-30s HEARTBEAT with interrupt+activity-cb state, INTERRUPT
DETECTED, TIMEOUT, EXIT) behind HERMES_DEBUG_INTERRUPT=1.
- tools/interrupt.py: opt-in set_interrupt() trace (caller tid, target
tid, set snapshot) behind the same env flag.
- tests: new regression test runs a polling tool on a concurrent worker
and asserts is_interrupted() flips to True within ~1s of interrupt().
Second new test guards clear_interrupt() clearing tracked worker bits.
Validation: tests/run_agent/ all 762 pass; tests/tools/ interrupt+env
subset 216 pass.
* fix(interrupt-debug): bypass quiet_mode logger filter so trace reaches agent.log
AIAgent.__init__ sets logging.getLogger('tools').setLevel(ERROR) when
quiet_mode=True (the CLI default). This would silently swallow every
INFO-level trace line from the HERMES_DEBUG_INTERRUPT=1 instrumentation
added in the parent commit — confirmed by running hermes chat -q with
the flag and finding zero trace lines in agent.log even though
_wait_for_process was clearly executing (subprocess pid existed).
Fix: when HERMES_DEBUG_INTERRUPT=1, each traced module explicitly sets
its own logger level to INFO at import time, overriding the 'tools'
parent-level filter. Scoped to the opt-in case only, so production
(quiet_mode default) logs stay quiet as designed.
Validation: hermes chat -q with HERMES_DEBUG_INTERRUPT=1 now writes
'_wait_for_process ENTER/EXIT' lines to agent.log as expected.
* fix(cli): SIGTERM/SIGHUP no longer orphans tool subprocesses
Tool subprocesses spawned by the local environment backend use
os.setsid so they run in their own process group. Before this fix,
SIGTERM/SIGHUP to the hermes CLI killed the main thread via
KeyboardInterrupt but the worker thread running _wait_for_process
never got a chance to call _kill_process — Python exited, the child
was reparented to init (PPID=1), and the subprocess ran to its
natural end (confirmed live: sleep 300 survived 4+ min after SIGTERM
to the agent until manual cleanup).
Changes:
- cli.py _signal_handler (interactive) + _signal_handler_q (-q mode):
route SIGTERM/SIGHUP through agent.interrupt() so the worker's poll
loop sees the per-thread interrupt flag and calls _kill_process
(os.killpg) on the subprocess group. HERMES_SIGTERM_GRACE (default
1.5s) gives the worker time to complete its SIGTERM+SIGKILL
escalation before KeyboardInterrupt unwinds main.
- tools/environments/base.py _wait_for_process: wrap the poll loop in
try/except (KeyboardInterrupt, SystemExit) so the cleanup fires
even on paths the signal handlers don't cover (direct sys.exit,
unhandled KI from nested code, etc.). Emits EXCEPTION_EXIT trace
line when HERMES_DEBUG_INTERRUPT=1.
- New regression test: injects KeyboardInterrupt into a running
_wait_for_process via PyThreadState_SetAsyncExc, verifies the
subprocess process group is dead within 3s of the exception and
that KeyboardInterrupt re-raises cleanly afterward.
Validation:
| Before | After |
|---------------------------------------------------------|--------------------|
| sleep 300 survives 4+ min as PPID=1 orphan after SIGTERM | dies within 2 s |
| No INTERRUPT DETECTED in trace | INTERRUPT DETECTED fires + killing process group |
| tests/tools/test_local_interrupt_cleanup | 1/1 pass |
| tests/run_agent/test_concurrent_interrupt | 4/4 pass |
145 lines
6.1 KiB
Python
145 lines
6.1 KiB
Python
"""Regression tests for _wait_for_process subprocess cleanup on exception exit.
|
|
|
|
When the poll loop exits via KeyboardInterrupt or SystemExit (SIGTERM via
|
|
cli.py signal handler, SIGINT on the main thread in non-interactive -q mode,
|
|
or explicit sys.exit from some caller), the child subprocess must be killed
|
|
before the exception propagates — otherwise the local backend's use of
|
|
os.setsid leaves an orphan with PPID=1.
|
|
|
|
The live repro that motivated this: hermes chat -q ... 'sleep 300', SIGTERM
|
|
to the python process, sleep 300 survived with PPID=1 for the full 300 s
|
|
because _wait_for_process never got to call _kill_process before python
|
|
died. See commit message for full context.
|
|
"""
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
|
|
import pytest
|
|
|
|
from tools.environments.local import LocalEnvironment
|
|
|
|
|
|
@pytest.fixture(autouse=True)
|
|
def _isolate_hermes_home(tmp_path, monkeypatch):
|
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
|
(tmp_path / "logs").mkdir(exist_ok=True)
|
|
|
|
|
|
def _pgid_still_alive(pgid: int) -> bool:
|
|
"""Return True if any process in the given process group is still alive."""
|
|
try:
|
|
os.killpg(pgid, 0) # signal 0 = existence check
|
|
return True
|
|
except ProcessLookupError:
|
|
return False
|
|
|
|
|
|
def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
|
|
"""When KeyboardInterrupt arrives mid-poll, the subprocess group must be
|
|
killed before the exception is re-raised."""
|
|
env = LocalEnvironment(cwd="/tmp")
|
|
try:
|
|
result_holder = {}
|
|
proc_holder = {}
|
|
started = threading.Event()
|
|
raise_at = [None] # set by the main thread to tell worker when
|
|
|
|
# Drive execute() on a separate thread so we can SIGNAL-interrupt it
|
|
# via a thread-targeted exception without killing our test process.
|
|
def worker():
|
|
# Spawn a subprocess that will definitely be alive long enough
|
|
# to observe the cleanup, via env.execute(...) — the normal path
|
|
# that goes through _wait_for_process.
|
|
try:
|
|
result_holder["result"] = env.execute("sleep 30", timeout=60)
|
|
except BaseException as e: # noqa: BLE001 — we want to observe it
|
|
result_holder["exception"] = type(e).__name__
|
|
|
|
t = threading.Thread(target=worker, daemon=True)
|
|
t.start()
|
|
# Wait until the subprocess actually exists. LocalEnvironment.execute
|
|
# does init_session() (one spawn) before the real command, so we need
|
|
# to wait until a sleep 30 is visible. Use pgrep-style lookup via
|
|
# /proc to find the bash process running our sleep.
|
|
deadline = time.monotonic() + 5.0
|
|
target_pid = None
|
|
while time.monotonic() < deadline:
|
|
# Walk our children and grand-children to find one running 'sleep 30'
|
|
try:
|
|
import psutil # optional — fall back if absent
|
|
for p in psutil.Process(os.getpid()).children(recursive=True):
|
|
try:
|
|
if "sleep 30" in " ".join(p.cmdline()):
|
|
target_pid = p.pid
|
|
break
|
|
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
continue
|
|
except ImportError:
|
|
# Fall back to ps
|
|
ps = subprocess.run(
|
|
["ps", "-eo", "pid,ppid,pgid,cmd"], capture_output=True, text=True,
|
|
)
|
|
for line in ps.stdout.splitlines():
|
|
if "sleep 30" in line and "grep" not in line:
|
|
parts = line.split()
|
|
if parts and parts[0].isdigit():
|
|
target_pid = int(parts[0])
|
|
break
|
|
if target_pid:
|
|
break
|
|
time.sleep(0.1)
|
|
|
|
assert target_pid is not None, (
|
|
"test setup: couldn't find 'sleep 30' subprocess after 5 s"
|
|
)
|
|
pgid = os.getpgid(target_pid)
|
|
assert _pgid_still_alive(pgid), "sanity: subprocess should be alive"
|
|
|
|
# Now inject a KeyboardInterrupt into the worker thread the same
|
|
# way CPython's signal machinery would. We use ctypes.PyThreadState_SetAsyncExc
|
|
# which is how signal delivery to non-main threads is simulated.
|
|
import ctypes
|
|
import sys as _sys
|
|
# py-thread-state exception targets need the ident, not the Thread
|
|
tid = t.ident
|
|
assert tid is not None
|
|
# Fire KeyboardInterrupt into the worker thread
|
|
ret = ctypes.pythonapi.PyThreadState_SetAsyncExc(
|
|
ctypes.c_ulong(tid), ctypes.py_object(KeyboardInterrupt),
|
|
)
|
|
assert ret == 1, f"SetAsyncExc returned {ret}, expected 1"
|
|
|
|
# Give the worker a moment to: hit the exception at the next poll,
|
|
# run the except-block cleanup (_kill_process), and exit.
|
|
t.join(timeout=5.0)
|
|
assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"
|
|
|
|
# The critical assertion: the subprocess GROUP must be dead. Not
|
|
# just the bash wrapper — the 'sleep 30' child too.
|
|
# Give the SIGTERM+1s wait+SIGKILL escalation a moment to complete.
|
|
deadline = time.monotonic() + 3.0
|
|
while time.monotonic() < deadline:
|
|
if not _pgid_still_alive(pgid):
|
|
break
|
|
time.sleep(0.1)
|
|
assert not _pgid_still_alive(pgid), (
|
|
f"subprocess group {pgid} is STILL ALIVE after worker received "
|
|
f"KeyboardInterrupt — orphan bug regressed. This is the "
|
|
f"sleep-300-survives-SIGTERM scenario from Physikal's Apr 2026 "
|
|
f"report. See tools/environments/base.py _wait_for_process "
|
|
f"except-block."
|
|
)
|
|
# And the worker should have observed the KeyboardInterrupt (i.e.
|
|
# it re-raised cleanly, not silently swallowed).
|
|
assert result_holder.get("exception") == "KeyboardInterrupt", (
|
|
f"worker result: {result_holder!r} — expected KeyboardInterrupt "
|
|
f"propagation after cleanup"
|
|
)
|
|
finally:
|
|
try:
|
|
env.cleanup()
|
|
except Exception:
|
|
pass
|