mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(interrupt): propagate to concurrent-tool workers + opt-in debug trace (#11907)
* fix(interrupt): propagate to concurrent-tool workers + opt-in debug trace
interrupt() previously only flagged the agent's _execution_thread_id.
Tools running inside _execute_tool_calls_concurrent execute on
ThreadPoolExecutor worker threads whose tids are distinct from the
agent's, so is_interrupted() inside those tools returned False no matter
how many times the gateway called .interrupt() — hung ssh / curl / long
make-builds ran to their own timeout.
Changes:
- run_agent.py: track concurrent-tool worker tids in a per-agent set,
fan interrupt()/clear_interrupt() out to them, and handle the
register-after-interrupt race at _run_tool entry. getattr fallback
for the tracker so test stubs built via object.__new__ keep working.
- tools/environments/base.py: opt-in _wait_for_process trace (ENTER,
per-30s HEARTBEAT with interrupt+activity-cb state, INTERRUPT
DETECTED, TIMEOUT, EXIT) behind HERMES_DEBUG_INTERRUPT=1.
- tools/interrupt.py: opt-in set_interrupt() trace (caller tid, target
tid, set snapshot) behind the same env flag.
- tests: new regression test runs a polling tool on a concurrent worker
and asserts is_interrupted() flips to True within ~1s of interrupt().
Second new test guards clear_interrupt() clearing tracked worker bits.
Validation: tests/run_agent/ all 762 pass; tests/tools/ interrupt+env
subset 216 pass.
* fix(interrupt-debug): bypass quiet_mode logger filter so trace reaches agent.log
AIAgent.__init__ sets logging.getLogger('tools').setLevel(ERROR) when
quiet_mode=True (the CLI default). This would silently swallow every
INFO-level trace line from the HERMES_DEBUG_INTERRUPT=1 instrumentation
added in the parent commit — confirmed by running hermes chat -q with
the flag and finding zero trace lines in agent.log even though
_wait_for_process was clearly executing (subprocess pid existed).
Fix: when HERMES_DEBUG_INTERRUPT=1, each traced module explicitly sets
its own logger level to INFO at import time, overriding the 'tools'
parent-level filter. Scoped to the opt-in case only, so production
(quiet_mode default) logs stay quiet as designed.
Validation: hermes chat -q with HERMES_DEBUG_INTERRUPT=1 now writes
'_wait_for_process ENTER/EXIT' lines to agent.log as expected.
* fix(cli): SIGTERM/SIGHUP no longer orphans tool subprocesses
Tool subprocesses spawned by the local environment backend use
os.setsid so they run in their own process group. Before this fix,
SIGTERM/SIGHUP to the hermes CLI killed the main thread via
KeyboardInterrupt but the worker thread running _wait_for_process
never got a chance to call _kill_process — Python exited, the child
was reparented to init (PPID=1), and the subprocess ran to its
natural end (confirmed live: sleep 300 survived 4+ min after SIGTERM
to the agent until manual cleanup).
Changes:
- cli.py _signal_handler (interactive) + _signal_handler_q (-q mode):
route SIGTERM/SIGHUP through agent.interrupt() so the worker's poll
loop sees the per-thread interrupt flag and calls _kill_process
(os.killpg) on the subprocess group. HERMES_SIGTERM_GRACE (default
1.5s) gives the worker time to complete its SIGTERM+SIGKILL
escalation before KeyboardInterrupt unwinds main.
- tools/environments/base.py _wait_for_process: wrap the poll loop in
try/except (KeyboardInterrupt, SystemExit) so the cleanup fires
even on paths the signal handlers don't cover (direct sys.exit,
unhandled KI from nested code, etc.). Emits EXCEPTION_EXIT trace
line when HERMES_DEBUG_INTERRUPT=1.
- New regression test: injects KeyboardInterrupt into a running
_wait_for_process via PyThreadState_SetAsyncExc, verifies the
subprocess process group is dead within 3s of the exception and
that KeyboardInterrupt re-raises cleanly afterward.
Validation:
| Before | After |
|---------------------------------------------------------|--------------------|
| sleep 300 survives 4+ min as PPID=1 orphan after SIGTERM | dies within 2 s |
| No INTERRUPT DETECTED in trace | INTERRUPT DETECTED fires + killing process group |
| tests/tools/test_local_interrupt_cleanup | 1/1 pass |
| tests/run_agent/test_concurrent_interrupt | 4/4 pass |
This commit is contained in:
parent
607be54a24
commit
20f2258f34
6 changed files with 551 additions and 22 deletions
|
|
@@ -23,6 +23,19 @@ from tools.interrupt import is_interrupted
|
|||
|
||||
logger = logging.getLogger(__name__)

# Opt-in debug tracing for the interrupt/activity/poll machinery. Set
# HERMES_DEBUG_INTERRUPT=1 to log loop entry/exit, periodic heartbeats, and
# every is_interrupted() state change from _wait_for_process. Off by default
# to avoid flooding production gateway logs.
#
# NOTE: a plain truthiness check (bool(os.getenv(...))) would treat
# HERMES_DEBUG_INTERRUPT=0 as "enabled". Parse the common falsy spellings
# so 0 / false / no (any case) disable the trace, matching the documented
# "=1 to enable" convention; unset or empty stays off as before.
_DEBUG_INTERRUPT = os.getenv("HERMES_DEBUG_INTERRUPT", "").strip().lower() not in (
    "",
    "0",
    "false",
    "no",
)

if _DEBUG_INTERRUPT:
    # AIAgent's quiet_mode path (run_agent.py) forces the `tools` logger to
    # ERROR on CLI startup, which would silently swallow every trace we emit.
    # Force this module's own logger back to INFO so the trace is visible in
    # agent.log regardless of quiet-mode. Scoped to the opt-in case only.
    logger.setLevel(logging.INFO)

# Thread-local activity callback. The agent sets this before a tool call so
# long-running _wait_for_process loops can report liveness to the gateway.
_activity_callback_local = threading.local()
|
||||
|
|
@@ -413,6 +426,13 @@ class BaseEnvironment(ABC):
|
|||
Fires the ``activity_callback`` (if set on this instance) every 10s
|
||||
while the process is running so the gateway's inactivity timeout
|
||||
doesn't kill long-running commands.
|
||||
|
||||
Also wraps the poll loop in a ``try/finally`` that guarantees we
|
||||
call ``self._kill_process(proc)`` if we exit via ``KeyboardInterrupt``
|
||||
or ``SystemExit``. Without this, the local backend (which spawns
|
||||
subprocesses with ``os.setsid`` into their own process group) leaves
|
||||
an orphan with ``PPID=1`` when python is shut down mid-tool — the
|
||||
``sleep 300``-survives-30-min bug Physikal and I both hit.
|
||||
"""
|
||||
output_chunks: list[str] = []
|
||||
|
||||
|
|
@@ -437,28 +457,101 @@ class BaseEnvironment(ABC):
|
|||
"start": _now,
|
||||
}
|
||||
|
||||
while proc.poll() is None:
|
||||
if is_interrupted():
|
||||
# --- Debug tracing (opt-in via HERMES_DEBUG_INTERRUPT=1) -------------
|
||||
# Captures loop entry/exit, interrupt state changes, and periodic
|
||||
# heartbeats so we can diagnose "agent never sees the interrupt"
|
||||
# reports without reproducing locally.
|
||||
_tid = threading.current_thread().ident
|
||||
_pid = getattr(proc, "pid", None)
|
||||
_iter_count = 0
|
||||
_last_heartbeat = _now
|
||||
_last_interrupt_state = False
|
||||
_cb_was_none = _get_activity_callback() is None
|
||||
if _DEBUG_INTERRUPT:
|
||||
logger.info(
|
||||
"[interrupt-debug] _wait_for_process ENTER tid=%s pid=%s "
|
||||
"timeout=%ss activity_cb=%s initial_interrupt=%s",
|
||||
_tid, _pid, timeout,
|
||||
"set" if not _cb_was_none else "MISSING",
|
||||
is_interrupted(),
|
||||
)
|
||||
|
||||
try:
|
||||
while proc.poll() is None:
|
||||
_iter_count += 1
|
||||
if is_interrupted():
|
||||
if _DEBUG_INTERRUPT:
|
||||
logger.info(
|
||||
"[interrupt-debug] _wait_for_process INTERRUPT DETECTED "
|
||||
"tid=%s pid=%s iter=%d elapsed=%.1fs — killing process group",
|
||||
_tid, _pid, _iter_count, time.monotonic() - _activity_state["start"],
|
||||
)
|
||||
self._kill_process(proc)
|
||||
drain_thread.join(timeout=2)
|
||||
return {
|
||||
"output": "".join(output_chunks) + "\n[Command interrupted]",
|
||||
"returncode": 130,
|
||||
}
|
||||
if time.monotonic() > deadline:
|
||||
if _DEBUG_INTERRUPT:
|
||||
logger.info(
|
||||
"[interrupt-debug] _wait_for_process TIMEOUT "
|
||||
"tid=%s pid=%s iter=%d timeout=%ss",
|
||||
_tid, _pid, _iter_count, timeout,
|
||||
)
|
||||
self._kill_process(proc)
|
||||
drain_thread.join(timeout=2)
|
||||
partial = "".join(output_chunks)
|
||||
timeout_msg = f"\n[Command timed out after {timeout}s]"
|
||||
return {
|
||||
"output": partial + timeout_msg
|
||||
if partial
|
||||
else timeout_msg.lstrip(),
|
||||
"returncode": 124,
|
||||
}
|
||||
# Periodic activity touch so the gateway knows we're alive
|
||||
touch_activity_if_due(_activity_state, "terminal command running")
|
||||
|
||||
# Heartbeat every ~30s: proves the loop is alive and reports
|
||||
# the activity-callback state (thread-local, can get clobbered
|
||||
# by nested tool calls or executor thread reuse).
|
||||
if _DEBUG_INTERRUPT and time.monotonic() - _last_heartbeat >= 30.0:
|
||||
_cb_now_none = _get_activity_callback() is None
|
||||
logger.info(
|
||||
"[interrupt-debug] _wait_for_process HEARTBEAT "
|
||||
"tid=%s pid=%s iter=%d elapsed=%.0fs "
|
||||
"interrupt=%s activity_cb=%s%s",
|
||||
_tid, _pid, _iter_count,
|
||||
time.monotonic() - _activity_state["start"],
|
||||
is_interrupted(),
|
||||
"set" if not _cb_now_none else "MISSING",
|
||||
" (LOST during run)" if _cb_now_none and not _cb_was_none else "",
|
||||
)
|
||||
_last_heartbeat = time.monotonic()
|
||||
_cb_was_none = _cb_now_none
|
||||
|
||||
time.sleep(0.2)
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
# Signal arrived (SIGTERM/SIGHUP/SIGINT) or sys.exit() was called
|
||||
# while we were polling. The local backend spawns subprocesses
|
||||
# with os.setsid, which puts them in their own process group — so
|
||||
# if we let the interrupt propagate without killing the child,
|
||||
# python exits and the child is reparented to init (PPID=1) and
|
||||
# keeps running as an orphan. Killing the process group here
|
||||
# guarantees the tool's side effects stop when the agent stops.
|
||||
if _DEBUG_INTERRUPT:
|
||||
logger.info(
|
||||
"[interrupt-debug] _wait_for_process EXCEPTION_EXIT "
|
||||
"tid=%s pid=%s iter=%d elapsed=%.1fs — killing subprocess group before re-raise",
|
||||
_tid, _pid, _iter_count,
|
||||
time.monotonic() - _activity_state["start"],
|
||||
)
|
||||
try:
|
||||
self._kill_process(proc)
|
||||
drain_thread.join(timeout=2)
|
||||
return {
|
||||
"output": "".join(output_chunks) + "\n[Command interrupted]",
|
||||
"returncode": 130,
|
||||
}
|
||||
if time.monotonic() > deadline:
|
||||
self._kill_process(proc)
|
||||
drain_thread.join(timeout=2)
|
||||
partial = "".join(output_chunks)
|
||||
timeout_msg = f"\n[Command timed out after {timeout}s]"
|
||||
return {
|
||||
"output": partial + timeout_msg
|
||||
if partial
|
||||
else timeout_msg.lstrip(),
|
||||
"returncode": 124,
|
||||
}
|
||||
# Periodic activity touch so the gateway knows we're alive
|
||||
touch_activity_if_due(_activity_state, "terminal command running")
|
||||
time.sleep(0.2)
|
||||
except Exception:
|
||||
pass # cleanup is best-effort
|
||||
raise
|
||||
|
||||
drain_thread.join(timeout=5)
|
||||
|
||||
|
|
@@ -467,6 +560,15 @@ class BaseEnvironment(ABC):
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
if _DEBUG_INTERRUPT:
|
||||
logger.info(
|
||||
"[interrupt-debug] _wait_for_process EXIT (natural) "
|
||||
"tid=%s pid=%s iter=%d elapsed=%.1fs returncode=%s",
|
||||
_tid, _pid, _iter_count,
|
||||
time.monotonic() - _activity_state["start"],
|
||||
proc.returncode,
|
||||
)
|
||||
|
||||
return {"output": "".join(output_chunks), "returncode": proc.returncode}
|
||||
|
||||
def _kill_process(self, proc: ProcessHandle):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue