mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(interrupt): propagate to concurrent-tool workers + opt-in debug trace (#11907)
* fix(interrupt): propagate to concurrent-tool workers + opt-in debug trace
interrupt() previously only flagged the agent's _execution_thread_id.
Tools running inside _execute_tool_calls_concurrent execute on
ThreadPoolExecutor worker threads whose tids are distinct from the
agent's, so is_interrupted() inside those tools returned False no matter
how many times the gateway called .interrupt() — hung ssh, curl, or long-running
make builds kept running until their own timeouts expired.
Changes:
- run_agent.py: track concurrent-tool worker tids in a per-agent set,
fan interrupt()/clear_interrupt() out to them, and handle the
register-after-interrupt race at _run_tool entry. Uses a getattr fallback
for the tracker so test stubs built via object.__new__ keep working.
- tools/environments/base.py: opt-in _wait_for_process trace (ENTER,
per-30s HEARTBEAT with interrupt+activity-cb state, INTERRUPT
DETECTED, TIMEOUT, EXIT) behind HERMES_DEBUG_INTERRUPT=1.
- tools/interrupt.py: opt-in set_interrupt() trace (caller tid, target
tid, set snapshot) behind the same env flag.
- tests: new regression test runs a polling tool on a concurrent worker
and asserts is_interrupted() flips to True within ~1s of interrupt().
Second new test guards clear_interrupt() clearing tracked worker bits.
Validation: tests/run_agent/ all 762 pass; tests/tools/ interrupt+env
subset 216 pass.
* fix(interrupt-debug): bypass quiet_mode logger filter so trace reaches agent.log
AIAgent.__init__ sets logging.getLogger('tools').setLevel(ERROR) when
quiet_mode=True (the CLI default). This would silently swallow every
INFO-level trace line from the HERMES_DEBUG_INTERRUPT=1 instrumentation
added in the parent commit — confirmed by running hermes chat -q with
the flag and finding zero trace lines in agent.log even though
_wait_for_process was clearly executing (subprocess pid existed).
Fix: when HERMES_DEBUG_INTERRUPT=1, each traced module explicitly sets
its own logger level to INFO at import time, overriding the 'tools'
parent-level filter. Scoped to the opt-in case only, so production
(quiet_mode default) logs stay quiet as designed.
Validation: hermes chat -q with HERMES_DEBUG_INTERRUPT=1 now writes
'_wait_for_process ENTER/EXIT' lines to agent.log as expected.
* fix(cli): SIGTERM/SIGHUP no longer orphans tool subprocesses
Tool subprocesses spawned by the local environment backend use
os.setsid so they run in their own process group. Before this fix,
SIGTERM/SIGHUP to the hermes CLI killed the main thread via
KeyboardInterrupt but the worker thread running _wait_for_process
never got a chance to call _kill_process — Python exited, the child
was reparented to init (PPID=1), and the subprocess ran to its
natural end (confirmed live: sleep 300 survived 4+ min after SIGTERM
to the agent until manual cleanup).
Changes:
- cli.py _signal_handler (interactive) + _signal_handler_q (-q mode):
route SIGTERM/SIGHUP through agent.interrupt() so the worker's poll
loop sees the per-thread interrupt flag and calls _kill_process
(os.killpg) on the subprocess group. HERMES_SIGTERM_GRACE (default
1.5s) gives the worker time to complete its SIGTERM+SIGKILL
escalation before KeyboardInterrupt unwinds main.
- tools/environments/base.py _wait_for_process: wrap the poll loop in
try/except (KeyboardInterrupt, SystemExit) so the cleanup fires
even on paths the signal handlers don't cover (direct sys.exit,
unhandled KI from nested code, etc.). Emits EXCEPTION_EXIT trace
line when HERMES_DEBUG_INTERRUPT=1.
- New regression test: injects KeyboardInterrupt into a running
_wait_for_process via PyThreadState_SetAsyncExc, verifies the
subprocess process group is dead within 3s of the exception and
that KeyboardInterrupt re-raises cleanly afterward.
Validation:
| Before | After |
|---------------------------------------------------------|--------------------|
| sleep 300 survives 4+ min as PPID=1 orphan after SIGTERM | dies within 2 s |
| No INTERRUPT DETECTED in trace | INTERRUPT DETECTED fires + killing process group |
| tests/tools/test_local_interrupt_cleanup | 1/1 pass |
| tests/run_agent/test_concurrent_interrupt | 4/4 pass |
This commit is contained in:
parent
607be54a24
commit
20f2258f34
6 changed files with 551 additions and 22 deletions
72
run_agent.py
72
run_agent.py
|
|
@@ -831,6 +831,16 @@ class AIAgent:
|
|||
self._execution_thread_id: int | None = None # Set at run_conversation() start
|
||||
self._interrupt_thread_signal_pending = False
|
||||
self._client_lock = threading.RLock()
|
||||
|
||||
# Concurrent-tool worker thread tracking. `_execute_tool_calls_concurrent`
|
||||
# runs each tool on its own ThreadPoolExecutor worker — those worker
|
||||
# threads have tids distinct from `_execution_thread_id`, so
|
||||
# `_set_interrupt(True, _execution_thread_id)` alone does NOT cause
|
||||
# `is_interrupted()` inside the worker to return True. Track the
|
||||
# workers here so `interrupt()` / `clear_interrupt()` can fan out to
|
||||
# their tids explicitly.
|
||||
self._tool_worker_threads: set[int] = set()
|
||||
self._tool_worker_threads_lock = threading.Lock()
|
||||
|
||||
# Subagent delegation state
|
||||
self._delegate_depth = 0 # 0 = top-level agent, incremented for children
|
||||
|
|
@@ -3191,6 +3201,25 @@ class AIAgent:
|
|||
# interrupt signal until startup completes instead of targeting
|
||||
# the caller thread by mistake.
|
||||
self._interrupt_thread_signal_pending = True
|
||||
# Fan out to concurrent-tool worker threads. Those workers run tools
|
||||
# on their own tids (ThreadPoolExecutor workers), so `is_interrupted()`
|
||||
# inside a tool only sees an interrupt when their specific tid is in
|
||||
# the `_interrupted_threads` set. Without this propagation, an
|
||||
# already-running concurrent tool (e.g. a terminal command hung on
|
||||
# network I/O) never notices the interrupt and has to run to its own
|
||||
# timeout. See `_run_tool` for the matching entry/exit bookkeeping.
|
||||
# `getattr` fallback covers test stubs that build AIAgent via
|
||||
# object.__new__ and skip __init__.
|
||||
_tracker = getattr(self, "_tool_worker_threads", None)
|
||||
_tracker_lock = getattr(self, "_tool_worker_threads_lock", None)
|
||||
if _tracker is not None and _tracker_lock is not None:
|
||||
with _tracker_lock:
|
||||
_worker_tids = list(_tracker)
|
||||
for _wtid in _worker_tids:
|
||||
try:
|
||||
_set_interrupt(True, _wtid)
|
||||
except Exception:
|
||||
pass
|
||||
# Propagate interrupt to any running child agents (subagent delegation)
|
||||
with self._active_children_lock:
|
||||
children_copy = list(self._active_children)
|
||||
|
|
@@ -3209,6 +3238,23 @@ class AIAgent:
|
|||
self._interrupt_thread_signal_pending = False
|
||||
if self._execution_thread_id is not None:
|
||||
_set_interrupt(False, self._execution_thread_id)
|
||||
# Also clear any concurrent-tool worker thread bits. Tracked
|
||||
# workers normally clear their own bit on exit, but an explicit
|
||||
# clear here guarantees no stale interrupt can survive a turn
|
||||
# boundary and fire on a subsequent, unrelated tool call that
|
||||
# happens to get scheduled onto the same recycled worker tid.
|
||||
# `getattr` fallback covers test stubs that build AIAgent via
|
||||
# object.__new__ and skip __init__.
|
||||
_tracker = getattr(self, "_tool_worker_threads", None)
|
||||
_tracker_lock = getattr(self, "_tool_worker_threads_lock", None)
|
||||
if _tracker is not None and _tracker_lock is not None:
|
||||
with _tracker_lock:
|
||||
_worker_tids = list(_tracker)
|
||||
for _wtid in _worker_tids:
|
||||
try:
|
||||
_set_interrupt(False, _wtid)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _touch_activity(self, desc: str) -> None:
|
||||
"""Update the last-activity timestamp and description (thread-safe)."""
|
||||
|
|
@@ -7653,6 +7699,22 @@ class AIAgent:
|
|||
|
||||
def _run_tool(index, tool_call, function_name, function_args):
|
||||
"""Worker function executed in a thread."""
|
||||
# Register this worker tid so the agent can fan out an interrupt
|
||||
# to it — see AIAgent.interrupt(). Must happen first thing, and
|
||||
# must be paired with discard + clear in the finally block.
|
||||
_worker_tid = threading.current_thread().ident
|
||||
with self._tool_worker_threads_lock:
|
||||
self._tool_worker_threads.add(_worker_tid)
|
||||
# Race: if the agent was interrupted between fan-out (which
|
||||
# snapshotted an empty/earlier set) and our registration, apply
|
||||
# the interrupt to our own tid now so is_interrupted() inside
|
||||
# the tool returns True on the next poll.
|
||||
if self._interrupt_requested:
|
||||
try:
|
||||
from tools.interrupt import set_interrupt as _sif
|
||||
_sif(True, _worker_tid)
|
||||
except Exception:
|
||||
pass
|
||||
# Set the activity callback on THIS worker thread so
|
||||
# _wait_for_process (terminal commands) can fire heartbeats.
|
||||
# The callback is thread-local; the main thread's callback
|
||||
|
|
@@ -7675,6 +7737,16 @@ class AIAgent:
|
|||
else:
|
||||
logger.info("tool %s completed (%.2fs, %d chars)", function_name, duration, len(result))
|
||||
results[index] = (function_name, function_args, result, duration, is_error)
|
||||
# Tear down worker-tid tracking. Clear any interrupt bit we may
|
||||
# have set so the next task scheduled onto this recycled tid
|
||||
# starts with a clean slate.
|
||||
with self._tool_worker_threads_lock:
|
||||
self._tool_worker_threads.discard(_worker_tid)
|
||||
try:
|
||||
from tools.interrupt import set_interrupt as _sif
|
||||
_sif(False, _worker_tid)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Start spinner for CLI mode (skip when TUI handles tool progress)
|
||||
spinner = None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue