mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 02:11:48 +00:00
fix(approval): heartbeat activity during gateway approval wait (#11245)
The blocking gateway approval wait at tools/approval.py called `entry.event.wait(timeout=...)` which never touched the agent's activity tracker. When a user was slow to respond to a /approve prompt (or the approval wait's own gateway_timeout setting was raised above its default of 300s), the agent thread sat silent long enough for the gateway's inactivity watchdog (agent.gateway_timeout, a separate setting, default 1800s) to kill it — even though the agent was doing exactly the right thing and the user was the one causing the delay. The fix polls the event in 1s slices and calls touch_activity_if_due between slices, mirroring the _wait_for_process() pattern in tools/environments/base.py that covers the subprocess-waiting side of the same problem. At the default 10s heartbeat cadence, a 300s approval wait now pings activity ~30 times, well under the 1800s idle threshold. Observed in community user logs: 12 repeated 'Agent idle 1800s, last_activity=executing tool: terminal' events across April 12-14. Companion to PR #10501 which covered streaming / concurrent-tool / Modal-backend gaps but did not touch approval.py. Test: tests/tools/test_approval_heartbeat.py — verifies (1) heartbeats fire during the wait, (2) user responses are still near-instant, and (3) the approval path stays functional when the heartbeat helper can't be imported.
This commit is contained in:
parent
f6179c5d5f
commit
387aa9afc9
2 changed files with 233 additions and 2 deletions
|
|
@ -14,6 +14,7 @@ import os
|
|||
import re
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import unicodedata
|
||||
from typing import Optional
|
||||
|
||||
|
|
@ -834,13 +835,43 @@ def check_all_command_guards(command: str, env_type: str,
|
|||
"description": combined_desc,
|
||||
}
|
||||
|
||||
# Block until the user responds or timeout (default 5 min)
|
||||
# Block until the user responds or timeout (default 5 min).
|
||||
# Poll in short slices so we can fire activity heartbeats every
|
||||
# ~10s to the agent's inactivity tracker. Without this, the
|
||||
# blocking event.wait() never touches activity, and the
|
||||
# gateway's inactivity watchdog (agent.gateway_timeout, default
|
||||
# 1800s) kills the agent while the user is still responding to
|
||||
# the approval prompt. Mirrors the _wait_for_process() cadence
|
||||
# in tools/environments/base.py.
|
||||
timeout = _get_approval_config().get("gateway_timeout", 300)
|
||||
try:
|
||||
timeout = int(timeout)
|
||||
except (ValueError, TypeError):
|
||||
timeout = 300
|
||||
resolved = entry.event.wait(timeout=timeout)
|
||||
|
||||
try:
|
||||
from tools.environments.base import touch_activity_if_due
|
||||
except Exception: # pragma: no cover
|
||||
touch_activity_if_due = None
|
||||
|
||||
_now = time.monotonic()
|
||||
_deadline = _now + max(timeout, 0)
|
||||
_activity_state = {"last_touch": _now, "start": _now}
|
||||
resolved = False
|
||||
while True:
|
||||
_remaining = _deadline - time.monotonic()
|
||||
if _remaining <= 0:
|
||||
break
|
||||
# 1s poll slice — the event is set immediately when the
|
||||
# user responds, so slice length only controls heartbeat
|
||||
# cadence, not user-visible responsiveness.
|
||||
if entry.event.wait(timeout=min(1.0, _remaining)):
|
||||
resolved = True
|
||||
break
|
||||
if touch_activity_if_due is not None:
|
||||
touch_activity_if_due(
|
||||
_activity_state, "waiting for user approval"
|
||||
)
|
||||
|
||||
# Clean up this entry from the queue
|
||||
with _lock:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue