mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
A child running a legitimately long-running tool (terminal command, browser fetch, big file read) keeps current_tool set while api_call_count stays frozen for as long as the tool runs. The previous stale check treated that as idle after 5 heartbeat cycles (~150s), stopped touching the parent, and let the gateway kill the session.

Split the threshold in two:
- _HEARTBEAT_STALE_CYCLES_IDLE=5 (~150s) — applied only when current_tool is None (child wedged between turns)
- _HEARTBEAT_STALE_CYCLES_IN_TOOL=20 (~600s) — applied when the child is inside a tool call

The stale counter also resets when current_tool changes (new tool = progress). The hard child_timeout_seconds (default 600s) is still the final cap, so genuinely stuck tools don't get to block forever.
This commit is contained in:
parent
1840c6a57d
commit
fcc05284fc
2 changed files with 146 additions and 14 deletions
|
|
@ -411,9 +411,15 @@ def _preserve_parent_mcp_toolsets(
|
|||
DEFAULT_MAX_ITERATIONS = 50
|
||||
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
|
||||
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
|
||||
_HEARTBEAT_STALE_CYCLES = (
|
||||
5 # mark child stale after this many heartbeats with no iteration progress
|
||||
)
|
||||
# Stale-heartbeat thresholds. A child with no API-call progress is either:
|
||||
# - idle between turns (no current_tool) — probably stuck on a slow API call
|
||||
# - inside a tool (current_tool set) — probably running a legitimately long
|
||||
# operation (terminal command, web fetch, large file read)
|
||||
# The idle ceiling stays tight so genuinely stuck children don't mask the gateway
|
||||
# timeout. The in-tool ceiling is much higher so legit long-running tools get
|
||||
# time to finish; child_timeout_seconds (default 600s) is still the hard cap.
|
||||
_HEARTBEAT_STALE_CYCLES_IDLE = 5 # 5 * 30s = 150s idle between turns → stale
|
||||
_HEARTBEAT_STALE_CYCLES_IN_TOOL = 20 # 20 * 30s = 600s stuck on same tool → stale
|
||||
DEFAULT_TOOLSETS = ["terminal", "file", "web"]
|
||||
|
||||
|
||||
|
|
@ -1201,7 +1207,11 @@ def _run_single_child(
|
|||
# Without this, the parent's _last_activity_ts freezes when delegate_task
|
||||
# starts and the gateway eventually kills the agent for "no activity".
|
||||
_heartbeat_stop = threading.Event()
|
||||
_last_seen_iter = [0] # mutable container for heartbeat stale detection
|
||||
# Stale detection: track the child's (tool, iteration) pair across
|
||||
# heartbeat cycles. If neither advances, count the cycle as stale.
|
||||
# Different thresholds for idle vs in-tool (see _HEARTBEAT_STALE_CYCLES_*).
|
||||
_last_seen_iter = [0]
|
||||
_last_seen_tool = [None] # type: list
|
||||
_stale_count = [0]
|
||||
|
||||
def _heartbeat_loop():
|
||||
|
|
@ -1219,22 +1229,38 @@ def _run_single_child(
|
|||
child_iter = child_summary.get("api_call_count", 0)
|
||||
child_max = child_summary.get("max_iterations", 0)
|
||||
|
||||
# Stale detection: if iteration count hasn't advanced,
|
||||
# increment stale counter. After N cycles with no
|
||||
# progress, stop masking the hang so the gateway
|
||||
# inactivity timeout can fire as a last resort.
|
||||
if child_iter <= _last_seen_iter[0]:
|
||||
_stale_count[0] += 1
|
||||
else:
|
||||
# Stale detection: count cycles where neither the iteration
|
||||
# count nor the current_tool advances. A child running a
|
||||
# legitimately long-running tool (terminal command, web
|
||||
# fetch) keeps current_tool set but doesn't advance
|
||||
# api_call_count — we don't want that to look stale at the
|
||||
# idle threshold.
|
||||
iter_advanced = child_iter > _last_seen_iter[0]
|
||||
tool_changed = child_tool != _last_seen_tool[0]
|
||||
if iter_advanced or tool_changed:
|
||||
_last_seen_iter[0] = child_iter
|
||||
_last_seen_tool[0] = child_tool
|
||||
_stale_count[0] = 0
|
||||
else:
|
||||
_stale_count[0] += 1
|
||||
|
||||
if _stale_count[0] >= _HEARTBEAT_STALE_CYCLES:
|
||||
# Pick threshold based on whether the child is currently
|
||||
# inside a tool call. In-tool threshold is high enough to
|
||||
# cover legitimately slow tools; idle threshold stays
|
||||
# tight so the gateway timeout can fire on a truly wedged
|
||||
# child.
|
||||
stale_limit = (
|
||||
_HEARTBEAT_STALE_CYCLES_IN_TOOL
|
||||
if child_tool
|
||||
else _HEARTBEAT_STALE_CYCLES_IDLE
|
||||
)
|
||||
if _stale_count[0] >= stale_limit:
|
||||
logger.warning(
|
||||
"Subagent %d appears stale (no iteration progress "
|
||||
"for %d heartbeat cycles) — stopping heartbeat",
|
||||
"Subagent %d appears stale (no progress for %d "
|
||||
"heartbeat cycles, tool=%s) — stopping heartbeat",
|
||||
task_index,
|
||||
_stale_count[0],
|
||||
child_tool or "<none>",
|
||||
)
|
||||
break # stop touching parent, let gateway timeout fire
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue