fix(delegate): tool-activity-aware heartbeat stale detection (#13041) (#15183)

A child running a legitimately long-running tool (terminal command,
browser fetch, big file read) holds current_tool set and keeps
api_call_count frozen while the tool runs. The previous stale check
treated that as idle after 5 heartbeat cycles (~150s), stopped
touching the parent, and let the gateway kill the session.

Split the threshold in two:
- _HEARTBEAT_STALE_CYCLES_IDLE=5 (~150s)  — applied only when
  current_tool is None (child wedged between turns)
- _HEARTBEAT_STALE_CYCLES_IN_TOOL=20 (~600s) — applied when the child
  is inside a tool call

The stale counter also resets when current_tool changes (new tool =
progress). The hard child_timeout_seconds (default 600s) is still
the final cap, so genuinely stuck tools don't get to block forever.
Note that at the defaults the in-tool threshold (20 * 30s = 600s)
coincides with that cap.
This commit is contained in:
Teknium 2026-04-24 07:25:19 -07:00 committed by GitHub
parent 1840c6a57d
commit fcc05284fc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 146 additions and 14 deletions

View file

@ -411,9 +411,15 @@ def _preserve_parent_mcp_toolsets(
DEFAULT_MAX_ITERATIONS = 50
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
_HEARTBEAT_STALE_CYCLES = (
5 # mark child stale after this many heartbeats with no iteration progress
)
# Stale-heartbeat thresholds. A child with no API-call progress is either:
# - idle between turns (no current_tool) — probably stuck on a slow API call
# - inside a tool (current_tool set) — probably running a legitimately long
# operation (terminal command, web fetch, large file read)
# The idle ceiling stays tight so the heartbeat stops masking the gateway
# timeout for a genuinely stuck child. The in-tool ceiling is much higher so
# legitimately long-running tools get time to finish; child_timeout_seconds
# (default 600s) is still the hard cap.
_HEARTBEAT_STALE_CYCLES_IDLE = 5 # 5 * 30s = 150s idle between turns → stale
_HEARTBEAT_STALE_CYCLES_IN_TOOL = 20 # 20 * 30s = 600s stuck on same tool → stale
DEFAULT_TOOLSETS = ["terminal", "file", "web"]
@ -1201,7 +1207,11 @@ def _run_single_child(
# Without this, the parent's _last_activity_ts freezes when delegate_task
# starts and the gateway eventually kills the agent for "no activity".
_heartbeat_stop = threading.Event()
_last_seen_iter = [0] # mutable container for heartbeat stale detection
# Stale detection: track the child's (tool, iteration) pair across
# heartbeat cycles. If neither advances, count the cycle as stale.
# Different thresholds for idle vs in-tool (see _HEARTBEAT_STALE_CYCLES_*).
_last_seen_iter = [0]
_last_seen_tool = [None] # type: list
_stale_count = [0]
def _heartbeat_loop():
@ -1219,22 +1229,38 @@ def _run_single_child(
child_iter = child_summary.get("api_call_count", 0)
child_max = child_summary.get("max_iterations", 0)
# Stale detection: if iteration count hasn't advanced,
# increment stale counter. After N cycles with no
# progress, stop masking the hang so the gateway
# inactivity timeout can fire as a last resort.
if child_iter <= _last_seen_iter[0]:
_stale_count[0] += 1
else:
# Stale detection: count cycles where neither the iteration
# count nor the current_tool advances. A child running a
# legitimately long-running tool (terminal command, web
# fetch) keeps current_tool set but doesn't advance
# api_call_count — we don't want that to look stale at the
# idle threshold.
iter_advanced = child_iter > _last_seen_iter[0]
tool_changed = child_tool != _last_seen_tool[0]
if iter_advanced or tool_changed:
_last_seen_iter[0] = child_iter
_last_seen_tool[0] = child_tool
_stale_count[0] = 0
else:
_stale_count[0] += 1
if _stale_count[0] >= _HEARTBEAT_STALE_CYCLES:
# Pick threshold based on whether the child is currently
# inside a tool call. In-tool threshold is high enough to
# cover legitimately slow tools; idle threshold stays
# tight so the gateway timeout can fire on a truly wedged
# child.
stale_limit = (
_HEARTBEAT_STALE_CYCLES_IN_TOOL
if child_tool
else _HEARTBEAT_STALE_CYCLES_IDLE
)
if _stale_count[0] >= stale_limit:
logger.warning(
"Subagent %d appears stale (no iteration progress "
"for %d heartbeat cycles) — stopping heartbeat",
"Subagent %d appears stale (no progress for %d "
"heartbeat cycles, tool=%s) — stopping heartbeat",
task_index,
_stale_count[0],
child_tool or "<none>",
)
break # stop touching parent, let gateway timeout fire