fix(delegate): tool-activity-aware heartbeat stale detection (#13041) (#15183)

A child running a legitimately long-running tool (terminal command,
browser fetch, big file read) holds current_tool set and keeps
api_call_count frozen while the tool runs. The previous stale check
treated that as idle after 5 heartbeat cycles (~150s), stopped
touching the parent, and let the gateway kill the session.

Split the threshold in two:
- _HEARTBEAT_STALE_CYCLES_IDLE=5 (~150s)  — applied only when
  current_tool is None (child wedged between turns)
- _HEARTBEAT_STALE_CYCLES_IN_TOOL=20 (~600s) — applied when the child
  is inside a tool call

Stale counter also resets when current_tool changes (new tool =
progress). The hard child_timeout_seconds (default 600s) is still
the final cap, so genuinely stuck tools don't get to block forever.
This commit is contained in:
Teknium 2026-04-24 07:25:19 -07:00 committed by GitHub
parent 1840c6a57d
commit fcc05284fc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 146 additions and 14 deletions

View file

@ -1319,6 +1319,112 @@ class TestDelegateHeartbeat(unittest.TestCase):
any("API call #5 completed" in desc for desc in touch_calls), any("API call #5 completed" in desc for desc in touch_calls),
f"Heartbeat should include last_activity_desc: {touch_calls}") f"Heartbeat should include last_activity_desc: {touch_calls}")
def test_heartbeat_does_not_trip_idle_stale_while_inside_tool(self):
"""A long-running tool (no iteration advance, but current_tool set)
must not be flagged stale at the idle threshold.
Bug #13041: when a child is legitimately busy inside a slow tool
(terminal command, browser fetch), api_call_count does not advance.
The previous stale check treated this as idle and stopped the
heartbeat after 5 cycles (~150s), letting the gateway kill the
session. The fix uses a much higher in-tool threshold and only
applies the tight idle threshold when current_tool is None.
"""
from tools.delegate_tool import _run_single_child
parent = _make_mock_parent()
touch_calls = []
parent._touch_activity = lambda desc: touch_calls.append(desc)
child = MagicMock()
# Child is stuck inside a single terminal call for the whole run.
# api_call_count never advances, current_tool is always set.
child.get_activity_summary.return_value = {
"current_tool": "terminal",
"api_call_count": 1,
"max_iterations": 50,
"last_activity_desc": "executing tool: terminal",
}
def slow_run(**kwargs):
# Long enough to exceed the OLD idle threshold (5 cycles) at
# the patched interval, but shorter than the new in-tool
# threshold.
time.sleep(0.4)
return {"final_response": "done", "completed": True, "api_calls": 1}
child.run_conversation.side_effect = slow_run
# Patch both the interval AND the idle ceiling so the test proves
# the in-tool branch takes effect: with a 0.05s interval and the
# default _HEARTBEAT_STALE_CYCLES_IDLE=5, the old behavior would
# trip after 0.25s and stop firing. We should see heartbeats
# continuing through the full 0.4s run.
with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05):
_run_single_child(
task_index=0,
goal="Test long-running tool",
child=child,
parent_agent=parent,
)
# With the old idle threshold (5 cycles = 0.25s), touch_calls
# would cap at ~5. With the in-tool threshold (20 cycles = 1.0s),
# we should see substantially more heartbeats over 0.4s.
self.assertGreater(
len(touch_calls), 6,
f"Heartbeat stopped too early while child was inside a tool; "
f"got {len(touch_calls)} touches over 0.4s at 0.05s interval",
)
def test_heartbeat_still_trips_idle_stale_when_no_tool(self):
"""A wedged child with no current_tool still trips the idle threshold.
Regression guard: the fix for #13041 must not disable stale
detection entirely. A child that's hung between turns (no tool
running, no iteration progress) must still stop touching the
parent so the gateway timeout can fire.
"""
from tools.delegate_tool import _run_single_child
parent = _make_mock_parent()
touch_calls = []
parent._touch_activity = lambda desc: touch_calls.append(desc)
child = MagicMock()
# Wedged child: no tool running, iteration frozen.
child.get_activity_summary.return_value = {
"current_tool": None,
"api_call_count": 3,
"max_iterations": 50,
"last_activity_desc": "waiting for API response",
}
def slow_run(**kwargs):
time.sleep(0.6)
return {"final_response": "done", "completed": True, "api_calls": 3}
child.run_conversation.side_effect = slow_run
# At interval 0.05s, idle threshold (5 cycles) trips at ~0.25s.
# We should see the heartbeat stop firing well before 0.6s.
with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05):
_run_single_child(
task_index=0,
goal="Test wedged child",
child=child,
parent_agent=parent,
)
# With idle threshold=5 + interval=0.05s, touches should cap
# around 5. Bound loosely to avoid timing flakes.
self.assertLess(
len(touch_calls), 9,
f"Idle stale detection did not fire: got {len(touch_calls)} "
f"touches over 0.6s — expected heartbeat to stop after "
f"~5 stale cycles",
)
class TestDelegationReasoningEffort(unittest.TestCase): class TestDelegationReasoningEffort(unittest.TestCase):
"""Tests for delegation.reasoning_effort config override.""" """Tests for delegation.reasoning_effort config override."""

View file

@ -411,9 +411,15 @@ def _preserve_parent_mcp_toolsets(
DEFAULT_MAX_ITERATIONS = 50 DEFAULT_MAX_ITERATIONS = 50
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation _HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
_HEARTBEAT_STALE_CYCLES = ( # Stale-heartbeat thresholds. A child with no API-call progress is either:
5 # mark child stale after this many heartbeats with no iteration progress # - idle between turns (no current_tool) — probably stuck on a slow API call
) # - inside a tool (current_tool set) — probably running a legitimately long
# operation (terminal command, web fetch, large file read)
# The idle ceiling stays tight so genuinely stuck children don't mask the gateway
# timeout. The in-tool ceiling is much higher so legit long-running tools get
# time to finish; child_timeout_seconds (default 600s) is still the hard cap.
_HEARTBEAT_STALE_CYCLES_IDLE = 5 # 5 * 30s = 150s idle between turns → stale
_HEARTBEAT_STALE_CYCLES_IN_TOOL = 20 # 20 * 30s = 600s stuck on same tool → stale
DEFAULT_TOOLSETS = ["terminal", "file", "web"] DEFAULT_TOOLSETS = ["terminal", "file", "web"]
@ -1201,7 +1207,11 @@ def _run_single_child(
# Without this, the parent's _last_activity_ts freezes when delegate_task # Without this, the parent's _last_activity_ts freezes when delegate_task
# starts and the gateway eventually kills the agent for "no activity". # starts and the gateway eventually kills the agent for "no activity".
_heartbeat_stop = threading.Event() _heartbeat_stop = threading.Event()
_last_seen_iter = [0] # mutable container for heartbeat stale detection # Stale detection: track the child's (tool, iteration) pair across
# heartbeat cycles. If neither advances, count the cycle as stale.
# Different thresholds for idle vs in-tool (see _HEARTBEAT_STALE_CYCLES_*).
_last_seen_iter = [0]
_last_seen_tool = [None] # type: list
_stale_count = [0] _stale_count = [0]
def _heartbeat_loop(): def _heartbeat_loop():
@ -1219,22 +1229,38 @@ def _run_single_child(
child_iter = child_summary.get("api_call_count", 0) child_iter = child_summary.get("api_call_count", 0)
child_max = child_summary.get("max_iterations", 0) child_max = child_summary.get("max_iterations", 0)
# Stale detection: if iteration count hasn't advanced, # Stale detection: count cycles where neither the iteration
# increment stale counter. After N cycles with no # count nor the current_tool advances. A child running a
# progress, stop masking the hang so the gateway # legitimately long-running tool (terminal command, web
# inactivity timeout can fire as a last resort. # fetch) keeps current_tool set but doesn't advance
if child_iter <= _last_seen_iter[0]: # api_call_count — we don't want that to look stale at the
_stale_count[0] += 1 # idle threshold.
else: iter_advanced = child_iter > _last_seen_iter[0]
tool_changed = child_tool != _last_seen_tool[0]
if iter_advanced or tool_changed:
_last_seen_iter[0] = child_iter _last_seen_iter[0] = child_iter
_last_seen_tool[0] = child_tool
_stale_count[0] = 0 _stale_count[0] = 0
else:
_stale_count[0] += 1
if _stale_count[0] >= _HEARTBEAT_STALE_CYCLES: # Pick threshold based on whether the child is currently
# inside a tool call. In-tool threshold is high enough to
# cover legitimately slow tools; idle threshold stays
# tight so the gateway timeout can fire on a truly wedged
# child.
stale_limit = (
_HEARTBEAT_STALE_CYCLES_IN_TOOL
if child_tool
else _HEARTBEAT_STALE_CYCLES_IDLE
)
if _stale_count[0] >= stale_limit:
logger.warning( logger.warning(
"Subagent %d appears stale (no iteration progress " "Subagent %d appears stale (no progress for %d "
"for %d heartbeat cycles) — stopping heartbeat", "heartbeat cycles, tool=%s) — stopping heartbeat",
task_index, task_index,
_stale_count[0], _stale_count[0],
child_tool or "<none>",
) )
break # stop touching parent, let gateway timeout fire break # stop touching parent, let gateway timeout fire