From fcc05284fc7f8d46c143b3e940ade505f782eb6e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 24 Apr 2026 07:25:19 -0700 Subject: [PATCH] fix(delegate): tool-activity-aware heartbeat stale detection (#13041) (#15183) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A child running a legitimately long-running tool (terminal command, browser fetch, big file read) holds current_tool set and keeps api_call_count frozen while the tool runs. The previous stale check treated that as idle after 5 heartbeat cycles (~150s), stopped touching the parent, and let the gateway kill the session. Split the threshold in two: - _HEARTBEAT_STALE_CYCLES_IDLE=5 (~150s) — applied only when current_tool is None (child wedged between turns) - _HEARTBEAT_STALE_CYCLES_IN_TOOL=20 (~600s) — applied when the child is inside a tool call Stale counter also resets when current_tool changes (new tool = progress). The hard child_timeout_seconds (default 600s) is still the final cap, so genuinely stuck tools don't get to block forever. --- tests/tools/test_delegate.py | 106 +++++++++++++++++++++++++++++++++++ tools/delegate_tool.py | 54 +++++++++++++----- 2 files changed, 146 insertions(+), 14 deletions(-) diff --git a/tests/tools/test_delegate.py b/tests/tools/test_delegate.py index 9c93f05c7..f3a1a2632 100644 --- a/tests/tools/test_delegate.py +++ b/tests/tools/test_delegate.py @@ -1319,6 +1319,112 @@ class TestDelegateHeartbeat(unittest.TestCase): any("API call #5 completed" in desc for desc in touch_calls), f"Heartbeat should include last_activity_desc: {touch_calls}") + def test_heartbeat_does_not_trip_idle_stale_while_inside_tool(self): + """A long-running tool (no iteration advance, but current_tool set) + must not be flagged stale at the idle threshold. + + Bug #13041: when a child is legitimately busy inside a slow tool + (terminal command, browser fetch), api_call_count does not advance. + The previous stale check treated this as idle and stopped the + heartbeat after 5 cycles (~150s), letting the gateway kill the + session. The fix uses a much higher in-tool threshold and only + applies the tight idle threshold when current_tool is None. + """ + from tools.delegate_tool import _run_single_child + + parent = _make_mock_parent() + touch_calls = [] + parent._touch_activity = lambda desc: touch_calls.append(desc) + + child = MagicMock() + # Child is stuck inside a single terminal call for the whole run. + # api_call_count never advances, current_tool is always set. + child.get_activity_summary.return_value = { + "current_tool": "terminal", + "api_call_count": 1, + "max_iterations": 50, + "last_activity_desc": "executing tool: terminal", + } + + def slow_run(**kwargs): + # Long enough to exceed the OLD idle threshold (5 cycles) at + # the patched interval, but shorter than the new in-tool + # threshold. + time.sleep(0.4) + return {"final_response": "done", "completed": True, "api_calls": 1} + + child.run_conversation.side_effect = slow_run + + # Patch both the interval AND the idle ceiling so the test proves + # the in-tool branch takes effect: with a 0.05s interval and the + # default _HEARTBEAT_STALE_CYCLES_IDLE=5, the old behavior would + # trip after 0.25s and stop firing. We should see heartbeats + # continuing through the full 0.4s run. + with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05): + _run_single_child( + task_index=0, + goal="Test long-running tool", + child=child, + parent_agent=parent, + ) + + # With the old idle threshold (5 cycles = 0.25s), touch_calls + # would cap at ~5. With the in-tool threshold (20 cycles = 1.0s), + # we should see substantially more heartbeats over 0.4s. + self.assertGreater( + len(touch_calls), 6, + f"Heartbeat stopped too early while child was inside a tool; " + f"got {len(touch_calls)} touches over 0.4s at 0.05s interval", + ) + + def test_heartbeat_still_trips_idle_stale_when_no_tool(self): + """A wedged child with no current_tool still trips the idle threshold. + + Regression guard: the fix for #13041 must not disable stale + detection entirely. A child that's hung between turns (no tool + running, no iteration progress) must still stop touching the + parent so the gateway timeout can fire. + """ + from tools.delegate_tool import _run_single_child + + parent = _make_mock_parent() + touch_calls = [] + parent._touch_activity = lambda desc: touch_calls.append(desc) + + child = MagicMock() + # Wedged child: no tool running, iteration frozen. + child.get_activity_summary.return_value = { + "current_tool": None, + "api_call_count": 3, + "max_iterations": 50, + "last_activity_desc": "waiting for API response", + } + + def slow_run(**kwargs): + time.sleep(0.6) + return {"final_response": "done", "completed": True, "api_calls": 3} + + child.run_conversation.side_effect = slow_run + + # At interval 0.05s, idle threshold (5 cycles) trips at ~0.25s. + # We should see the heartbeat stop firing well before 0.6s. + with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05): + _run_single_child( + task_index=0, + goal="Test wedged child", + child=child, + parent_agent=parent, + ) + + # With idle threshold=5 + interval=0.05s, touches should cap + # around 5. Bound loosely to avoid timing flakes. + self.assertLess( + len(touch_calls), 9, + f"Idle stale detection did not fire: got {len(touch_calls)} " + f"touches over 0.6s — expected heartbeat to stop after " + f"~5 stale cycles", + ) + class TestDelegationReasoningEffort(unittest.TestCase): """Tests for delegation.reasoning_effort config override.""" diff --git a/tools/delegate_tool.py b/tools/delegate_tool.py index e3273c317..2bbf354cf 100644 --- a/tools/delegate_tool.py +++ b/tools/delegate_tool.py @@ -411,9 +411,15 @@ def _preserve_parent_mcp_toolsets( DEFAULT_MAX_ITERATIONS = 50 DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck _HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation -_HEARTBEAT_STALE_CYCLES = ( - 5 # mark child stale after this many heartbeats with no iteration progress -) +# Stale-heartbeat thresholds. A child with no API-call progress is either: +# - idle between turns (no current_tool) — probably stuck on a slow API call +# - inside a tool (current_tool set) — probably running a legitimately long +# operation (terminal command, web fetch, large file read) +# The idle ceiling stays tight so genuinely stuck children don't mask the gateway +# timeout. The in-tool ceiling is much higher so legit long-running tools get +# time to finish; child_timeout_seconds (default 600s) is still the hard cap. +_HEARTBEAT_STALE_CYCLES_IDLE = 5 # 5 * 30s = 150s idle between turns → stale +_HEARTBEAT_STALE_CYCLES_IN_TOOL = 20 # 20 * 30s = 600s stuck on same tool → stale DEFAULT_TOOLSETS = ["terminal", "file", "web"] @@ -1201,7 +1207,11 @@ def _run_single_child( # Without this, the parent's _last_activity_ts freezes when delegate_task # starts and the gateway eventually kills the agent for "no activity". _heartbeat_stop = threading.Event() - _last_seen_iter = [0] # mutable container for heartbeat stale detection + # Stale detection: track the child's (tool, iteration) pair across + # heartbeat cycles. If neither advances, count the cycle as stale. + # Different thresholds for idle vs in-tool (see _HEARTBEAT_STALE_CYCLES_*). + _last_seen_iter = [0] + _last_seen_tool = [None] # type: list _stale_count = [0] def _heartbeat_loop(): @@ -1219,22 +1229,38 @@ def _run_single_child( child_iter = child_summary.get("api_call_count", 0) child_max = child_summary.get("max_iterations", 0) - # Stale detection: if iteration count hasn't advanced, - # increment stale counter. After N cycles with no - # progress, stop masking the hang so the gateway - # inactivity timeout can fire as a last resort. - if child_iter <= _last_seen_iter[0]: - _stale_count[0] += 1 - else: + # Stale detection: count cycles where neither the iteration + # count nor the current_tool advances. A child running a + # legitimately long-running tool (terminal command, web + # fetch) keeps current_tool set but doesn't advance + # api_call_count — we don't want that to look stale at the + # idle threshold. + iter_advanced = child_iter > _last_seen_iter[0] + tool_changed = child_tool != _last_seen_tool[0] + if iter_advanced or tool_changed: _last_seen_iter[0] = child_iter + _last_seen_tool[0] = child_tool _stale_count[0] = 0 + else: + _stale_count[0] += 1 - if _stale_count[0] >= _HEARTBEAT_STALE_CYCLES: + # Pick threshold based on whether the child is currently + # inside a tool call. In-tool threshold is high enough to + # cover legitimately slow tools; idle threshold stays + # tight so the gateway timeout can fire on a truly wedged + # child. + stale_limit = ( + _HEARTBEAT_STALE_CYCLES_IN_TOOL + if child_tool + else _HEARTBEAT_STALE_CYCLES_IDLE + ) + if _stale_count[0] >= stale_limit: logger.warning( - "Subagent %d appears stale (no iteration progress " - "for %d heartbeat cycles) — stopping heartbeat", + "Subagent %d appears stale (no progress for %d " + "heartbeat cycles, tool=%s) — stopping heartbeat", task_index, _stale_count[0], + child_tool or "", ) break # stop touching parent, let gateway timeout fire