fix(delegate): tool-activity-aware heartbeat stale detection (#13041) (#15183)

A child running a legitimately long-running tool (terminal command,
browser fetch, big file read) holds current_tool set and keeps
api_call_count frozen while the tool runs. The previous stale check
treated that as idle after 5 heartbeat cycles (~150s), stopped
touching the parent, and let the gateway kill the session.

Split the threshold in two:
- _HEARTBEAT_STALE_CYCLES_IDLE=5 (~150s)  — applied only when
  current_tool is None (child wedged between turns)
- _HEARTBEAT_STALE_CYCLES_IN_TOOL=20 (~600s) — applied when the child
  is inside a tool call

Stale counter also resets when current_tool changes (new tool =
progress). The hard child_timeout_seconds (default 600s) is still
the final cap, so genuinely stuck tools don't get to block forever.
This commit is contained in:
Teknium 2026-04-24 07:25:19 -07:00 committed by GitHub
parent 1840c6a57d
commit fcc05284fc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 146 additions and 14 deletions

View file

@ -1319,6 +1319,112 @@ class TestDelegateHeartbeat(unittest.TestCase):
any("API call #5 completed" in desc for desc in touch_calls),
f"Heartbeat should include last_activity_desc: {touch_calls}")
def test_heartbeat_does_not_trip_idle_stale_while_inside_tool(self):
"""A long-running tool (no iteration advance, but current_tool set)
must not be flagged stale at the idle threshold.
Bug #13041: when a child is legitimately busy inside a slow tool
(terminal command, browser fetch), api_call_count does not advance.
The previous stale check treated this as idle and stopped the
heartbeat after 5 cycles (~150s), letting the gateway kill the
session. The fix uses a much higher in-tool threshold and only
applies the tight idle threshold when current_tool is None.
"""
from tools.delegate_tool import _run_single_child
parent = _make_mock_parent()
touch_calls = []
parent._touch_activity = lambda desc: touch_calls.append(desc)
child = MagicMock()
# Child is stuck inside a single terminal call for the whole run.
# api_call_count never advances, current_tool is always set.
child.get_activity_summary.return_value = {
"current_tool": "terminal",
"api_call_count": 1,
"max_iterations": 50,
"last_activity_desc": "executing tool: terminal",
}
def slow_run(**kwargs):
# Long enough to exceed the OLD idle threshold (5 cycles) at
# the patched interval, but shorter than the new in-tool
# threshold.
time.sleep(0.4)
return {"final_response": "done", "completed": True, "api_calls": 1}
child.run_conversation.side_effect = slow_run
# Patch both the interval AND the idle ceiling so the test proves
# the in-tool branch takes effect: with a 0.05s interval and the
# default _HEARTBEAT_STALE_CYCLES_IDLE=5, the old behavior would
# trip after 0.25s and stop firing. We should see heartbeats
# continuing through the full 0.4s run.
with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05):
_run_single_child(
task_index=0,
goal="Test long-running tool",
child=child,
parent_agent=parent,
)
# With the old idle threshold (5 cycles = 0.25s), touch_calls
# would cap at ~5. With the in-tool threshold (20 cycles = 1.0s),
# we should see substantially more heartbeats over 0.4s.
self.assertGreater(
len(touch_calls), 6,
f"Heartbeat stopped too early while child was inside a tool; "
f"got {len(touch_calls)} touches over 0.4s at 0.05s interval",
)
def test_heartbeat_still_trips_idle_stale_when_no_tool(self):
"""A wedged child with no current_tool still trips the idle threshold.
Regression guard: the fix for #13041 must not disable stale
detection entirely. A child that's hung between turns (no tool
running, no iteration progress) must still stop touching the
parent so the gateway timeout can fire.
"""
from tools.delegate_tool import _run_single_child
parent = _make_mock_parent()
touch_calls = []
parent._touch_activity = lambda desc: touch_calls.append(desc)
child = MagicMock()
# Wedged child: no tool running, iteration frozen.
child.get_activity_summary.return_value = {
"current_tool": None,
"api_call_count": 3,
"max_iterations": 50,
"last_activity_desc": "waiting for API response",
}
def slow_run(**kwargs):
time.sleep(0.6)
return {"final_response": "done", "completed": True, "api_calls": 3}
child.run_conversation.side_effect = slow_run
# At interval 0.05s, idle threshold (5 cycles) trips at ~0.25s.
# We should see the heartbeat stop firing well before 0.6s.
with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05):
_run_single_child(
task_index=0,
goal="Test wedged child",
child=child,
parent_agent=parent,
)
# With idle threshold=5 + interval=0.05s, touches should cap
# around 5. Bound loosely to avoid timing flakes.
self.assertLess(
len(touch_calls), 9,
f"Idle stale detection did not fire: got {len(touch_calls)} "
f"touches over 0.6s — expected heartbeat to stop after "
f"~5 stale cycles",
)
class TestDelegationReasoningEffort(unittest.TestCase):
"""Tests for delegation.reasoning_effort config override."""