mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
A child running a legitimately long-running tool (terminal command, browser fetch, big file read) holds current_tool set and keeps api_call_count frozen while the tool runs. The previous stale check treated that as idle after 5 heartbeat cycles (~150s), stopped touching the parent, and let the gateway kill the session. Split the threshold in two: - _HEARTBEAT_STALE_CYCLES_IDLE=5 (~150s) — applied only when current_tool is None (child wedged between turns) - _HEARTBEAT_STALE_CYCLES_IN_TOOL=20 (~600s) — applied when the child is inside a tool call Stale counter also resets when current_tool changes (new tool = progress). The hard child_timeout_seconds (default 600s) is still the final cap, so genuinely stuck tools don't get to block forever.
This commit is contained in:
parent
1840c6a57d
commit
fcc05284fc
2 changed files with 146 additions and 14 deletions
|
|
@ -1319,6 +1319,112 @@ class TestDelegateHeartbeat(unittest.TestCase):
|
||||||
any("API call #5 completed" in desc for desc in touch_calls),
|
any("API call #5 completed" in desc for desc in touch_calls),
|
||||||
f"Heartbeat should include last_activity_desc: {touch_calls}")
|
f"Heartbeat should include last_activity_desc: {touch_calls}")
|
||||||
|
|
||||||
|
def test_heartbeat_does_not_trip_idle_stale_while_inside_tool(self):
|
||||||
|
"""A long-running tool (no iteration advance, but current_tool set)
|
||||||
|
must not be flagged stale at the idle threshold.
|
||||||
|
|
||||||
|
Bug #13041: when a child is legitimately busy inside a slow tool
|
||||||
|
(terminal command, browser fetch), api_call_count does not advance.
|
||||||
|
The previous stale check treated this as idle and stopped the
|
||||||
|
heartbeat after 5 cycles (~150s), letting the gateway kill the
|
||||||
|
session. The fix uses a much higher in-tool threshold and only
|
||||||
|
applies the tight idle threshold when current_tool is None.
|
||||||
|
"""
|
||||||
|
from tools.delegate_tool import _run_single_child
|
||||||
|
|
||||||
|
parent = _make_mock_parent()
|
||||||
|
touch_calls = []
|
||||||
|
parent._touch_activity = lambda desc: touch_calls.append(desc)
|
||||||
|
|
||||||
|
child = MagicMock()
|
||||||
|
# Child is stuck inside a single terminal call for the whole run.
|
||||||
|
# api_call_count never advances, current_tool is always set.
|
||||||
|
child.get_activity_summary.return_value = {
|
||||||
|
"current_tool": "terminal",
|
||||||
|
"api_call_count": 1,
|
||||||
|
"max_iterations": 50,
|
||||||
|
"last_activity_desc": "executing tool: terminal",
|
||||||
|
}
|
||||||
|
|
||||||
|
def slow_run(**kwargs):
|
||||||
|
# Long enough to exceed the OLD idle threshold (5 cycles) at
|
||||||
|
# the patched interval, but shorter than the new in-tool
|
||||||
|
# threshold.
|
||||||
|
time.sleep(0.4)
|
||||||
|
return {"final_response": "done", "completed": True, "api_calls": 1}
|
||||||
|
|
||||||
|
child.run_conversation.side_effect = slow_run
|
||||||
|
|
||||||
|
# Patch both the interval AND the idle ceiling so the test proves
|
||||||
|
# the in-tool branch takes effect: with a 0.05s interval and the
|
||||||
|
# default _HEARTBEAT_STALE_CYCLES_IDLE=5, the old behavior would
|
||||||
|
# trip after 0.25s and stop firing. We should see heartbeats
|
||||||
|
# continuing through the full 0.4s run.
|
||||||
|
with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05):
|
||||||
|
_run_single_child(
|
||||||
|
task_index=0,
|
||||||
|
goal="Test long-running tool",
|
||||||
|
child=child,
|
||||||
|
parent_agent=parent,
|
||||||
|
)
|
||||||
|
|
||||||
|
# With the old idle threshold (5 cycles = 0.25s), touch_calls
|
||||||
|
# would cap at ~5. With the in-tool threshold (20 cycles = 1.0s),
|
||||||
|
# we should see substantially more heartbeats over 0.4s.
|
||||||
|
self.assertGreater(
|
||||||
|
len(touch_calls), 6,
|
||||||
|
f"Heartbeat stopped too early while child was inside a tool; "
|
||||||
|
f"got {len(touch_calls)} touches over 0.4s at 0.05s interval",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_heartbeat_still_trips_idle_stale_when_no_tool(self):
|
||||||
|
"""A wedged child with no current_tool still trips the idle threshold.
|
||||||
|
|
||||||
|
Regression guard: the fix for #13041 must not disable stale
|
||||||
|
detection entirely. A child that's hung between turns (no tool
|
||||||
|
running, no iteration progress) must still stop touching the
|
||||||
|
parent so the gateway timeout can fire.
|
||||||
|
"""
|
||||||
|
from tools.delegate_tool import _run_single_child
|
||||||
|
|
||||||
|
parent = _make_mock_parent()
|
||||||
|
touch_calls = []
|
||||||
|
parent._touch_activity = lambda desc: touch_calls.append(desc)
|
||||||
|
|
||||||
|
child = MagicMock()
|
||||||
|
# Wedged child: no tool running, iteration frozen.
|
||||||
|
child.get_activity_summary.return_value = {
|
||||||
|
"current_tool": None,
|
||||||
|
"api_call_count": 3,
|
||||||
|
"max_iterations": 50,
|
||||||
|
"last_activity_desc": "waiting for API response",
|
||||||
|
}
|
||||||
|
|
||||||
|
def slow_run(**kwargs):
|
||||||
|
time.sleep(0.6)
|
||||||
|
return {"final_response": "done", "completed": True, "api_calls": 3}
|
||||||
|
|
||||||
|
child.run_conversation.side_effect = slow_run
|
||||||
|
|
||||||
|
# At interval 0.05s, idle threshold (5 cycles) trips at ~0.25s.
|
||||||
|
# We should see the heartbeat stop firing well before 0.6s.
|
||||||
|
with patch("tools.delegate_tool._HEARTBEAT_INTERVAL", 0.05):
|
||||||
|
_run_single_child(
|
||||||
|
task_index=0,
|
||||||
|
goal="Test wedged child",
|
||||||
|
child=child,
|
||||||
|
parent_agent=parent,
|
||||||
|
)
|
||||||
|
|
||||||
|
# With idle threshold=5 + interval=0.05s, touches should cap
|
||||||
|
# around 5. Bound loosely to avoid timing flakes.
|
||||||
|
self.assertLess(
|
||||||
|
len(touch_calls), 9,
|
||||||
|
f"Idle stale detection did not fire: got {len(touch_calls)} "
|
||||||
|
f"touches over 0.6s — expected heartbeat to stop after "
|
||||||
|
f"~5 stale cycles",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestDelegationReasoningEffort(unittest.TestCase):
|
class TestDelegationReasoningEffort(unittest.TestCase):
|
||||||
"""Tests for delegation.reasoning_effort config override."""
|
"""Tests for delegation.reasoning_effort config override."""
|
||||||
|
|
|
||||||
|
|
@ -411,9 +411,15 @@ def _preserve_parent_mcp_toolsets(
|
||||||
DEFAULT_MAX_ITERATIONS = 50
|
DEFAULT_MAX_ITERATIONS = 50
|
||||||
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
|
DEFAULT_CHILD_TIMEOUT = 600 # seconds before a child agent is considered stuck
|
||||||
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
|
_HEARTBEAT_INTERVAL = 30 # seconds between parent activity heartbeats during delegation
|
||||||
_HEARTBEAT_STALE_CYCLES = (
|
# Stale-heartbeat thresholds. A child with no API-call progress is either:
|
||||||
5 # mark child stale after this many heartbeats with no iteration progress
|
# - idle between turns (no current_tool) — probably stuck on a slow API call
|
||||||
)
|
# - inside a tool (current_tool set) — probably running a legitimately long
|
||||||
|
# operation (terminal command, web fetch, large file read)
|
||||||
|
# The idle ceiling stays tight so genuinely stuck children don't mask the gateway
|
||||||
|
# timeout. The in-tool ceiling is much higher so legit long-running tools get
|
||||||
|
# time to finish; child_timeout_seconds (default 600s) is still the hard cap.
|
||||||
|
_HEARTBEAT_STALE_CYCLES_IDLE = 5 # 5 * 30s = 150s idle between turns → stale
|
||||||
|
_HEARTBEAT_STALE_CYCLES_IN_TOOL = 20 # 20 * 30s = 600s stuck on same tool → stale
|
||||||
DEFAULT_TOOLSETS = ["terminal", "file", "web"]
|
DEFAULT_TOOLSETS = ["terminal", "file", "web"]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1201,7 +1207,11 @@ def _run_single_child(
|
||||||
# Without this, the parent's _last_activity_ts freezes when delegate_task
|
# Without this, the parent's _last_activity_ts freezes when delegate_task
|
||||||
# starts and the gateway eventually kills the agent for "no activity".
|
# starts and the gateway eventually kills the agent for "no activity".
|
||||||
_heartbeat_stop = threading.Event()
|
_heartbeat_stop = threading.Event()
|
||||||
_last_seen_iter = [0] # mutable container for heartbeat stale detection
|
# Stale detection: track the child's (tool, iteration) pair across
|
||||||
|
# heartbeat cycles. If neither advances, count the cycle as stale.
|
||||||
|
# Different thresholds for idle vs in-tool (see _HEARTBEAT_STALE_CYCLES_*).
|
||||||
|
_last_seen_iter = [0]
|
||||||
|
_last_seen_tool = [None] # type: list
|
||||||
_stale_count = [0]
|
_stale_count = [0]
|
||||||
|
|
||||||
def _heartbeat_loop():
|
def _heartbeat_loop():
|
||||||
|
|
@ -1219,22 +1229,38 @@ def _run_single_child(
|
||||||
child_iter = child_summary.get("api_call_count", 0)
|
child_iter = child_summary.get("api_call_count", 0)
|
||||||
child_max = child_summary.get("max_iterations", 0)
|
child_max = child_summary.get("max_iterations", 0)
|
||||||
|
|
||||||
# Stale detection: if iteration count hasn't advanced,
|
# Stale detection: count cycles where neither the iteration
|
||||||
# increment stale counter. After N cycles with no
|
# count nor the current_tool advances. A child running a
|
||||||
# progress, stop masking the hang so the gateway
|
# legitimately long-running tool (terminal command, web
|
||||||
# inactivity timeout can fire as a last resort.
|
# fetch) keeps current_tool set but doesn't advance
|
||||||
if child_iter <= _last_seen_iter[0]:
|
# api_call_count — we don't want that to look stale at the
|
||||||
_stale_count[0] += 1
|
# idle threshold.
|
||||||
else:
|
iter_advanced = child_iter > _last_seen_iter[0]
|
||||||
|
tool_changed = child_tool != _last_seen_tool[0]
|
||||||
|
if iter_advanced or tool_changed:
|
||||||
_last_seen_iter[0] = child_iter
|
_last_seen_iter[0] = child_iter
|
||||||
|
_last_seen_tool[0] = child_tool
|
||||||
_stale_count[0] = 0
|
_stale_count[0] = 0
|
||||||
|
else:
|
||||||
|
_stale_count[0] += 1
|
||||||
|
|
||||||
if _stale_count[0] >= _HEARTBEAT_STALE_CYCLES:
|
# Pick threshold based on whether the child is currently
|
||||||
|
# inside a tool call. In-tool threshold is high enough to
|
||||||
|
# cover legitimately slow tools; idle threshold stays
|
||||||
|
# tight so the gateway timeout can fire on a truly wedged
|
||||||
|
# child.
|
||||||
|
stale_limit = (
|
||||||
|
_HEARTBEAT_STALE_CYCLES_IN_TOOL
|
||||||
|
if child_tool
|
||||||
|
else _HEARTBEAT_STALE_CYCLES_IDLE
|
||||||
|
)
|
||||||
|
if _stale_count[0] >= stale_limit:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Subagent %d appears stale (no iteration progress "
|
"Subagent %d appears stale (no progress for %d "
|
||||||
"for %d heartbeat cycles) — stopping heartbeat",
|
"heartbeat cycles, tool=%s) — stopping heartbeat",
|
||||||
task_index,
|
task_index,
|
||||||
_stale_count[0],
|
_stale_count[0],
|
||||||
|
child_tool or "<none>",
|
||||||
)
|
)
|
||||||
break # stop touching parent, let gateway timeout fire
|
break # stop touching parent, let gateway timeout fire
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue