mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-11 08:42:11 +00:00
fix(gateway): preserve inactivity clock on interrupt-recursive cached-agent turns (#15654)
_last_activity_ts was unconditionally reset to time.time() on every _agent_cache hit. For interrupt-recursive _run_agent calls (_interrupt_depth > 0) this silently reset the inactivity watchdog's idle clock on each re-entry, preventing the 30-min timeout from ever firing when a turn got stuck in an interrupt loop. A stuck session would emit "Still working... iteration 0/60, starting new turn (cached)" heartbeats indefinitely instead of timing out.

Gate the reset on _interrupt_depth == 0 only. Fresh external turns still receive the reset so a session idle for 29 min doesn't trip the watchdog before the new turn makes its first API call (#9051).

The per-turn reset logic is extracted into a static helper _init_cached_agent_for_turn() to make it directly testable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
20cb706e03
commit
de24315978
2 changed files with 118 additions and 6 deletions
|
|
@ -8722,6 +8722,22 @@ class GatewayRunner:
|
|||
with _lock:
|
||||
self._agent_cache.pop(session_key, None)
|
||||
|
||||
@staticmethod
def _init_cached_agent_for_turn(agent: Any, interrupt_depth: int) -> None:
    """Prepare a cached agent's per-turn bookkeeping before the next turn.

    The API call counter is zeroed on every call.  The inactivity clock
    (``_last_activity_ts``) is refreshed only when ``interrupt_depth`` is
    0, i.e. for a fresh external turn:

    * Depth 0: without the refresh, a session that sat idle for 29 min
      would trip the inactivity watchdog before the new turn makes its
      first API call (#9051).
    * Depth > 0 (interrupt-recursive re-entry): the timestamp is left
      untouched so stuck-turn idle time keeps accumulating and the
      30-min timeout can fire (#15654).

    Args:
        agent: Cached agent whose private per-turn attributes are
            mutated in place.
        interrupt_depth: Interrupt recursion depth of the turn about to
            start; 0 means a fresh external turn.
    """
    # The call counter restarts for every turn, whatever the depth.
    agent._api_call_count = 0

    if interrupt_depth != 0:
        # Interrupt-recursive re-entry: keep the existing idle clock.
        return

    # NOTE(review): the activity description is assumed to be refreshed
    # together with the timestamp (depth 0 only) — confirm against the
    # original indentation.
    agent._last_activity_desc = "starting new turn (cached)"
    agent._last_activity_ts = time.time()
|
||||
|
||||
def _release_evicted_agent_soft(self, agent: Any) -> None:
|
||||
"""Soft cleanup for cache-evicted agents — preserves session tool state.
|
||||
|
||||
|
|
@ -9766,12 +9782,7 @@ class GatewayRunner:
|
|||
_cache.move_to_end(session_key)
|
||||
except KeyError:
|
||||
pass
|
||||
# Reset activity timestamp so the inactivity timeout
|
||||
# handler doesn't see stale idle time from the previous
|
||||
# turn and immediately kill this agent. (#9051)
|
||||
agent._last_activity_ts = time.time()
|
||||
agent._last_activity_desc = "starting new turn (cached)"
|
||||
agent._api_call_count = 0
|
||||
self._init_cached_agent_for_turn(agent, _interrupt_depth)
|
||||
logger.debug("Reusing cached agent for session %s", session_key)
|
||||
|
||||
if agent is None:
|
||||
|
|
|
|||
|
|
@ -1043,3 +1043,104 @@ class TestAgentCacheIdleResume:
|
|||
new_agent.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
class TestCachedAgentInactivityReset:
    """Only depth-0 turns may restart a cached agent's inactivity clock.

    During interrupt-recursive turns (_interrupt_depth > 0) the idle time
    has to keep growing, otherwise the inactivity watchdog can never fire
    for a turn stuck in an interrupt loop; an unconditional reset is what
    kept the 30-min timeout from triggering (#15654).  Fresh turns still
    need the reset: a session that was idle for 29 min must not trip the
    watchdog before the new turn makes its first API call (#9051).
    """

    def _fake_agent(self, stale_seconds: float = 1800.0):
        """Return a mock agent last active ``stale_seconds`` ago."""
        import time as _t

        stub = MagicMock()
        stub._last_activity_desc = "previous turn activity"
        stub._api_call_count = 10
        stub._last_activity_ts = _t.time() - stale_seconds
        return stub

    def test_fresh_turn_resets_idle_clock(self):
        """Depth 0 restarts the clock, giving a post-idle turn a fresh
        30-min inactivity window (guard for #9051)."""
        import time as _t
        from gateway.run import GatewayRunner

        cached = self._fake_agent(stale_seconds=1800.0)
        stale_ts = cached._last_activity_ts
        call_started = _t.time()

        GatewayRunner._init_cached_agent_for_turn(cached, interrupt_depth=0)

        refreshed_ts = cached._last_activity_ts
        assert refreshed_ts >= call_started, (
            "_last_activity_ts was not reset on a fresh turn (interrupt_depth=0)"
        )
        assert refreshed_ts > stale_ts, (
            "Stale idle time should be cleared so the new turn gets a fresh window"
        )

    def test_interrupt_turn_preserves_idle_clock(self):
        """Depth 1 keeps the clock, so an interrupt-recursive re-entry does
        not throw away accumulated stuck-turn idle time (#15654)."""
        from gateway.run import GatewayRunner

        cached = self._fake_agent(stale_seconds=1200.0)
        ts_before_call = cached._last_activity_ts

        GatewayRunner._init_cached_agent_for_turn(cached, interrupt_depth=1)

        assert cached._last_activity_ts == ts_before_call, (
            "_last_activity_ts must not be reset on interrupt-recursive turns "
            "(interrupt_depth>0) — the watchdog needs the accumulated idle time"
        )

    def test_deep_interrupt_recursion_preserves_idle_clock(self):
        """The clock is preserved at deeper non-zero depths as well."""
        from gateway.run import GatewayRunner

        cached = self._fake_agent(stale_seconds=600.0)
        ts_before_call = cached._last_activity_ts

        GatewayRunner._init_cached_agent_for_turn(cached, interrupt_depth=4)

        assert cached._last_activity_ts == ts_before_call

    def test_api_call_count_reset_regardless_of_depth(self):
        """The API call counter drops to 0 for the new turn at every depth."""
        from gateway.run import GatewayRunner

        fresh_agent = self._fake_agent()
        interrupted_agent = self._fake_agent()

        GatewayRunner._init_cached_agent_for_turn(fresh_agent, interrupt_depth=0)
        GatewayRunner._init_cached_agent_for_turn(interrupted_agent, interrupt_depth=1)

        assert fresh_agent._api_call_count == 0
        assert interrupted_agent._api_call_count == 0

    def test_watchdog_accumulation_across_recursive_turns(self):
        """Stuck turn followed by a user interrupt and a recursive turn.

        The idle time visible to the watchdog has to cover the whole
        stuck period rather than starting over at the recursive re-entry.
        """
        import time as _t
        from gateway.run import GatewayRunner

        STUCK_FOR = 1750.0
        cached = self._fake_agent(stale_seconds=STUCK_FOR)

        # The user sees "Still working..." and sends another message; the
        # resulting interrupt makes _run_agent recurse at depth=1.
        GatewayRunner._init_cached_agent_for_turn(cached, interrupt_depth=1)

        # The watchdog computes time.time() - _last_activity_ts, which
        # must still be at least the stuck duration (1s tolerance).
        observed_idle = _t.time() - cached._last_activity_ts
        assert observed_idle >= STUCK_FOR - 1.0, (
            f"Watchdog would see {observed_idle:.0f}s idle, expected ~{STUCK_FOR}s. "
            "Inactivity timeout could not fire for a stuck interrupted turn."
        )
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue