mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
#35994 moved /new reset cleanup off the loop, but _cleanup_agent_resources (agent.close() subprocess teardown; shutdown_memory_provider() plugin IO) was still called INLINE on the event loop from three other sites: - _session_expiry_watcher (5-min idle sweep) — live loop - _handle_message_with_agent cache-hygiene re-eviction — live loop - _finalize_shutdown_agents / stop() idle-cache loop — shutdown A wedged memory provider on any of these froze the loop: bot goes silent, runtime-status updated_at heartbeat stops advancing, and SIGTERM can't be serviced (requires kill -9) — exactly the #53175 zombie pattern. Adds _cleanup_agent_resources_off_loop: a bounded (30s) worker-thread offload mirroring the #35994 reset fix, and routes all four sites through it.
226 lines
7.4 KiB
Python
226 lines
7.4 KiB
Python
"""Regression tests for gateway shutdown cleaning up cached agent memory providers (issue #11205).

When the gateway shuts down, ``stop()`` called ``_finalize_shutdown_agents()``
which only drained agents in ``_running_agents``. Idle agents sitting in
``_agent_cache`` (LRU cache) were never cleaned up, so their
``MemoryProvider.on_session_end()`` hooks never fired.

The fix adds an explicit sweep of ``_agent_cache`` after
``_finalize_shutdown_agents`` in the ``_stop_impl`` coroutine.
"""
|
|
|
|
import asyncio
|
|
import threading
|
|
from collections import OrderedDict
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
# Import the module (not the class) to reach stop() and helpers
|
|
import gateway.run as gw_mod
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class _FakeGateway:
|
|
"""Minimal stand-in with just enough state for ``stop()`` to run."""
|
|
|
|
def __init__(self):
|
|
self._running = True
|
|
self._draining = False
|
|
self._restart_requested = False
|
|
self._restart_detached = False
|
|
self._restart_via_service = False
|
|
self._stop_task = None
|
|
self._exit_cleanly = False
|
|
self._exit_with_failure = False
|
|
self._exit_reason = None
|
|
self._exit_code = None
|
|
self._restart_drain_timeout = 0.01
|
|
self._running_agents = {}
|
|
self._running_agents_ts = {}
|
|
self._agent_cache = OrderedDict()
|
|
self._agent_cache_lock = threading.Lock()
|
|
self.adapters = {}
|
|
self._background_tasks = set()
|
|
self._failed_platforms = []
|
|
self._shutdown_event = asyncio.Event()
|
|
self._pending_messages = {}
|
|
self._pending_approvals = {}
|
|
self._busy_ack_ts = {}
|
|
|
|
def _running_agent_count(self):
|
|
return len(self._running_agents)
|
|
|
|
def _update_runtime_status(self, *_a, **_kw):
|
|
pass
|
|
|
|
async def _run_in_executor_with_context(self, func, *args):
|
|
# stop() offloads agent-resource cleanup off the loop (#53175); run
|
|
# inline in tests so the bounded-cleanup path is exercised.
|
|
return func(*args)
|
|
|
|
async def _cleanup_agent_resources_off_loop(self, agent, *, context=""):
|
|
# Mirror the real bounded helper, inline (no executor/timeout) so the
|
|
# fake exercises the same call shape stop() now uses.
|
|
self._cleanup_agent_resources(agent)
|
|
|
|
async def _notify_active_sessions_of_shutdown(self):
|
|
pass
|
|
|
|
async def _drain_active_agents(self, timeout):
|
|
return {}, False
|
|
|
|
async def _finalize_shutdown_agents(self, agents):
|
|
for agent in agents.values():
|
|
self._cleanup_agent_resources(agent)
|
|
|
|
def _cleanup_agent_resources(self, agent):
|
|
if agent is None:
|
|
return
|
|
try:
|
|
if hasattr(agent, "shutdown_memory_provider"):
|
|
agent.shutdown_memory_provider()
|
|
except Exception:
|
|
pass
|
|
try:
|
|
if hasattr(agent, "close"):
|
|
agent.close()
|
|
except Exception:
|
|
pass
|
|
|
|
def _evict_cached_agent(self, key):
|
|
pass
|
|
|
|
def _release_running_agent_state(self, session_key, **_kwargs):
|
|
agent = self._running_agents.pop(session_key, None)
|
|
self._running_agents_ts.pop(session_key, None)
|
|
self._cleanup_agent_resources(agent)
|
|
return agent is not None
|
|
|
|
|
|
def _make_mock_agent():
|
|
a = MagicMock()
|
|
a.shutdown_memory_provider = MagicMock()
|
|
a.close = MagicMock()
|
|
return a
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCachedAgentCleanupOnShutdown:
    """``stop()`` must run ``_cleanup_agent_resources`` on idle cached agents.

    That cleanup triggers ``shutdown_memory_provider()`` (which calls
    ``on_session_end``).
    """

    @pytest.mark.asyncio
    async def test_cached_agent_memory_provider_shut_down(self):
        """Gateway stop invokes shutdown_memory_provider on a cached agent."""
        gateway = _FakeGateway()
        cached = _make_mock_agent()
        gateway._agent_cache["session-1"] = (cached, "sig-123")

        # Drive the real GatewayRunner.stop() with the fake standing in as self.
        await gw_mod.GatewayRunner.stop(gateway)

        cached.shutdown_memory_provider.assert_called_once()

    @pytest.mark.asyncio
    async def test_cache_cleared_after_shutdown(self):
        """After stop, ``_agent_cache`` holds no entries."""
        gateway = _FakeGateway()
        gateway._agent_cache["s1"] = (_make_mock_agent(), "sig1")

        await gw_mod.GatewayRunner.stop(gateway)

        assert len(gateway._agent_cache) == 0

    @pytest.mark.asyncio
    async def test_no_cached_agents_no_error(self):
        """An empty ``_agent_cache`` does not make stop() fail."""
        gateway = _FakeGateway()

        # Must complete without raising.
        await gw_mod.GatewayRunner.stop(gateway)

        assert len(gateway._agent_cache) == 0

    @pytest.mark.asyncio
    async def test_multiple_cached_agents_all_cleaned(self):
        """Every one of several cached agents is cleaned up."""
        gateway = _FakeGateway()
        cached_agents = [_make_mock_agent() for _ in range(5)]
        for idx, cached in enumerate(cached_agents):
            gateway._agent_cache[f"s{idx}"] = (cached, f"sig{idx}")

        await gw_mod.GatewayRunner.stop(gateway)

        for cached in cached_agents:
            cached.shutdown_memory_provider.assert_called_once()

    @pytest.mark.asyncio
    async def test_cleanup_survives_agent_exception(self):
        """One agent raising during shutdown does not block the others."""
        gateway = _FakeGateway()

        # Both teardown hooks of the failing agent raise.
        failing = _make_mock_agent()
        failing.shutdown_memory_provider.side_effect = RuntimeError("boom")
        failing.close.side_effect = RuntimeError("boom")

        healthy = _make_mock_agent()

        # The failing agent is inserted first, ahead of the healthy one.
        gateway._agent_cache["bad"] = (failing, "sig-bad")
        gateway._agent_cache["good"] = (healthy, "sig-good")

        await gw_mod.GatewayRunner.stop(gateway)

        # The healthy agent must still have been cleaned up.
        healthy.shutdown_memory_provider.assert_called_once()

    @pytest.mark.asyncio
    async def test_plain_agent_not_tuple(self):
        """A bare agent stored without the tuple wrapper is cleaned as well."""
        gateway = _FakeGateway()
        bare = _make_mock_agent()
        # Stored directly rather than as an (agent, signature) tuple.
        gateway._agent_cache["s1"] = bare

        await gw_mod.GatewayRunner.stop(gateway)

        bare.shutdown_memory_provider.assert_called_once()
        assert len(gateway._agent_cache) == 0

    @pytest.mark.asyncio
    async def test_none_entry_skipped(self):
        """A ``None`` cache entry is tolerated without error."""
        gateway = _FakeGateway()
        gateway._agent_cache["s1"] = None

        await gw_mod.GatewayRunner.stop(gateway)

        assert len(gateway._agent_cache) == 0
|
|
|
|
|
|
class TestRunningAgentsNotDoubleCleaned:
    """Behavior when one agent sits in both ``_running_agents`` and ``_agent_cache``."""

    @pytest.mark.asyncio
    async def test_running_and_cached_agent_cleaned_at_least_once(self):
        """An agent registered in both maps has shutdown_memory_provider
        called at least once."""
        gateway = _FakeGateway()
        both = _make_mock_agent()

        # Register the same object as running and as cached under one key.
        gateway._running_agents["s1"] = both
        gateway._agent_cache["s1"] = (both, "sig1")

        await gw_mod.GatewayRunner.stop(gateway)

        # One call is enough: it may come from _finalize_shutdown_agents,
        # from the cache sweep, or from both.
        assert both.shutdown_memory_provider.call_count >= 1
|