mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-03 12:23:08 +00:00
#35994 moved /new reset cleanup off the loop, but _cleanup_agent_resources (agent.close() subprocess teardown; shutdown_memory_provider() plugin IO) was still called INLINE on the event loop from three other sites: - _session_expiry_watcher (5-min idle sweep) — live loop - _handle_message_with_agent cache-hygiene re-eviction — live loop - _finalize_shutdown_agents / stop() idle-cache loop — shutdown A wedged memory provider on any of these froze the loop: bot goes silent, runtime-status updated_at heartbeat stops advancing, and SIGTERM can't be serviced (requires kill -9) — exactly the #53175 zombie pattern. Adds _cleanup_agent_resources_off_loop: a bounded (30s) worker-thread offload mirroring the #35994 reset fix, and routes all four sites through it.
174 lines
6 KiB
Python
174 lines
6 KiB
Python
"""Regression test for #53175: gateway event loop wedged by synchronous
|
|
agent-resource cleanup run inline from loop coroutines.
|
|
|
|
#35994 fixed the /new reset path, but the same synchronous
|
|
``_cleanup_agent_resources`` (agent.close() tears down terminal sandboxes /
|
|
browser daemons / background processes; shutdown_memory_provider() may do
|
|
SQLite / network IO via a memory plugin) was still called INLINE on the event
|
|
loop from three other places:
|
|
|
|
* ``_session_expiry_watcher`` (the 5-minute idle sweep) — live loop
|
|
* ``_handle_message_with_agent`` cache-hygiene re-eviction — live loop
|
|
* ``_finalize_shutdown_agents`` / ``stop()`` idle-cache loop — shutdown
|
|
|
|
A wedged provider on any of these froze the whole loop: the bot went silent,
|
|
the runtime-status ``updated_at`` heartbeat stopped advancing (the symptom the
|
|
reporter's watchdog keyed on), and SIGTERM could not be serviced (requiring
|
|
``kill -9``).
|
|
|
|
The fix routes all four call sites through ``_cleanup_agent_resources_off_loop``
|
|
which offloads to a worker thread under a bounded ``asyncio.wait_for``, so the
|
|
loop is never blocked and a stuck teardown degrades gracefully.
|
|
|
|
These tests drive that shared helper directly — it is the single chokepoint
|
|
every fixed call site now uses.
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
import threading
|
|
from contextvars import copy_context
|
|
from types import SimpleNamespace
|
|
|
|
import pytest
|
|
|
|
|
|
def _make_runner():
|
|
"""Bare GatewayRunner with a real thread-pool-backed executor helper."""
|
|
from gateway.run import GatewayRunner
|
|
|
|
runner = object.__new__(GatewayRunner)
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
executor = ThreadPoolExecutor(max_workers=2)
|
|
runner._get_executor = lambda: executor
|
|
|
|
async def _run_in_executor_with_context(func, *args):
|
|
loop = asyncio.get_running_loop()
|
|
ctx = copy_context()
|
|
return await loop.run_in_executor(executor, lambda: ctx.run(func, *args))
|
|
|
|
runner._run_in_executor_with_context = _run_in_executor_with_context
|
|
return runner, executor
|
|
|
|
|
|
def _agent_with_close(close_fn):
|
|
return SimpleNamespace(
|
|
close=close_fn,
|
|
shutdown_memory_provider=lambda *a, **k: None,
|
|
_session_messages=None,
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_cleanup_off_loop_does_not_block_event_loop():
|
|
"""A slow agent.close() must NOT freeze the loop. A concurrent heartbeat
|
|
keeps ticking WHILE close() blocks in its worker thread — proving the
|
|
cleanup was offloaded, not run inline (which would freeze the loop and
|
|
stall the runtime-status updated_at heartbeat, #53175)."""
|
|
runner, executor = _make_runner()
|
|
close_started = threading.Event()
|
|
release = threading.Event()
|
|
|
|
def slow_close():
|
|
close_started.set()
|
|
release.wait(timeout=5) # block the WORKER thread, not the loop
|
|
|
|
agent = _agent_with_close(slow_close)
|
|
|
|
ticks = {"n": 0}
|
|
stop = threading.Event()
|
|
|
|
async def _heartbeat():
|
|
while not stop.is_set():
|
|
ticks["n"] += 1
|
|
await asyncio.sleep(0.005)
|
|
|
|
hb = asyncio.create_task(_heartbeat())
|
|
cleanup_task = asyncio.create_task(
|
|
runner._cleanup_agent_resources_off_loop(agent, context="test")
|
|
)
|
|
|
|
for _ in range(200):
|
|
if close_started.is_set():
|
|
break
|
|
await asyncio.sleep(0.005)
|
|
assert close_started.is_set(), "close() never ran"
|
|
|
|
ticks_at_block = ticks["n"]
|
|
await asyncio.sleep(0.1)
|
|
ticks_during_block = ticks["n"] - ticks_at_block
|
|
|
|
release.set()
|
|
await cleanup_task
|
|
stop.set()
|
|
await hb
|
|
executor.shutdown(wait=False)
|
|
|
|
assert ticks_during_block >= 5, (
|
|
f"event loop was blocked during agent cleanup (#53175): only "
|
|
f"{ticks_during_block} ticks while close() was running"
|
|
)
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_cleanup_off_loop_times_out_gracefully(caplog):
|
|
"""A cleanup that exceeds the bounded timeout logs a warning and returns —
|
|
the caller (sweep / shutdown / hygiene) proceeds rather than hanging."""
|
|
runner, executor = _make_runner()
|
|
|
|
async def _instant_timeout(aw, timeout=None):
|
|
if asyncio.iscoroutine(aw):
|
|
aw.close()
|
|
raise asyncio.TimeoutError
|
|
|
|
import gateway.run as _run
|
|
|
|
agent = _agent_with_close(lambda: None)
|
|
with caplog.at_level(logging.WARNING, logger="gateway.run"):
|
|
# Patch the wait_for the helper uses so we don't actually wait 30s.
|
|
orig = _run.asyncio.wait_for
|
|
_run.asyncio.wait_for = _instant_timeout
|
|
try:
|
|
await runner._cleanup_agent_resources_off_loop(agent, context="sweep")
|
|
finally:
|
|
_run.asyncio.wait_for = orig
|
|
executor.shutdown(wait=False)
|
|
|
|
assert any(
|
|
"exceeded" in r.message and "#53175" in r.message for r in caplog.records
|
|
), "expected the timeout warning to be logged"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_cleanup_off_loop_swallows_executor_failure(caplog):
|
|
"""If the offloaded cleanup raises, the helper logs and returns — a
|
|
teardown failure must never abort the loop coroutine that triggered it."""
|
|
runner, executor = _make_runner()
|
|
|
|
def boom():
|
|
raise RuntimeError("provider shutdown blew up")
|
|
|
|
# _cleanup_agent_resources swallows its own internal errors, so to reach
|
|
# the helper's except branch make the offloaded call itself raise.
|
|
def _boom_cleanup(agent):
|
|
raise RuntimeError("boom")
|
|
|
|
runner._cleanup_agent_resources = _boom_cleanup
|
|
|
|
with caplog.at_level(logging.WARNING, logger="gateway.run"):
|
|
await runner._cleanup_agent_resources_off_loop(
|
|
_agent_with_close(boom), context="shutdown finalize"
|
|
)
|
|
executor.shutdown(wait=False)
|
|
|
|
assert any(
|
|
"failed" in r.message and "#53175" in r.message for r in caplog.records
|
|
), "expected the cleanup-failure warning to be logged"
|
|
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_cleanup_off_loop_none_agent_is_noop():
|
|
"""A None agent (None cache entry) is a no-op and never touches the loop."""
|
|
runner, executor = _make_runner()
|
|
await runner._cleanup_agent_resources_off_loop(None)
|
|
executor.shutdown(wait=False)
|