mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-03 12:23:08 +00:00
The main stop loop in _stop_impl() awaited adapter.cancel_background_tasks() and adapter.disconnect() with no timeout, for both the primary and the secondary-profile (multiplex) adapter maps. A half-dead platform — a wedged Feishu/Lark WebSocket thread blocked on network I/O is the reported case — makes one of those awaits block forever, so the process never exits. systemd then SIGKILLs it after TimeoutStopSec, skipping atexit PID-file cleanup, and the next start dies with 'PID file race lost' and enters a restart loop. The per-adapter timeout infra already existed on main (_adapter_disconnect_timeout_secs / HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT, default 5s) but was only wired into _safe_adapter_disconnect, which the teardown path never calls. Add _bounded_adapter_teardown(): wraps BOTH cancel_background_tasks() and disconnect() in the existing timeout budget, logs and forces forward progress on timeout, and never raises. Both teardown loops now route through it, so the stop sequence always completes regardless of any adapter's internal behavior and PID-file cleanup runs. Original report + fix direction by @happy5318 (#14128, #14130); this widens it to cover cancel_background_tasks(), the multiplex loop, and the config knob. Co-authored-by: happy5318 <happy5318@users.noreply.github.com>
134 lines
4.9 KiB
Python
134 lines
4.9 KiB
Python
"""Regression tests: the shutdown teardown loop must not hang on a wedged adapter.

`GatewayRunner._stop_impl()` tears down every adapter by awaiting
`cancel_background_tasks()` then `disconnect()`. Both calls can block
indefinitely when a platform's network state is half-dead (e.g. a wedged
Feishu/Lark WebSocket thread waiting on I/O). An unbounded await stalls the
whole shutdown past systemd's TimeoutStopSec; the resulting SIGKILL skips
atexit PID-file cleanup, so the next start dies with "PID file race lost"
(#14128).

The fix routes both teardown loops through `_bounded_adapter_teardown`,
which wraps each await in the existing per-adapter timeout budget
(HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT) and always returns.
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
from unittest.mock import AsyncMock, MagicMock
|
|
|
|
import pytest
|
|
|
|
from gateway.config import Platform
|
|
from gateway.run import GatewayRunner
|
|
|
|
|
|
@pytest.fixture
def bare_runner():
    """Provide a GatewayRunner allocated without running its ``__init__``.

    The tests in this module only call ``_bounded_adapter_teardown`` on the
    returned object, so no further setup is performed on it here.
    """
    runner = object.__new__(GatewayRunner)
    return runner
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_teardown_calls_both_methods(bare_runner):
    """Teardown awaits cancel_background_tasks() first and disconnect() second."""
    observed = []

    def recorder(label):
        # Build a zero-argument side effect that notes which adapter method
        # ran; it returns None, so the mocked await resolves to None as well.
        def _note():
            observed.append(label)

        return _note

    adapter = MagicMock()
    adapter.cancel_background_tasks = AsyncMock(side_effect=recorder("cancel"))
    adapter.disconnect = AsyncMock(side_effect=recorder("disconnect"))

    await bare_runner._bounded_adapter_teardown(adapter, Platform.TELEGRAM)

    adapter.cancel_background_tasks.assert_awaited_once()
    adapter.disconnect.assert_awaited_once()
    # Ordering matters: background tasks are cancelled before disconnecting.
    assert observed == ["cancel", "disconnect"]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_teardown_bounds_hanging_disconnect(bare_runner, monkeypatch, caplog):
    """A disconnect() that never returns is timed out rather than awaited forever."""

    async def never_returns():
        # Stand-in for a wedged adapter call: far longer than any bound used here.
        await asyncio.sleep(60)

    # Shrink the per-adapter timeout budget so the test finishes quickly.
    monkeypatch.setenv("HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT", "0.01")

    stuck_adapter = MagicMock()
    stuck_adapter.cancel_background_tasks = AsyncMock(return_value=None)
    stuck_adapter.disconnect = AsyncMock(side_effect=never_returns)

    with caplog.at_level(logging.WARNING, logger="gateway.run"):
        teardown = bare_runner._bounded_adapter_teardown(
            stuck_adapter, Platform.FEISHU
        )
        # Outer guard only: the helper itself must return well under this.
        await asyncio.wait_for(teardown, timeout=5.0)

    stuck_adapter.disconnect.assert_awaited_once()
    assert "feishu disconnect timed out" in caplog.text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_teardown_bounds_hanging_cancel(bare_runner, monkeypatch, caplog):
    """A stuck cancel_background_tasks() is timed out and disconnect() still runs."""

    async def never_returns():
        # Stand-in for a wedged adapter call: far longer than any bound used here.
        await asyncio.sleep(60)

    # Shrink the per-adapter timeout budget so the test finishes quickly.
    monkeypatch.setenv("HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT", "0.01")

    stuck_adapter = MagicMock()
    stuck_adapter.cancel_background_tasks = AsyncMock(side_effect=never_returns)
    stuck_adapter.disconnect = AsyncMock(return_value=None)

    with caplog.at_level(logging.WARNING, logger="gateway.run"):
        teardown = bare_runner._bounded_adapter_teardown(
            stuck_adapter, Platform.FEISHU
        )
        # Outer guard so a regression fails the test instead of hanging it.
        await asyncio.wait_for(teardown, timeout=5.0)

    assert "feishu background-task cancel timed out" in caplog.text
    # Forward progress: disconnect is still attempted after the cancel timeout.
    stuck_adapter.disconnect.assert_awaited_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_teardown_swallows_exceptions(bare_runner):
    """Failures in either adapter call stay inside the helper so shutdown continues."""
    cancel_failure = RuntimeError("bg")
    disconnect_failure = RuntimeError("disc")

    failing_adapter = MagicMock()
    failing_adapter.cancel_background_tasks = AsyncMock(side_effect=cancel_failure)
    failing_adapter.disconnect = AsyncMock(side_effect=disconnect_failure)

    # The await below is the assertion: it must complete without raising.
    await bare_runner._bounded_adapter_teardown(failing_adapter, Platform.TELEGRAM)

    # Both steps were still attempted despite the first one failing.
    failing_adapter.cancel_background_tasks.assert_awaited_once()
    failing_adapter.disconnect.assert_awaited_once()
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_teardown_profile_suffix_in_logs(bare_runner, caplog):
    """Teardown for a secondary (multiplex) profile names that profile in its logs."""
    adapter = MagicMock()
    for method_name in ("cancel_background_tasks", "disconnect"):
        # Each adapter call completes immediately and resolves to None.
        setattr(adapter, method_name, AsyncMock(return_value=None))

    with caplog.at_level(logging.INFO, logger="gateway.run"):
        teardown = bare_runner._bounded_adapter_teardown(
            adapter, Platform.TELEGRAM, profile="acct2"
        )
        await teardown

    assert "(profile: acct2)" in caplog.text
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_teardown_timeout_zero_disables_bound(bare_runner, monkeypatch):
    """With the timeout set to 0 the helper still awaits both adapter calls."""
    # A value of "0" is the setting that turns the wait_for bound off.
    monkeypatch.setenv("HERMES_GATEWAY_ADAPTER_DISCONNECT_TIMEOUT", "0")

    adapter = MagicMock()
    adapter.disconnect = AsyncMock(return_value=None)
    adapter.cancel_background_tasks = AsyncMock(return_value=None)

    await bare_runner._bounded_adapter_teardown(adapter, Platform.TELEGRAM)

    adapter.cancel_background_tasks.assert_awaited_once()
    adapter.disconnect.assert_awaited_once()
|