mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-03 12:23:08 +00:00
After a prolonged outage the in-process network-error ladder escalates to fatal and GatewayRunner._platform_reconnect_watcher rebuilds a fresh adapter that reconnects through the bootstrap path. That path called start_polling(drop_pending_updates=True), discarding every update Telegram queued during the outage — all messages sent while the bot was down were silently lost. The in-process ladder and 409-conflict handler already passed drop_pending_updates=False; only bootstrap did not distinguish a cold first boot from a reconnect. Thread an is_reconnect signal from the watcher through _connect_adapter_with_timeout into adapter.connect(). The base BasePlatformAdapter.connect() gains a keyword-only is_reconnect=False so every adapter inherits a tolerant signature (no per-platform breakage when the runner forwards the kwarg). Telegram translates is_reconnect into drop_pending_updates=not is_reconnect on both the polling and webhook bootstrap calls. Cold boot still drops the stale queue; a watcher reconnect preserves it. Fixes #46621. Co-authored-by: annguyenNous <annguyen@nousresearch.com> Co-authored-by: kyssta-exe <kyssta-exe@users.noreply.github.com> Co-authored-by: Kewe63 <Kewe63@users.noreply.github.com>
350 lines
15 KiB
Python
350 lines
15 KiB
Python
"""Regression tests for the gateway platform fd-leak fix (#37011).
|
|
|
|
Without an explicit ``disconnect()`` on adapters that fail to connect in
|
|
the reconnect watcher, every retry leaks the resources the adapter
|
|
opened in ``__init__`` — for ``APIServerAdapter`` that means 2 file
|
|
descriptors per attempt (the SQLite ``response_store.db`` and its WAL
|
|
sidecar). At the 300s backoff cap that's ~12 fds/hour; the default
|
|
2560-fd ulimit is exhausted in ~12h of continuous failure, after which
|
|
the gateway raises ``OSError: [Errno 24] Too many open files`` on
|
|
every ``open()`` and becomes a zombie.
|
|
|
|
These tests pin all three failure paths in
|
|
``_platform_reconnect_watcher`` (non-retryable error, retryable error,
|
|
exception during connect) to call ``adapter.disconnect()`` on the
|
|
unowned adapter, plus the path-level ``APIServerAdapter.disconnect()``
|
|
behavior of also closing the ``ResponseStore``. The pre-fix
|
|
implementation did not call ``disconnect()`` on any of these paths;
|
|
this file would have caught the regression and now pins the fix.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import time
|
|
from typing import Any
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
|
from gateway.platforms.api_server import APIServerAdapter, ResponseStore
|
|
from gateway.platforms.base import BasePlatformAdapter, SendResult
|
|
from gateway.run import GatewayRunner, _dispose_unused_adapter
|
|
|
|
|
|
def _make_runner() -> GatewayRunner:
|
|
"""Create a minimal GatewayRunner via object.__new__ to skip __init__.
|
|
|
|
Mirrors the helper in test_platform_reconnect.py so this file
|
|
is drop-in compatible with the existing reconnect test suite.
|
|
"""
|
|
runner = object.__new__(GatewayRunner)
|
|
runner.config = GatewayConfig(
|
|
platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="test")}
|
|
)
|
|
runner._running = True
|
|
runner._shutdown_event = asyncio.Event()
|
|
runner._exit_reason = None
|
|
runner._exit_with_failure = False
|
|
runner._exit_cleanly = False
|
|
runner._failed_platforms = {}
|
|
runner.adapters = {}
|
|
return runner
|
|
|
|
|
|
async def _run_watcher_one_iteration(runner: GatewayRunner) -> None:
|
|
"""Drive ``_platform_reconnect_watcher`` for exactly one retry pass.
|
|
|
|
Patches ``asyncio.sleep`` to advance the watcher's internal
|
|
``await asyncio.sleep(10)`` initial delay and the 1-second inner
|
|
sleeps without actually waiting. Mirrors the pattern used in
|
|
``test_platform_reconnect.py::TestPlatformReconnectWatcher``.
|
|
"""
|
|
real_sleep = asyncio.sleep
|
|
call_count = 0
|
|
|
|
async def fake_sleep(_n: float) -> None:
|
|
nonlocal call_count
|
|
call_count += 1
|
|
if call_count > 2:
|
|
# Two sleeps is enough to get past the initial 10s wait
|
|
# and the first inner-tick check. After that, stop the
|
|
# watcher so the test returns.
|
|
runner._running = False
|
|
await real_sleep(0)
|
|
|
|
with patch("asyncio.sleep", side_effect=fake_sleep):
|
|
await runner._platform_reconnect_watcher()
|
|
|
|
|
|
class _CountingAdapter(BasePlatformAdapter):
|
|
"""Adapter that records every disconnect() call for fd-leak assertions.
|
|
|
|
The base ``BasePlatformAdapter.disconnect()`` is a no-op by default
|
|
for stub adapters, which is why the pre-fix reconnect watcher
|
|
silently leaked: the would-be dispose calls were happening on
|
|
objects that did nothing on disconnect. This stub mimics the real
|
|
``APIServerAdapter`` shape — every constructor call opens 2 fds
|
|
(the SQLite db + WAL), and every disconnect() must close them.
|
|
"""
|
|
|
|
def __init__(self, *, succeed: bool = False, fatal_error: str | None = None,
|
|
fatal_retryable: bool = True, raise_during_connect: bool = False):
|
|
super().__init__(PlatformConfig(enabled=True, token="t"), Platform.TELEGRAM)
|
|
# 2 fds to track: the canonical "ResponseStore" pair. The
|
|
# reconnect watcher should call disconnect() once per
|
|
# construction; otherwise these stay open and contribute to
|
|
# the gateway-wide fd count.
|
|
self._open_fds = 2
|
|
self._disconnect_calls = 0
|
|
self._succeed = succeed
|
|
self._fatal_error = fatal_error
|
|
self._fatal_retryable = fatal_retryable
|
|
self._raise_during_connect = raise_during_connect
|
|
|
|
async def connect(self, *, is_reconnect: bool = False) -> bool:
|
|
if self._raise_during_connect:
|
|
raise RuntimeError("simulated connect exception")
|
|
if self._fatal_error:
|
|
self._set_fatal_error(
|
|
"test_code", self._fatal_error, retryable=self._fatal_retryable,
|
|
)
|
|
return False
|
|
return self._succeed
|
|
|
|
async def disconnect(self) -> None:
|
|
self._disconnect_calls += 1
|
|
self._open_fds = 0 # fd release on dispose
|
|
|
|
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
|
return SendResult(success=True, message_id="1")
|
|
|
|
async def send_typing(self, chat_id, metadata=None):
|
|
return None
|
|
|
|
async def get_chat_info(self, chat_id):
|
|
return {"id": chat_id}
|
|
|
|
|
|
def _seed_runner_with_one_failure(runner: GatewayRunner) -> None:
|
|
"""Queue a single platform for the reconnect watcher to pick up."""
|
|
runner._failed_platforms[Platform.TELEGRAM] = {
|
|
"config": PlatformConfig(enabled=True, token="t"),
|
|
"attempts": 0,
|
|
"next_retry": time.monotonic() - 1, # eligible immediately
|
|
}
|
|
|
|
|
|
class TestReconnectFDLeakRegression:
|
|
"""All three reconnect failure paths must dispose the unowned adapter.
|
|
|
|
The pre-fix implementation constructed a fresh adapter on every
|
|
retry and dropped it on the floor when connect() failed. That leaks
|
|
2 fds per retry (for ``APIServerAdapter``) at the 300s backoff cap,
|
|
exhausting the 2560-fd ulimit in ~12h of continuous failure (#37011).
|
|
"""
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_nonretryable_failure_disposes_unowned_adapter(self):
|
|
"""A fatal error (bad auth, etc.) must call disconnect() exactly once.
|
|
|
|
The adapter failed to connect and is being removed from the
|
|
retry queue. Nothing else owns it, so the watcher is the only
|
|
code with a chance to call disconnect() — and disconnect() is
|
|
the only place the SQLite fds get closed. One retry, one
|
|
dispose, no leak.
|
|
"""
|
|
runner = _make_runner()
|
|
_seed_runner_with_one_failure(runner)
|
|
adapter = _CountingAdapter(
|
|
succeed=False, fatal_error="bad token", fatal_retryable=False,
|
|
)
|
|
with patch.object(runner, "_create_adapter", return_value=adapter), \
|
|
patch.object(runner, "_connect_adapter_with_timeout",
|
|
new=AsyncMock(return_value=False)):
|
|
await _run_watcher_one_iteration(runner)
|
|
|
|
# The intent of this test is "the watcher calls disconnect()
|
|
# exactly once on the unowned adapter" — not "at least once".
|
|
# An accidental double-dispose would be a new bug to catch
|
|
# (e.g. the watcher's two failure paths both calling dispose
|
|
# for the same adapter instance). Tighten to == 1.
|
|
assert adapter._disconnect_calls == 1, (
|
|
f"non-retryable reconnect failure must call adapter.disconnect() "
|
|
f"exactly once; got {adapter._disconnect_calls} calls. "
|
|
"Without it, 2 fds leak per retry at the 300s backoff cap "
|
|
"(#37011). More than one call would also be a bug — the "
|
|
"adapter has already been disposed once, a second call is "
|
|
"wasted work and may itself raise."
|
|
)
|
|
assert adapter._open_fds == 0, (
|
|
f"adapter fds not released after disconnect(); "
|
|
f"{adapter._open_fds} still open. This is the fd leak #37011."
|
|
)
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_retryable_failure_disposes_unowned_adapter(self):
|
|
"""A retryable failure (network blip) must also call disconnect().
|
|
|
|
This is the path that fires most often in production: a
|
|
transient DNS resolution failure or upstream outage, which
|
|
back-offs to 300s and retries indefinitely. The watcher
|
|
tracks ``info["attempts"]`` and reschedules, but the failed
|
|
adapter is still dropped on the floor without dispose.
|
|
"""
|
|
runner = _make_runner()
|
|
_seed_runner_with_one_failure(runner)
|
|
adapter = _CountingAdapter(
|
|
succeed=False, fatal_error="dns timeout", fatal_retryable=True,
|
|
)
|
|
with patch.object(runner, "_create_adapter", return_value=adapter), \
|
|
patch.object(runner, "_connect_adapter_with_timeout",
|
|
new=AsyncMock(return_value=False)):
|
|
await _run_watcher_one_iteration(runner)
|
|
|
|
assert adapter._disconnect_calls >= 1, (
|
|
f"retryable reconnect failure must call adapter.disconnect(); "
|
|
f"got {adapter._disconnect_calls} calls. This is the hot path "
|
|
"for the fd leak in #37011."
|
|
)
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_exception_during_connect_disposes_unowned_adapter(self):
|
|
"""An exception escaping connect() (aiohttp start crash, etc.) disposes.
|
|
|
|
The ``except Exception`` arm in the watcher used to skip the
|
|
dispose call entirely. Pre-fix, this leaked the same 2 fds
|
|
per retry as the other two branches.
|
|
"""
|
|
runner = _make_runner()
|
|
_seed_runner_with_one_failure(runner)
|
|
adapter = _CountingAdapter(raise_during_connect=True)
|
|
with patch.object(runner, "_create_adapter", return_value=adapter), \
|
|
patch.object(runner, "_connect_adapter_with_timeout",
|
|
new=AsyncMock(side_effect=RuntimeError("boom"))):
|
|
await _run_watcher_one_iteration(runner)
|
|
|
|
assert adapter._disconnect_calls >= 1, (
|
|
f"exception-during-connect must call adapter.disconnect(); "
|
|
f"got {adapter._disconnect_calls} calls. The except-arm of the "
|
|
"reconnect watcher is one of the three leak paths in #37011."
|
|
)
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_dispose_helper_handles_none(self):
|
|
"""``_dispose_unused_adapter(None)`` is a no-op (defensive)."""
|
|
await _dispose_unused_adapter(None) # must not raise
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_dispose_helper_swallows_disconnect_exception(self):
|
|
"""A disconnect() that itself raises must not abort the watcher loop.
|
|
|
|
Half-constructed adapters can raise from disconnect() because
|
|
some of their __init__ state is missing. The watcher loop
|
|
would then die and stop retrying, masking the original
|
|
configuration error as a hard crash.
|
|
"""
|
|
disconnect_calls = 0
|
|
|
|
class _RaisingAdapter(BasePlatformAdapter):
|
|
def __init__(self):
|
|
super().__init__(
|
|
PlatformConfig(enabled=True, token="t"),
|
|
Platform.TELEGRAM,
|
|
)
|
|
|
|
async def connect(self, *, is_reconnect: bool = False) -> bool:
|
|
return True
|
|
|
|
async def disconnect(self) -> None:
|
|
nonlocal disconnect_calls
|
|
disconnect_calls += 1
|
|
raise RuntimeError("half-constructed; aiohttp app never started")
|
|
|
|
async def send(self, chat_id, content, reply_to=None, metadata=None):
|
|
return SendResult(success=True, message_id="1")
|
|
|
|
async def send_typing(self, chat_id, metadata=None):
|
|
return None
|
|
|
|
async def get_chat_info(self, chat_id):
|
|
return {"id": chat_id}
|
|
|
|
await _dispose_unused_adapter(_RaisingAdapter()) # must not raise
|
|
assert disconnect_calls == 1
|
|
|
|
|
|
class TestAPIServerDisconnectClosesResponseStore:
|
|
"""The platform-level fix: ``APIServerAdapter.disconnect()`` must close its ResponseStore.
|
|
|
|
Without this, the reconnect watcher's dispose call (see the
|
|
test class above) is a no-op for ``APIServerAdapter`` — the
|
|
aiohttp web server stops, but the SQLite ``ResponseStore``
|
|
connection stays open. The DB file plus its WAL sidecar = 2 fds,
|
|
which is the headline leak in #37011.
|
|
"""
|
|
|
|
def _build_adapter_with_store(self, store: ResponseStore) -> APIServerAdapter:
|
|
"""Build an APIServerAdapter with the required internal state.
|
|
|
|
We bypass ``__init__`` (which would try to start aiohttp
|
|
immediately) and set just the fields ``disconnect()`` reads.
|
|
"""
|
|
adapter = APIServerAdapter.__new__(APIServerAdapter)
|
|
adapter._mark_disconnected = lambda: None # type: ignore[method-assign]
|
|
adapter._site = None
|
|
adapter._runner = None
|
|
adapter._app = None
|
|
adapter._response_store = store
|
|
adapter.platform = Platform.API_SERVER
|
|
return adapter
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_disconnect_closes_response_store(self, tmp_path):
|
|
"""Closing the adapter's ResponseStore releases its SQLite connection.
|
|
|
|
We point the ``ResponseStore`` at a tmp db so we can verify
|
|
its ``close()`` is called by ``APIServerAdapter.disconnect()``.
|
|
The real ``ResponseStore.__init__`` opens a SQLite connection
|
|
to ``~/.hermes/response_store.db`` (or :memory: as a fallback),
|
|
which is exactly the resource that was leaking pre-fix.
|
|
"""
|
|
import sqlite3
|
|
|
|
store = ResponseStore(max_size=10, db_path=str(tmp_path / "rs.db"))
|
|
adapter = self._build_adapter_with_store(store)
|
|
|
|
await adapter.disconnect()
|
|
|
|
# Post-disconnect, the underlying sqlite3 conn should be closed.
|
|
# sqlite3 raises ``ProgrammingError: Cannot operate on a closed
|
|
# database`` for any further operation. We assert on the
|
|
# specific exception type (not bare ``Exception``) so the test
|
|
# only passes when the close actually took effect — a generic
|
|
# ``Exception`` catcher would mask unrelated failures (env
|
|
# issues, AttributeError, etc.).
|
|
with pytest.raises(sqlite3.ProgrammingError):
|
|
store._conn.execute("SELECT 1").fetchone()
|
|
|
|
@pytest.mark.asyncio
|
|
async def test_disconnect_swallows_response_store_close_exception(self, tmp_path):
|
|
"""A misbehaving ResponseStore.close() must not abort adapter shutdown.
|
|
|
|
Real-world failure mode: the SQLite file was unlinked out
|
|
from under us (operator rm'd ``response_store.db`` during a
|
|
disk pressure event). ``close()`` raises. The watcher must
|
|
continue with the aiohttp shutdown, not bail.
|
|
"""
|
|
store = ResponseStore(max_size=10, db_path=str(tmp_path / "rs.db"))
|
|
|
|
def _boom() -> None:
|
|
raise RuntimeError("sqlite file vanished")
|
|
|
|
store.close = _boom # type: ignore[method-assign]
|
|
adapter = self._build_adapter_with_store(store)
|
|
|
|
# Must not raise — disconnect() swallows the close error and
|
|
# continues to the aiohttp teardown (no-op here since we
|
|
# bypassed __init__).
|
|
await adapter.disconnect()
|