hermes-agent/tests/gateway/test_runner_fatal_adapter.py
teknium1 43b8ba4181 fix(telegram): preserve Bot API update queue on watcher reconnect
After a prolonged outage the in-process network-error ladder escalates to
fatal and GatewayRunner._platform_reconnect_watcher rebuilds a fresh adapter
that reconnects through the bootstrap path. That path called
start_polling(drop_pending_updates=True), discarding every update Telegram
queued during the outage — all messages sent while the bot was down were
silently lost. The in-process ladder and 409-conflict handler already passed
drop_pending_updates=False; only bootstrap did not distinguish a cold first
boot from a reconnect.

Thread an is_reconnect signal from the watcher through
_connect_adapter_with_timeout into adapter.connect(). The base
BasePlatformAdapter.connect() gains a keyword-only is_reconnect=False so every
adapter inherits a tolerant signature (no per-platform breakage when the
runner forwards the kwarg). Telegram translates is_reconnect into
drop_pending_updates=not is_reconnect on both the polling and webhook bootstrap
calls. Cold boot still drops the stale queue; a watcher reconnect preserves it.

Fixes #46621.

Co-authored-by: annguyenNous <annguyen@nousresearch.com>
Co-authored-by: kyssta-exe <kyssta-exe@users.noreply.github.com>
Co-authored-by: Kewe63 <Kewe63@users.noreply.github.com>
2026-06-25 21:29:57 -07:00

100 lines
3.3 KiB
Python

from unittest.mock import AsyncMock
import pytest
from gateway.config import GatewayConfig, Platform, PlatformConfig
from gateway.platforms.base import BasePlatformAdapter
from gateway.run import GatewayRunner
class _FatalAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="token"), Platform.TELEGRAM)
async def connect(self, *, is_reconnect: bool = False) -> bool:
self._set_fatal_error(
"telegram_token_lock",
"Another local Hermes gateway is already using this Telegram bot token.",
retryable=False,
)
return False
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
class _RuntimeRetryableAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="token"), Platform.WHATSAPP)
async def connect(self, *, is_reconnect: bool = False) -> bool:
return True
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
@pytest.mark.asyncio
async def test_runner_requests_clean_exit_for_nonretryable_startup_conflict(monkeypatch, tmp_path):
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=True, token="token")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _FatalAdapter())
ok = await runner.start()
assert ok is True
assert runner.should_exit_cleanly is True
assert "already using this Telegram bot token" in runner.exit_reason
@pytest.mark.asyncio
async def test_runner_queues_retryable_runtime_fatal_for_reconnection(monkeypatch, tmp_path):
"""Retryable runtime fatal errors queue the platform for reconnection
AND keep the gateway alive — the background reconnect watcher recovers
the platform when the underlying issue clears. (Previously this
exited-with-failure to trigger a systemd restart; that converted
transient failures into infinite restart loops.)
"""
config = GatewayConfig(
platforms={
Platform.WHATSAPP: PlatformConfig(enabled=True, token="token")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
adapter = _RuntimeRetryableAdapter()
adapter._set_fatal_error(
"whatsapp_bridge_exited",
"WhatsApp bridge process exited unexpectedly (code 1).",
retryable=True,
)
runner.adapters = {Platform.WHATSAPP: adapter}
runner.delivery_router.adapters = runner.adapters
runner.stop = AsyncMock()
await runner._handle_adapter_fatal_error(adapter)
# Gateway stays alive — watcher will retry in background
runner.stop.assert_not_awaited()
assert runner._exit_with_failure is False
assert Platform.WHATSAPP in runner._failed_platforms
assert runner._failed_platforms[Platform.WHATSAPP]["attempts"] == 0