hermes-agent/tests/gateway/test_telegram_network_reconnect.py
Siwen Wang d6137453ac fix(gateway): drain stale httpx polling connections on Telegram reconnect
Network errors through proxies (e.g. sing-box) can leave httpx
connections in a half-closed state occupying pool slots.  After enough
reconnect cycles the 256-connection default fills up entirely, causing
Pool timeout: All connections in the connection pool are occupied.

Fix: cycle only the getUpdates request object (_request[0]) via
shut-down + re-initialize before restarting polling.  This drains stale
connections without touching the general request (_request[1]) that
concurrent send_message / edit_message calls rely on.

The drain is applied to both _handle_polling_network_error and
_handle_polling_conflict reconnect paths via a shared
_drain_polling_connections() helper.  Failures in the drain are
swallowed so reconnect always proceeds.

Based on #16466 by @Mirac1eSky.
2026-04-28 06:37:22 -07:00

286 lines
9.6 KiB
Python

"""
Tests for Telegram polling network error recovery.
Specifically tests the fix for #3173 — when start_polling() fails after a
network error, the adapter must self-reschedule the next reconnect attempt
rather than silently leaving polling dead.
"""
import asyncio
import sys
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from gateway.config import PlatformConfig
def _ensure_telegram_mock() -> None:
    """Register a ``MagicMock`` stand-in for the ``telegram`` package.

    Returns without touching anything when ``sys.modules`` already holds a
    ``telegram`` entry that exposes ``__file__``.  Otherwise a single mock is
    created, given the constants set below, and registered under
    ``telegram``, ``telegram.ext``, ``telegram.constants`` and
    ``telegram.request``.  Registration uses ``setdefault``, so any entry
    that already exists under one of those names is left in place.
    """
    existing = sys.modules.get("telegram")
    if existing is not None and hasattr(existing, "__file__"):
        return

    telegram_mod = MagicMock()
    telegram_mod.ext.ContextTypes.DEFAULT_TYPE = type(None)
    telegram_mod.constants.ParseMode.MARKDOWN_V2 = "MarkdownV2"
    telegram_mod.constants.ChatType.GROUP = "group"
    telegram_mod.constants.ChatType.SUPERGROUP = "supergroup"
    telegram_mod.constants.ChatType.CHANNEL = "channel"
    telegram_mod.constants.ChatType.PRIVATE = "private"

    # The same mock object backs the package and each listed submodule name.
    for name in ("telegram", "telegram.ext", "telegram.constants", "telegram.request"):
        sys.modules.setdefault(name, telegram_mod)
_ensure_telegram_mock()
from gateway.platforms.telegram import TelegramAdapter # noqa: E402
@pytest.fixture(autouse=True)
def _no_auto_discovery(monkeypatch):
    """Disable DoH auto-discovery so connect() uses the plain builder chain."""

    async def _noop(*_args, **_kwargs):
        # Stand-in coroutine: ignores whatever it is called with and reports
        # an empty result list.
        return []

    monkeypatch.setattr("gateway.platforms.telegram.discover_fallback_ips", _noop)
def _make_adapter() -> TelegramAdapter:
    """Return a TelegramAdapter built from an enabled config with a dummy token."""
    return TelegramAdapter(PlatformConfig(enabled=True, token="test-token"))
@pytest.mark.asyncio
async def test_reconnect_self_schedules_on_start_polling_failure() -> None:
    """
    When start_polling() raises during a network error retry, the adapter must
    schedule a new _handle_polling_network_error task — otherwise polling stays
    dead with no further error callbacks to trigger recovery.

    Regression test for #3173: gateway becomes unresponsive after Telegram 502.
    """
    adapter = _make_adapter()
    adapter._polling_network_error_count = 1

    mock_updater = MagicMock()
    mock_updater.running = True
    mock_updater.stop = AsyncMock()
    mock_updater.start_polling = AsyncMock(side_effect=Exception("Timed out"))

    mock_app = MagicMock()
    mock_app.updater = mock_updater
    adapter._app = mock_app

    with patch("asyncio.sleep", new_callable=AsyncMock):
        await adapter._handle_polling_network_error(Exception("Bad Gateway"))

    # A retry task must have been added to _background_tasks
    pending = [t for t in adapter._background_tasks if not t.done()]
    assert len(pending) >= 1, (
        "Expected at least one self-rescheduled retry task in _background_tasks "
        f"after start_polling failure, got {len(pending)}"
    )

    # Clean up — cancel the pending retries so they don't run after the test.
    # return_exceptions=True makes gather hand back each task's CancelledError
    # (or any other exception) as a result instead of raising it here.
    for task in pending:
        task.cancel()
    await asyncio.gather(*pending, return_exceptions=True)
@pytest.mark.asyncio
async def test_reconnect_does_not_self_schedule_when_fatal_error_set() -> None:
    """
    When a fatal error is already set, the failed reconnect should NOT create
    another retry task — the gateway is already shutting down this adapter.
    """
    adapter = _make_adapter()
    adapter._polling_network_error_count = 1
    adapter._set_fatal_error("telegram_network_error", "already fatal", retryable=True)

    mock_updater = MagicMock()
    mock_updater.running = True
    mock_updater.stop = AsyncMock()
    mock_updater.start_polling = AsyncMock(side_effect=Exception("Timed out"))

    mock_app = MagicMock()
    mock_app.updater = mock_updater
    adapter._app = mock_app

    # Snapshot the task count so the check below only sees tasks added by
    # this reconnect attempt.
    initial_count = len(adapter._background_tasks)

    with patch("asyncio.sleep", new_callable=AsyncMock):
        await adapter._handle_polling_network_error(Exception("Timed out"))

    assert len(adapter._background_tasks) == initial_count, (
        "Should not schedule a retry when a fatal error is already set"
    )
@pytest.mark.asyncio
async def test_reconnect_success_resets_error_count() -> None:
    """
    When start_polling() succeeds, _polling_network_error_count should reset to 0.
    """
    adapter = _make_adapter()
    adapter._polling_network_error_count = 3

    mock_updater = MagicMock()
    mock_updater.running = True
    mock_updater.stop = AsyncMock()
    mock_updater.start_polling = AsyncMock()  # succeeds

    mock_app = MagicMock()
    mock_app.updater = mock_updater
    adapter._app = mock_app

    with patch("asyncio.sleep", new_callable=AsyncMock):
        await adapter._handle_polling_network_error(Exception("Bad Gateway"))

    assert adapter._polling_network_error_count == 0
@pytest.mark.asyncio
async def test_reconnect_triggers_fatal_after_max_retries() -> None:
    """
    After MAX_NETWORK_RETRIES attempts, the adapter should set a fatal error
    rather than retrying forever.
    """
    adapter = _make_adapter()
    adapter._polling_network_error_count = 10  # MAX_NETWORK_RETRIES

    fatal_handler = AsyncMock()
    adapter.set_fatal_error_handler(fatal_handler)

    mock_app = MagicMock()
    adapter._app = mock_app

    await adapter._handle_polling_network_error(Exception("still failing"))

    assert adapter.has_fatal_error
    assert adapter.fatal_error_code == "telegram_network_error"
    fatal_handler.assert_called_once()
# ---------------------------------------------------------------------------
# Connection pool drain tests (PR #16466 salvage)
# ---------------------------------------------------------------------------
def _make_mock_app():
    """Build a mock Application with an explicit polling request object.

    Returns:
        A ``(mock_app, mock_polling_req)`` pair.  ``mock_app.bot._request`` is
        a two-item tuple whose first element is ``mock_polling_req`` (an
        ``AsyncMock`` with ``shutdown`` / ``initialize``) and whose second
        element is a separate ``MagicMock``.  ``mock_app.updater`` reports
        ``running = True`` and has awaitable ``stop`` / ``start_polling``.
    """
    mock_polling_req = AsyncMock()
    mock_polling_req.shutdown = AsyncMock()
    mock_polling_req.initialize = AsyncMock()

    mock_bot = MagicMock()
    mock_bot._request = (mock_polling_req, MagicMock())  # (getUpdates, general)

    mock_updater = MagicMock()
    mock_updater.running = True
    mock_updater.stop = AsyncMock()
    mock_updater.start_polling = AsyncMock()

    mock_app = MagicMock()
    mock_app.updater = mock_updater
    mock_app.bot = mock_bot
    return mock_app, mock_polling_req
@pytest.mark.asyncio
async def test_reconnect_drains_polling_request_only() -> None:
    """During reconnect, only the polling request (_request[0]) must be cycled.

    The general request (_request[1]) must NOT be touched — doing so would
    break concurrent send_message / edit_message calls.
    """
    adapter = _make_adapter()
    adapter._polling_network_error_count = 1

    mock_app, mock_polling_req = _make_mock_app()
    adapter._app = mock_app
    general_req = mock_app.bot._request[1]

    with patch("asyncio.sleep", new_callable=AsyncMock):
        await adapter._handle_polling_network_error(Exception("Bad Gateway"))

    # Polling request must be shut down and re-initialized
    mock_polling_req.shutdown.assert_called_once()
    mock_polling_req.initialize.assert_called_once()

    # General request must NOT be touched
    general_req.shutdown.assert_not_called()
    general_req.initialize.assert_not_called()

    # Reconnect must still succeed
    mock_app.updater.start_polling.assert_called_once()
    assert adapter._polling_network_error_count == 0
@pytest.mark.asyncio
async def test_reconnect_continues_if_drain_fails() -> None:
    """If the polling request drain raises, start_polling must still proceed."""
    adapter = _make_adapter()
    adapter._polling_network_error_count = 1

    mock_app, mock_polling_req = _make_mock_app()
    # Both shutdown and initialize fail
    mock_polling_req.shutdown = AsyncMock(side_effect=Exception("shutdown boom"))
    mock_polling_req.initialize = AsyncMock(side_effect=Exception("init boom"))
    adapter._app = mock_app

    with patch("asyncio.sleep", new_callable=AsyncMock):
        await adapter._handle_polling_network_error(Exception("Bad Gateway"))

    # start_polling must still be called despite drain failure
    mock_app.updater.start_polling.assert_called_once()
    assert adapter._polling_network_error_count == 0
@pytest.mark.asyncio
async def test_initialize_still_runs_when_shutdown_fails() -> None:
    """If shutdown() raises, initialize() must still be attempted.

    This prevents a failed shutdown from leaving the request pool in a
    permanently closed state.
    """
    adapter = _make_adapter()
    adapter._polling_network_error_count = 1

    mock_app, mock_polling_req = _make_mock_app()
    mock_polling_req.shutdown = AsyncMock(side_effect=Exception("shutdown boom"))
    adapter._app = mock_app

    with patch("asyncio.sleep", new_callable=AsyncMock):
        await adapter._handle_polling_network_error(Exception("Bad Gateway"))

    # initialize MUST be called even though shutdown raised
    mock_polling_req.initialize.assert_called_once()
    mock_app.updater.start_polling.assert_called_once()
@pytest.mark.asyncio
async def test_conflict_retry_also_drains_polling_connections() -> None:
    """_handle_polling_conflict must also drain the polling pool on retry."""
    adapter = _make_adapter()
    adapter._polling_conflict_count = 0

    mock_app, mock_polling_req = _make_mock_app()
    adapter._app = mock_app

    with patch("asyncio.sleep", new_callable=AsyncMock):
        await adapter._handle_polling_conflict(
            Exception("Conflict: terminated by other getUpdates")
        )

    # Polling request must be drained during conflict retry too
    mock_polling_req.shutdown.assert_called_once()
    mock_polling_req.initialize.assert_called_once()
    mock_app.updater.start_polling.assert_called_once()
@pytest.mark.asyncio
async def test_drain_helper_noop_without_app() -> None:
    """_drain_polling_connections must be a no-op when _app is None."""
    adapter = _make_adapter()
    adapter._app = None

    # Should not raise
    await adapter._drain_polling_connections()