hermes-agent/tests/gateway/test_telegram_pending_update_probe.py
teknium1 d5ba374c03 fix(telegram): detect wedged getUpdates consumer via pending_update_count
The merged CLOSE-WAIT heartbeat (#52744) only probes get_me(), which uses the
general request path and stays healthy while PTB's getUpdates consumer is
silently wedged (updater.running=True but the long-poll task is stuck, observed
on WSL2). DMs then queue in the Bot API and never reach handlers (#42909).

Augment the existing _polling_heartbeat_loop to also probe
get_webhook_info().pending_update_count. After two consecutive probes that see a
non-draining queue while the updater claims to be running, escalate into the
existing _handle_polling_network_error recovery ladder — no new restart
machinery. No-ops in webhook mode, when the updater is not running, or when a
reconnect is already in flight.

Credit to @gazzumatteo, whose PR #42959 identified the pending_update_count
signal as the missing liveness probe. This reuses the existing heartbeat +
recovery path rather than adding a parallel watchdog.

Fixes #42909.
2026-06-28 02:44:17 -07:00

117 lines
4.5 KiB
Python

"""TelegramAdapter wedged-getUpdates detection via pending_update_count.
PTB can report ``updater.running == True`` while its long-poll consumer is
silently stuck (observed on WSL2), so DMs queue in the Bot API and never reach
handlers (#42909). ``get_me()`` uses the general request path and stays healthy,
so the CLOSE-WAIT heartbeat, which only probes ``get_me()``, cannot see the
wedge. ``_probe_pending_updates`` watches
``get_webhook_info().pending_update_count`` and escalates to the existing
network-error recovery ladder after two consecutive stuck probes.
"""
import sys
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from gateway.config import PlatformConfig
def _ensure_telegram_mock():
    """Register a ``MagicMock`` stand-in for the ``telegram`` package.

    Does nothing when ``sys.modules["telegram"]`` already holds an object that
    exposes ``__file__``. Otherwise one shared mock is registered (via
    ``setdefault``, so existing entries are never overwritten) under
    ``telegram``, ``telegram.ext``, ``telegram.constants`` and
    ``telegram.request``, and its ``error`` attribute under ``telegram.error``.
    """
    # ``hasattr(None, "__file__")`` is False, so a missing entry falls through.
    if hasattr(sys.modules.get("telegram"), "__file__"):
        return

    stub = MagicMock()

    # Exception stand-ins: stub class name -> base class it derives from.
    error_bases = {
        "NetworkError": OSError,
        "TimedOut": OSError,
        "BadRequest": Exception,
    }
    for exc_name, base in error_bases.items():
        setattr(stub.error, exc_name, type(exc_name, (base,), {}))

    shared_names = (
        "telegram",
        "telegram.ext",
        "telegram.constants",
        "telegram.request",
    )
    for module_name in shared_names:
        sys.modules.setdefault(module_name, stub)
    sys.modules.setdefault("telegram.error", stub.error)
# Install the stub before the adapter import below. Presumably the adapter
# module imports ``telegram`` at import time, which is why this import cannot
# sit at the top of the file (hence the E402 suppression) -- TODO confirm.
_ensure_telegram_mock()
from plugins.platforms.telegram.adapter import TelegramAdapter # noqa: E402
def _make_adapter(*, pending: int) -> TelegramAdapter:
    """Return an adapter wired to mocks, with webhook mode switched off.

    The mocked bot's ``get_webhook_info`` is an ``AsyncMock`` whose result
    carries ``pending_update_count == pending``. The same bot object is
    exposed as both ``adapter._app.bot`` and ``adapter._bot``, and the mocked
    updater reports ``running = True``.
    """
    adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
    adapter._webhook_mode = False

    webhook_info = MagicMock(pending_update_count=pending)
    fake_bot = MagicMock()
    fake_bot.get_webhook_info = AsyncMock(return_value=webhook_info)

    fake_app = MagicMock()
    fake_app.updater.running = True
    fake_app.bot = fake_bot

    adapter._app = fake_app
    adapter._bot = fake_bot
    return adapter
@pytest.mark.asyncio
async def test_single_stuck_probe_does_not_escalate():
    """A lone probe that sees queued updates sets the counter to 1, no recovery."""
    adapter = _make_adapter(pending=3)
    recovery = AsyncMock()

    with patch.object(adapter, "_handle_polling_network_error", new=recovery):
        await adapter._probe_pending_updates(adapter._app.bot, 5)

    assert recovery.call_count == 0
    assert adapter._polling_pending_stuck_count == 1
@pytest.mark.asyncio
async def test_two_consecutive_stuck_probes_trigger_recovery():
    """Two stuck probes in a row hand off to ``_handle_polling_network_error``."""
    adapter = _make_adapter(pending=2)
    bot = adapter._app.bot
    ladder = AsyncMock()

    with patch.object(adapter, "_handle_polling_network_error", new=ladder):
        # After the first probe the counter must read 1.
        await adapter._probe_pending_updates(bot, 5)
        assert adapter._polling_pending_stuck_count == 1

        # The second probe must leave a task in ``_polling_error_task``;
        # await it here so it completes while the patch is still active.
        await adapter._probe_pending_updates(bot, 5)
        scheduled = adapter._polling_error_task
        assert scheduled is not None
        await scheduled

    ladder.assert_awaited_once()
    # Once escalation has happened the counter is back at zero, so a later
    # wedge is counted from scratch.
    assert adapter._polling_pending_stuck_count == 0
@pytest.mark.asyncio
async def test_zero_pending_resets_counter():
    """With nothing pending, an earlier stuck count drops to zero, no recovery."""
    adapter = _make_adapter(pending=0)
    adapter._polling_pending_stuck_count = 1  # seed a prior stuck observation
    recovery = AsyncMock()

    with patch.object(adapter, "_handle_polling_network_error", new=recovery):
        await adapter._probe_pending_updates(adapter._app.bot, 5)

    recovery.assert_not_called()
    assert adapter._polling_pending_stuck_count == 0
@pytest.mark.asyncio
async def test_webhook_mode_is_noop():
    """In webhook mode the probe never calls ``get_webhook_info``.

    The stuck counter must also read zero afterwards.
    """
    adapter = _make_adapter(pending=9)
    adapter._webhook_mode = True
    bot = adapter._app.bot

    await adapter._probe_pending_updates(bot, 5)

    assert not bot.get_webhook_info.called
    assert adapter._polling_pending_stuck_count == 0
@pytest.mark.asyncio
async def test_no_probe_when_updater_not_running():
    """A stopped updater means no ``get_webhook_info`` call and a zeroed counter."""
    adapter = _make_adapter(pending=9)
    adapter._polling_pending_stuck_count = 1  # seed a prior stuck observation
    updater = adapter._app.updater
    updater.running = False
    bot = adapter._app.bot

    await adapter._probe_pending_updates(bot, 5)

    assert adapter._polling_pending_stuck_count == 0
    bot.get_webhook_info.assert_not_called()
@pytest.mark.asyncio
async def test_reconnect_in_flight_skips_probe():
    """While ``_polling_error_task`` is not done, the probe must not query the bot."""
    adapter = _make_adapter(pending=9)
    # Stand-in task whose ``done()`` reports False, i.e. still running.
    adapter._polling_error_task = MagicMock(**{"done.return_value": False})
    bot = adapter._app.bot

    await adapter._probe_pending_updates(bot, 5)

    assert bot.get_webhook_info.call_count == 0