hermes-agent/tests/gateway/test_telegram_conflict.py
Teknium 2e62862784
fix(telegram): use get_running_loop in polling-conflict retry reschedule (#41716)
The conflict-retry path called asyncio.get_event_loop() to reschedule
itself when a retry's start_polling raised. On Python 3.11+ (our floor)
that raises 'RuntimeError: There is no current event loop in thread
MainThread' when no loop is attached to the thread, which is what
happens when PTB dispatches this error callback. The retry never gets
scheduled, the adapter goes silent-but-alive, and gateway --replace
keeps spawning fresh instances that hit the same wall — the crash loop
reported in #19471 (worse under multi-profile, where two bots hold the
same conflict open).

We are inside a coroutine here, so asyncio.get_running_loop() is the
correct, guaranteed-valid replacement. Only get_event_loop() call in
any platform adapter, so no sibling sites.

Fixes #19471
2026-06-07 22:10:03 -07:00

400 lines
14 KiB
Python

import asyncio
import sys
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock
import pytest
from gateway.config import PlatformConfig
def _ensure_telegram_mock():
if "telegram" in sys.modules and hasattr(sys.modules["telegram"], "__file__"):
return
telegram_mod = MagicMock()
telegram_mod.ext.ContextTypes.DEFAULT_TYPE = type(None)
telegram_mod.constants.ParseMode.MARKDOWN_V2 = "MarkdownV2"
telegram_mod.constants.ChatType.GROUP = "group"
telegram_mod.constants.ChatType.SUPERGROUP = "supergroup"
telegram_mod.constants.ChatType.CHANNEL = "channel"
telegram_mod.constants.ChatType.PRIVATE = "private"
# Provide real exception classes so ``except (NetworkError, ...)`` in
# connect() doesn't blow up with "catching classes that do not inherit
# from BaseException" when another xdist worker pollutes sys.modules.
telegram_mod.error.NetworkError = type("NetworkError", (OSError,), {})
telegram_mod.error.TimedOut = type("TimedOut", (OSError,), {})
telegram_mod.error.BadRequest = type("BadRequest", (Exception,), {})
for name in ("telegram", "telegram.ext", "telegram.constants", "telegram.request"):
sys.modules.setdefault(name, telegram_mod)
sys.modules.setdefault("telegram.error", telegram_mod.error)
_ensure_telegram_mock()
from gateway.platforms.telegram import TelegramAdapter # noqa: E402
@pytest.fixture(autouse=True)
def _no_auto_discovery(monkeypatch):
"""Disable DoH auto-discovery so connect() uses the plain builder chain."""
async def _noop():
return []
monkeypatch.setattr("gateway.platforms.telegram.discover_fallback_ips", _noop)
# Mock HTTPXRequest so the builder chain doesn't fail
monkeypatch.setattr("gateway.platforms.telegram.HTTPXRequest", lambda **kwargs: MagicMock())
@pytest.mark.asyncio
async def test_connect_rejects_same_host_token_lock(monkeypatch):
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="secret-token"))
monkeypatch.setattr(
"gateway.status.acquire_scoped_lock",
lambda scope, identity, metadata=None: (False, {"pid": 4242}),
)
ok = await adapter.connect()
assert ok is False
assert adapter.fatal_error_code == "telegram-bot-token_lock"
assert adapter.has_fatal_error is True
assert "already in use" in adapter.fatal_error_message
@pytest.mark.asyncio
async def test_polling_conflict_retries_before_fatal(monkeypatch):
"""A single 409 should trigger a retry, not an immediate fatal error."""
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
fatal_handler = AsyncMock()
adapter.set_fatal_error_handler(fatal_handler)
monkeypatch.setattr(
"gateway.status.acquire_scoped_lock",
lambda scope, identity, metadata=None: (True, None),
)
monkeypatch.setattr(
"gateway.status.release_scoped_lock",
lambda scope, identity: None,
)
captured = {}
async def fake_start_polling(**kwargs):
captured["error_callback"] = kwargs["error_callback"]
updater = SimpleNamespace(
start_polling=AsyncMock(side_effect=fake_start_polling),
stop=AsyncMock(),
running=True,
)
bot = SimpleNamespace(set_my_commands=AsyncMock(), delete_webhook=AsyncMock())
app = SimpleNamespace(
bot=bot,
updater=updater,
add_handler=MagicMock(),
initialize=AsyncMock(),
start=AsyncMock(),
)
builder = MagicMock()
builder.token.return_value = builder
builder.request.return_value = builder
builder.get_updates_request.return_value = builder
builder.build.return_value = app
monkeypatch.setattr("gateway.platforms.telegram.Application", SimpleNamespace(builder=MagicMock(return_value=builder)))
# Speed up retries for testing
monkeypatch.setattr("asyncio.sleep", AsyncMock())
ok = await adapter.connect()
assert ok is True
bot.delete_webhook.assert_awaited_once_with(drop_pending_updates=False)
assert callable(captured["error_callback"])
conflict = type("Conflict", (Exception,), {})
# First conflict: should retry, NOT be fatal
captured["error_callback"](conflict("Conflict: terminated by other getUpdates request"))
await asyncio.sleep(0)
await asyncio.sleep(0)
# Give the scheduled task a chance to run
for _ in range(10):
await asyncio.sleep(0)
assert adapter.has_fatal_error is False, "First conflict should not be fatal"
assert adapter._polling_conflict_count == 0, "Count should reset after successful retry"
@pytest.mark.asyncio
async def test_polling_conflict_becomes_fatal_after_retries(monkeypatch):
"""After exhausting retries, the conflict should become fatal."""
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
fatal_handler = AsyncMock()
adapter.set_fatal_error_handler(fatal_handler)
monkeypatch.setattr(
"gateway.status.acquire_scoped_lock",
lambda scope, identity, metadata=None: (True, None),
)
monkeypatch.setattr(
"gateway.status.release_scoped_lock",
lambda scope, identity: None,
)
captured = {}
async def fake_start_polling(**kwargs):
captured["error_callback"] = kwargs["error_callback"]
# Make start_polling fail on retries to exhaust retries
call_count = {"n": 0}
async def failing_start_polling(**kwargs):
call_count["n"] += 1
if call_count["n"] == 1:
# First call (initial connect) succeeds
captured["error_callback"] = kwargs["error_callback"]
else:
# Retry calls fail
raise Exception("Connection refused")
updater = SimpleNamespace(
start_polling=AsyncMock(side_effect=failing_start_polling),
stop=AsyncMock(),
running=True,
)
bot = SimpleNamespace(set_my_commands=AsyncMock(), delete_webhook=AsyncMock())
app = SimpleNamespace(
bot=bot,
updater=updater,
add_handler=MagicMock(),
initialize=AsyncMock(),
start=AsyncMock(),
)
builder = MagicMock()
builder.token.return_value = builder
builder.request.return_value = builder
builder.get_updates_request.return_value = builder
builder.build.return_value = app
monkeypatch.setattr("gateway.platforms.telegram.Application", SimpleNamespace(builder=MagicMock(return_value=builder)))
# Speed up retries for testing
monkeypatch.setattr("asyncio.sleep", AsyncMock())
ok = await adapter.connect()
assert ok is True
conflict = type("Conflict", (Exception,), {})
# Directly call _handle_polling_conflict to avoid event-loop scheduling
# complexity. Each call simulates one 409 from Telegram.
for i in range(6):
await adapter._handle_polling_conflict(
conflict("Conflict: terminated by other getUpdates request")
)
# After 5 failed retries (count 1-5 each enter the retry branch but
# start_polling raises), the 6th conflict pushes count to 6 which
# exceeds MAX_CONFLICT_RETRIES (5), entering the fatal branch.
assert adapter.fatal_error_code == "telegram_polling_conflict", (
f"Expected fatal after 6 conflicts, got code={adapter.fatal_error_code}, "
f"count={adapter._polling_conflict_count}"
)
assert adapter.has_fatal_error is True
fatal_handler.assert_awaited_once()
@pytest.mark.asyncio
async def test_connect_marks_retryable_fatal_error_for_startup_network_failure(monkeypatch):
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
monkeypatch.setattr(
"gateway.status.acquire_scoped_lock",
lambda scope, identity, metadata=None: (True, None),
)
monkeypatch.setattr(
"gateway.status.release_scoped_lock",
lambda scope, identity: None,
)
builder = MagicMock()
builder.token.return_value = builder
builder.request.return_value = builder
builder.get_updates_request.return_value = builder
app = SimpleNamespace(
bot=SimpleNamespace(delete_webhook=AsyncMock(), set_my_commands=AsyncMock()),
updater=SimpleNamespace(),
add_handler=MagicMock(),
initialize=AsyncMock(side_effect=RuntimeError("Temporary failure in name resolution")),
start=AsyncMock(),
)
builder.build.return_value = app
monkeypatch.setattr("gateway.platforms.telegram.Application", SimpleNamespace(builder=MagicMock(return_value=builder)))
ok = await adapter.connect()
assert ok is False
assert adapter.fatal_error_code == "telegram_connect_error"
assert adapter.fatal_error_retryable is True
assert "Temporary failure in name resolution" in adapter.fatal_error_message
@pytest.mark.asyncio
async def test_connect_clears_webhook_before_polling(monkeypatch):
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
monkeypatch.setattr(
"gateway.status.acquire_scoped_lock",
lambda scope, identity, metadata=None: (True, None),
)
monkeypatch.setattr(
"gateway.status.release_scoped_lock",
lambda scope, identity: None,
)
updater = SimpleNamespace(
start_polling=AsyncMock(),
stop=AsyncMock(),
running=True,
)
bot = SimpleNamespace(
delete_webhook=AsyncMock(),
set_my_commands=AsyncMock(),
)
app = SimpleNamespace(
bot=bot,
updater=updater,
add_handler=MagicMock(),
initialize=AsyncMock(),
start=AsyncMock(),
)
builder = MagicMock()
builder.token.return_value = builder
builder.request.return_value = builder
builder.get_updates_request.return_value = builder
builder.build.return_value = app
monkeypatch.setattr(
"gateway.platforms.telegram.Application",
SimpleNamespace(builder=MagicMock(return_value=builder)),
)
ok = await adapter.connect()
assert ok is True
bot.delete_webhook.assert_awaited_once_with(drop_pending_updates=False)
@pytest.mark.asyncio
async def test_disconnect_skips_inactive_updater_and_app(monkeypatch):
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
updater = SimpleNamespace(running=False, stop=AsyncMock())
app = SimpleNamespace(
updater=updater,
running=False,
stop=AsyncMock(),
shutdown=AsyncMock(),
)
adapter._app = app
warning = MagicMock()
monkeypatch.setattr("gateway.platforms.telegram.logger.warning", warning)
await adapter.disconnect()
updater.stop.assert_not_awaited()
app.stop.assert_not_awaited()
app.shutdown.assert_awaited_once()
warning.assert_not_called()
@pytest.mark.asyncio
async def test_polling_conflict_reschedule_uses_running_loop(monkeypatch):
"""Regression for #19471.
When a conflict-retry's start_polling raises and we are still below the
retry ceiling, the handler reschedules itself via loop.create_task. The
old code used the deprecated asyncio.get_event_loop(), which raises
"RuntimeError: There is no current event loop in thread 'MainThread'" on
Python 3.11+ when no loop is attached to the thread (as happens when PTB
dispatches this error callback). That left the gateway alive but silent
and drove the --replace crash loop. The fix uses get_running_loop(), which
is always valid inside a coroutine. Force get_event_loop() to raise so a
regression would surface as the original RuntimeError, not pass silently.
"""
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
adapter.set_fatal_error_handler(AsyncMock())
monkeypatch.setattr(
"gateway.status.acquire_scoped_lock",
lambda scope, identity, metadata=None: (True, None),
)
monkeypatch.setattr(
"gateway.status.release_scoped_lock",
lambda scope, identity: None,
)
captured = {}
call_count = {"n": 0}
async def failing_start_polling(**kwargs):
call_count["n"] += 1
if call_count["n"] == 1:
captured["error_callback"] = kwargs["error_callback"]
else:
# Retry attempt fails so the handler enters the reschedule branch.
raise Exception("Connection refused")
updater = SimpleNamespace(
start_polling=AsyncMock(side_effect=failing_start_polling),
stop=AsyncMock(),
running=True,
)
bot = SimpleNamespace(set_my_commands=AsyncMock(), delete_webhook=AsyncMock())
app = SimpleNamespace(
bot=bot,
updater=updater,
add_handler=MagicMock(),
initialize=AsyncMock(),
start=AsyncMock(),
)
builder = MagicMock()
builder.token.return_value = builder
builder.request.return_value = builder
builder.get_updates_request.return_value = builder
builder.build.return_value = app
monkeypatch.setattr(
"gateway.platforms.telegram.Application",
SimpleNamespace(builder=MagicMock(return_value=builder)),
)
monkeypatch.setattr("asyncio.sleep", AsyncMock())
ok = await adapter.connect()
assert ok is True
# If the fix regresses to get_event_loop(), this makes it raise — the same
# RuntimeError users hit in #19471. The running-loop path ignores it.
def _boom():
raise RuntimeError("There is no current event loop in thread 'MainThread'.")
monkeypatch.setattr("asyncio.get_event_loop", _boom)
conflict = type("Conflict", (Exception,), {})
# One conflict: count goes to 1 (< MAX), retry's start_polling raises,
# handler reschedules via loop.create_task — the previously-broken line.
await adapter._handle_polling_conflict(
conflict("Conflict: terminated by other getUpdates request")
)
assert adapter.has_fatal_error is False
assert adapter._polling_error_task is not None
# The rescheduled task must be schedulable on the running loop.
adapter._polling_error_task.cancel()
try:
await adapter._polling_error_task
except (asyncio.CancelledError, Exception):
pass