fix(discord): cancel _bot_task on connect() timeout to prevent zombie client

When connect() times out waiting for the Discord ready event, the background
asyncio.Task running client.start() was not cancelled. discord.py's internal
reconnect loop can ignore client.close() while a WebSocket handshake is in
flight, so the orphaned task eventually completes and fires on_ready.

A later successful reconnect then leaves two live Discord clients in the same
process — each with its own on_message handler and MessageDeduplicator instance
— so every @mention creates two threads because the per-adapter dedup caches
cannot catch cross-client duplicates.

Fix: explicitly cancel and await _bot_task in two places:
1. The asyncio.TimeoutError handler inside connect() — catches the case where
   the adapter's own inner wait_for fires before the gateway's outer timeout.
2. The start of disconnect() — the load-bearing path, always reached via
   _dispose_unused_adapter regardless of which timeout fired first.

Root cause confirmed from production logs: a Jun 8 network outage caused three
consecutive connect() timeouts. The first attempt's bot_task completed its
handshake 4 minutes later ("Connected as") with no preceding watcher line,
then the watcher's real reconnect also connected 90 seconds after that. The two
clients ran continuously for 41+ hours, confirmed by the same user message
appearing as two separate inbound events in two different thread IDs 357ms apart.

Regression tests added to tests/gateway/test_discord_connect.py:
- test_connect_timeout_cancels_bot_task: simulates a connect() timeout with a
  NeverReadyBot and asserts _bot_task is None afterward
- test_disconnect_cancels_running_bot_task: injects a live zombie task, calls
  disconnect(), and asserts the task is cancelled and the attribute cleared
This commit is contained in:
Dineth Hettiarachchi 2026-06-11 22:19:47 +05:30 committed by Teknium
parent 13650ab7f8
commit 020ef76cf1
2 changed files with 107 additions and 0 deletions

View file

@ -910,6 +910,17 @@ class DiscordAdapter(BasePlatformAdapter):
except asyncio.TimeoutError:
logger.error("[%s] Timeout waiting for connection to Discord", self.name, exc_info=True)
# Cancel the background bot task so it cannot fire on_message after
# this adapter is discarded. Without this, the task keeps running and
# a later successful reconnect leaves two active Discord clients that
# each process every message, producing duplicate threads/responses.
if self._bot_task and not self._bot_task.done():
self._bot_task.cancel()
try:
await self._bot_task
except (asyncio.CancelledError, Exception):
pass
self._bot_task = None
self._release_platform_lock()
return False
except Exception as e: # pragma: no cover - defensive logging
@ -919,6 +930,20 @@ class DiscordAdapter(BasePlatformAdapter):
async def disconnect(self) -> None:
"""Disconnect from Discord."""
# Cancel the bot task before closing the client. If connect() timed out
# and returned False, the background client.start() task may still be
# running; calling client.close() alone is not enough to stop it because
# discord.py's reconnect loop can ignore the closed flag while a
# WebSocket handshake is in flight. Explicitly cancelling the task here
# ensures the zombie client cannot receive or dispatch any further events.
if self._bot_task and not self._bot_task.done():
self._bot_task.cancel()
try:
await self._bot_task
except (asyncio.CancelledError, Exception):
pass
self._bot_task = None
# Clean up all active voice connections before closing the client
for guild_id in list(self._voice_clients.keys()):
try:

View file

@ -279,6 +279,88 @@ async def test_connect_releases_token_lock_on_timeout(monkeypatch):
assert adapter._platform_lock_identity is None
@pytest.mark.asyncio
async def test_connect_timeout_cancels_bot_task(monkeypatch):
"""Regression: connect() timeout must cancel _bot_task so the zombie
Discord client cannot fire on_message after the adapter is discarded.
Without this fix, the orphaned task eventually completes its WebSocket
handshake and a subsequent successful reconnect leaves two live clients
that each process every message, producing duplicate threads.
"""
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))
monkeypatch.setattr("gateway.status.acquire_scoped_lock", lambda scope, identity, metadata=None: (True, None))
monkeypatch.setattr("gateway.status.release_scoped_lock", lambda scope, identity: None)
intents = SimpleNamespace(
message_content=False, dm_messages=False, guild_messages=False,
members=False, voice_states=False,
)
monkeypatch.setattr(discord_platform.Intents, "default", lambda: intents)
class NeverReadyBot(FakeBot):
"""Bot whose start() never fires on_ready — simulates a slow gateway handshake."""
async def start(self, token):
await asyncio.Event().wait() # hang forever
monkeypatch.setattr(
discord_platform.commands,
"Bot",
lambda **kwargs: NeverReadyBot(
intents=kwargs["intents"],
proxy=kwargs.get("proxy"),
allowed_mentions=kwargs.get("allowed_mentions"),
),
)
async def fake_wait_for(awaitable, timeout):
awaitable.close()
raise asyncio.TimeoutError()
monkeypatch.setattr(discord_platform.asyncio, "wait_for", fake_wait_for)
ok = await adapter.connect()
assert ok is False
assert adapter._bot_task is None, (
"_bot_task must be cancelled and cleared on connect() timeout; "
"leaving it alive creates a zombie Discord client that produces duplicate threads"
)
@pytest.mark.asyncio
async def test_disconnect_cancels_running_bot_task(monkeypatch):
"""Regression: disconnect() must cancel _bot_task even when connect() timed out.
_dispose_unused_adapter calls disconnect() on adapters whose connect() returned
False. If _bot_task was still running (zombie), disconnect() must cancel it.
"""
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))
monkeypatch.setattr("gateway.status.acquire_scoped_lock", lambda scope, identity, metadata=None: (True, None))
monkeypatch.setattr("gateway.status.release_scoped_lock", lambda scope, identity: None)
# Simulate a zombie bot_task that never finishes (as if discord.py is mid-handshake)
async def _forever():
await asyncio.Event().wait() # hang forever
zombie_task = asyncio.create_task(_forever())
adapter._bot_task = zombie_task
adapter._client = AsyncMock()
adapter._post_connect_task = None
adapter._voice_clients = {}
adapter._running = True
adapter._ready_event = asyncio.Event()
await adapter.disconnect()
# The task must have been cancelled (done + cancelled) and cleared from the adapter.
assert adapter._bot_task is None, "disconnect() must clear _bot_task"
assert zombie_task.done(), "disconnect() must have awaited the bot task to completion"
assert zombie_task.cancelled(), "disconnect() must cancel the zombie bot task"
@pytest.mark.asyncio
async def test_connect_does_not_wait_for_slash_sync(monkeypatch):
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))