From e0a2b087681e98233e619ebbd073a9ee3d592295 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 7 May 2026 07:04:38 -0700 Subject: [PATCH] fix(mcp): re-raise CancelledError explicitly in MCPServerTask.run (#21318) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Python 3.11+, `asyncio.CancelledError` inherits from `BaseException` (not `Exception`), so the broad `except Exception as exc:` in `MCPServerTask.run`'s transport loop did NOT catch it. Task cancellation from gateway restart / explicit `task.cancel()` silently escaped past the reconnect logic — the MCP server task died without going through the shutdown/reconnect code paths that check `_shutdown_event`. Add an explicit `except asyncio.CancelledError: raise` before the broad catch so cancellation propagation is self-documenting rather than an accident of exception hierarchy, and future sibling-site work (e.g. distinguishing shutdown-cancel from transport-cancel) has an obvious hook. Behavior on pre-3.8 Pythons where CancelledError WAS an Exception subclass is also corrected: the old path would have caught it and treated it as a connection failure worth retrying. Closes #9930. --- .../test_mcp_cancelled_error_propagation.py | 92 +++++++++++++++++++ tools/mcp_tool.py | 12 +++ 2 files changed, 104 insertions(+) create mode 100644 tests/tools/test_mcp_cancelled_error_propagation.py diff --git a/tests/tools/test_mcp_cancelled_error_propagation.py b/tests/tools/test_mcp_cancelled_error_propagation.py new file mode 100644 index 0000000000..ce05d03f43 --- /dev/null +++ b/tests/tools/test_mcp_cancelled_error_propagation.py @@ -0,0 +1,92 @@ +"""Regression tests for ``MCPServerTask.run`` + ``asyncio.CancelledError``. + +Background +========== +On Python 3.11+, ``asyncio.CancelledError`` inherits from ``BaseException`` +rather than ``Exception``, so a bare ``except Exception`` does NOT catch it. +``MCPServerTask.run`` had a broad ``except Exception`` around the transport +loop which meant a task cancellation (gateway restart, explicit +``task.cancel()``) caused the reconnect loop to exit silently — the MCP +server stayed dead until Hermes was restarted. See #9930. + +The fix adds an explicit ``except asyncio.CancelledError: raise`` BEFORE +the broad catch so cancellation propagates cleanly to asyncio's task +machinery and ``MCPServerTask.shutdown()``'s ``await self._task`` completes +without hanging the reconnect loop. +""" + +from __future__ import annotations + +import asyncio +from unittest.mock import patch + +import pytest + + +async def _hanging_run(self, cfg): + """Stand-in transport that hangs forever so we can cancel it.""" + await asyncio.sleep(3600) + + +class TestCancelledErrorPropagation: + def test_cancelled_error_is_not_swallowed_by_except_exception(self): + """CancelledError raised inside the transport call must re-raise + so the reconnect loop terminates cleanly on cancel — not stay wedged.""" + from tools.mcp_tool import MCPServerTask + + server = MCPServerTask("cancel-test") + + async def drive(): + with patch.object(MCPServerTask, "_run_stdio", _hanging_run), \ + patch.object(MCPServerTask, "_is_http", lambda self: False): + task = asyncio.create_task(server.run({"command": "fake"})) + # Let the run loop enter the try/except and start awaiting. + await asyncio.sleep(0.05) + task.cancel() + # The fix guarantees the task completes (either via + # CancelledError propagation or clean exit) rather than + # hanging forever. + try: + await asyncio.wait_for(task, timeout=2.0) + except asyncio.CancelledError: + return "cancelled_cleanly" + except asyncio.TimeoutError: + # If we hit this, the reconnect loop swallowed the cancel + # and stayed wedged — the exact #9930 bug. + task.cancel() + try: + await task + except Exception: + pass + return "wedged" + return "clean_return" + + outcome = asyncio.run(drive()) + assert outcome in ("cancelled_cleanly", "clean_return"), ( + f"MCPServerTask.run wedged on cancel (outcome={outcome}) — " + f"#9930 regression" + ) + + def test_shutdown_completes_promptly_when_task_is_cancelled(self): + """``shutdown()`` falls through to ``task.cancel()`` + ``await self._task`` + after a grace period. That cancel must unwedge the reconnect loop — + otherwise ``await self._task`` hangs indefinitely.""" + from tools.mcp_tool import MCPServerTask + + server = MCPServerTask("shutdown-cancel-test") + + async def drive(): + with patch.object(MCPServerTask, "_run_stdio", _hanging_run), \ + patch.object(MCPServerTask, "_is_http", lambda self: False): + server._task = asyncio.ensure_future(server.run({"command": "fake"})) + await asyncio.sleep(0.05) + server._shutdown_event.set() + server._task.cancel() + try: + await asyncio.wait_for(server._task, timeout=2.0) + except (asyncio.CancelledError, asyncio.TimeoutError): + pass + return server._task.done() + + done = asyncio.run(drive()) + assert done, "MCPServerTask did not finish after cancel — #9930 regression" diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index e1c8ef393e..d2eb5bd344 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -1399,6 +1399,18 @@ class MCPServerTask: # still detect a transient in-flight state — it'll be # re-set after the fresh session initializes. continue + except asyncio.CancelledError: + # Task was cancelled (shutdown, gateway restart, explicit + # task.cancel()). Don't treat this as a connection failure — + # CancelledError inherits from BaseException (not Exception) + # in Python 3.11+, so the broad ``except Exception`` below + # would NOT catch it; we'd silently exit the reconnect loop + # and the MCP server would stay dead until Hermes is fully + # restarted. Re-raise so the task's cancellation propagates + # correctly to asyncio's task machinery and ``shutdown()``'s + # ``await self._task`` completes. See #9930. + self.session = None + raise except Exception as exc: self.session = None