mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(mcp): re-raise CancelledError explicitly in MCPServerTask.run (#21318)
On Python 3.11+, `asyncio.CancelledError` inherits from `BaseException` (not `Exception`), so the broad `except Exception as exc:` in `MCPServerTask.run`'s transport loop did NOT catch it. Task cancellation from gateway restart / explicit `task.cancel()` silently escaped past the reconnect logic — the MCP server task died without going through the shutdown/reconnect code paths that check `_shutdown_event`. Add an explicit `except asyncio.CancelledError: raise` before the broad catch so cancellation propagation is self-documenting rather than an accident of exception hierarchy, and future sibling-site work (e.g. distinguishing shutdown-cancel from transport-cancel) has an obvious hook. Behavior on pre-3.8 Pythons where CancelledError WAS an Exception subclass is also corrected: the old path would have caught it and treated it as a connection failure worth retrying. Closes #9930.
This commit is contained in:
parent
5a3e5b23d2
commit
e0a2b08768
2 changed files with 104 additions and 0 deletions
92
tests/tools/test_mcp_cancelled_error_propagation.py
Normal file
92
tests/tools/test_mcp_cancelled_error_propagation.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
"""Regression tests for ``MCPServerTask.run`` + ``asyncio.CancelledError``.
|
||||
|
||||
Background
|
||||
==========
|
||||
On Python 3.11+, ``asyncio.CancelledError`` inherits from ``BaseException``
|
||||
rather than ``Exception``, so a bare ``except Exception`` does NOT catch it.
|
||||
``MCPServerTask.run`` had a broad ``except Exception`` around the transport
|
||||
loop which meant a task cancellation (gateway restart, explicit
|
||||
``task.cancel()``) caused the reconnect loop to exit silently — the MCP
|
||||
server stayed dead until Hermes was restarted. See #9930.
|
||||
|
||||
The fix adds an explicit ``except asyncio.CancelledError: raise`` BEFORE
|
||||
the broad catch so cancellation propagates cleanly to asyncio's task
|
||||
machinery and ``MCPServerTask.shutdown()``'s ``await self._task`` completes
|
||||
without hanging the reconnect loop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
async def _hanging_run(self, cfg):
|
||||
"""Stand-in transport that hangs forever so we can cancel it."""
|
||||
await asyncio.sleep(3600)
|
||||
|
||||
|
||||
class TestCancelledErrorPropagation:
|
||||
def test_cancelled_error_is_not_swallowed_by_except_exception(self):
|
||||
"""CancelledError raised inside the transport call must re-raise
|
||||
so the reconnect loop terminates cleanly on cancel — not stay wedged."""
|
||||
from tools.mcp_tool import MCPServerTask
|
||||
|
||||
server = MCPServerTask("cancel-test")
|
||||
|
||||
async def drive():
|
||||
with patch.object(MCPServerTask, "_run_stdio", _hanging_run), \
|
||||
patch.object(MCPServerTask, "_is_http", lambda self: False):
|
||||
task = asyncio.create_task(server.run({"command": "fake"}))
|
||||
# Let the run loop enter the try/except and start awaiting.
|
||||
await asyncio.sleep(0.05)
|
||||
task.cancel()
|
||||
# The fix guarantees the task completes (either via
|
||||
# CancelledError propagation or clean exit) rather than
|
||||
# hanging forever.
|
||||
try:
|
||||
await asyncio.wait_for(task, timeout=2.0)
|
||||
except asyncio.CancelledError:
|
||||
return "cancelled_cleanly"
|
||||
except asyncio.TimeoutError:
|
||||
# If we hit this, the reconnect loop swallowed the cancel
|
||||
# and stayed wedged — the exact #9930 bug.
|
||||
task.cancel()
|
||||
try:
|
||||
await task
|
||||
except Exception:
|
||||
pass
|
||||
return "wedged"
|
||||
return "clean_return"
|
||||
|
||||
outcome = asyncio.run(drive())
|
||||
assert outcome in ("cancelled_cleanly", "clean_return"), (
|
||||
f"MCPServerTask.run wedged on cancel (outcome={outcome}) — "
|
||||
f"#9930 regression"
|
||||
)
|
||||
|
||||
def test_shutdown_completes_promptly_when_task_is_cancelled(self):
|
||||
"""``shutdown()`` falls through to ``task.cancel()`` + ``await self._task``
|
||||
after a grace period. That cancel must unwedge the reconnect loop —
|
||||
otherwise ``await self._task`` hangs indefinitely."""
|
||||
from tools.mcp_tool import MCPServerTask
|
||||
|
||||
server = MCPServerTask("shutdown-cancel-test")
|
||||
|
||||
async def drive():
|
||||
with patch.object(MCPServerTask, "_run_stdio", _hanging_run), \
|
||||
patch.object(MCPServerTask, "_is_http", lambda self: False):
|
||||
server._task = asyncio.ensure_future(server.run({"command": "fake"}))
|
||||
await asyncio.sleep(0.05)
|
||||
server._shutdown_event.set()
|
||||
server._task.cancel()
|
||||
try:
|
||||
await asyncio.wait_for(server._task, timeout=2.0)
|
||||
except (asyncio.CancelledError, asyncio.TimeoutError):
|
||||
pass
|
||||
return server._task.done()
|
||||
|
||||
done = asyncio.run(drive())
|
||||
assert done, "MCPServerTask did not finish after cancel — #9930 regression"
|
||||
|
|
@ -1399,6 +1399,18 @@ class MCPServerTask:
|
|||
# still detect a transient in-flight state — it'll be
|
||||
# re-set after the fresh session initializes.
|
||||
continue
|
||||
except asyncio.CancelledError:
|
||||
# Task was cancelled (shutdown, gateway restart, explicit
|
||||
# task.cancel()). Don't treat this as a connection failure —
|
||||
# CancelledError inherits from BaseException (not Exception)
|
||||
# in Python 3.11+, so the broad ``except Exception`` below
|
||||
# would NOT catch it; we'd silently exit the reconnect loop
|
||||
# and the MCP server would stay dead until Hermes is fully
|
||||
# restarted. Re-raise so the task's cancellation propagates
|
||||
# correctly to asyncio's task machinery and ``shutdown()``'s
|
||||
# ``await self._task`` completes. See #9930.
|
||||
self.session = None
|
||||
raise
|
||||
except Exception as exc:
|
||||
self.session = None
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue