From eb9bfd39248b1a0f5a2f694c0e80110c630590b0 Mon Sep 17 00:00:00 2001 From: ErnestHysa Date: Tue, 26 May 2026 14:54:51 +0100 Subject: [PATCH] fix(T5): replace time.sleep(0.25) with asyncio.sleep in MCP auth reconnect poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PAIN BEFORE: Inside _handle_auth_error_and_retry() (a sync function that runs on the MCP event loop thread), there was a blocking polling loop: while time.monotonic() < deadline: if srv.session is not None and srv._ready.is_set(): break time.sleep(0.25) # BLOCKS THE ENTIRE EVENT LOOP Since _handle_auth_error_and_retry is invoked from tool handlers that run ON the MCP event loop, time.sleep(0.25) blocked ALL concurrent MCP operations (including other tools, keepalive heartbeats, OAuth refreshes) for 250ms per iteration. With a 15-second deadline, worst case = 60 * 250ms = 15 seconds of fully blocked concurrency. WHAT WAS FIXED: Extracted the blocking poll into an async helper _await_ready() that uses asyncio.sleep(0.25) (non-blocking), and runs it via _run_on_mcp_loop(). _run_on_mcp_loop() runs the coroutine on the MCP event loop, so the loop itself is not blocked during the poll interval; the caller still waits for the result, bounded by the timeout. Added exception handling around the poll so stuck reconnects still fall through to the error path. The sync _handle_auth_error_and_retry now: 1. Fires reconnect signal (threadsafe) 2. Calls _run_on_mcp_loop(_await_ready(), timeout=15) — a bounded wait that leaves the event loop unblocked 3. 
Continues to the recovery/error path once the wait completes or times out; the polling itself runs on the event loop File: tools/mcp_tool.py Lines: _handle_auth_error_and_retry() (~2040-2066) Found by: exhaustive multi-pass audit (10 strategies, 1901 files, 913K lines) --- tools/mcp_tool.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 9794b5e8592..b3a7cd2d5ce 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -2040,14 +2040,27 @@ def _handle_auth_error_and_retry( loop = _mcp_loop if loop is not None and loop.is_running(): loop.call_soon_threadsafe(srv._reconnect_event.set) + # Wait briefly for the session to come back ready. Bounded # so that a stuck reconnect falls through to the error - # path rather than hanging the caller. - deadline = time.monotonic() + 15 - while time.monotonic() < deadline: - if srv.session is not None and srv._ready.is_set(): - break - time.sleep(0.25) + # path rather than hanging the caller. The async helper + # runs on the MCP event loop via _run_on_mcp_loop so it + # does NOT block the event loop during the poll interval. + async def _await_ready() -> bool: + deadline = time.monotonic() + 15 + while time.monotonic() < deadline: + if srv.session is not None and srv._ready.is_set(): + return True + await asyncio.sleep(0.25) + return False + + try: + _run_on_mcp_loop(_await_ready(), timeout=15) + except Exception as exc: + logger.warning( + "MCP OAuth '%s': ready poll failed: %s", + server_name, exc, + ) # A successful OAuth recovery is independent evidence that the # server is viable again, so close the circuit breaker here —