From 44cf33449d4f78f9a13eb37c7d99d2c4b021f696 Mon Sep 17 00:00:00 2001 From: vominh1919 Date: Tue, 5 May 2026 05:26:49 -0700 Subject: [PATCH] fix(mcp): add periodic keepalive to _wait_for_lifecycle_event Sends a lightweight list_tools() probe every 3 minutes during idle periods to prevent TCP connections from going stale behind LB / NAT idle timeouts (commonly 300-600s). When the keepalive fails, the reconnect event fires so the transport rebuilds the session cleanly. Salvages the keepalive portion of @vominh1919's PR #17016. The circuit-breaker half-open recovery from the same PR landed independently on main via @benbarclay's commit 8cc3cebca ("fix(mcp): add half-open state to circuit breaker", Apr 21); only the keepalive is salvaged here. Fixes #17003. --- tools/mcp_tool.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 21e935a12f..9ed8ac75d0 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -1038,14 +1038,43 @@ class MCPServerTask: with a fresh signal. Shutdown takes precedence if both events are set simultaneously. + + Periodically sends a lightweight keepalive (``list_tools``) to + prevent TCP connections from going stale during long idle + periods (#17003). If the keepalive fails, triggers a reconnect. """ + # Keepalive interval in seconds. Must be shorter than typical + # LB / NAT idle-timeout (commonly 300-600s). + _KEEPALIVE_INTERVAL = 180 # 3 minutes + shutdown_task = asyncio.create_task(self._shutdown_event.wait()) reconnect_task = asyncio.create_task(self._reconnect_event.wait()) try: - await asyncio.wait( - {shutdown_task, reconnect_task}, - return_when=asyncio.FIRST_COMPLETED, - ) + while True: + done, _pending = await asyncio.wait( + {shutdown_task, reconnect_task}, + timeout=_KEEPALIVE_INTERVAL, + return_when=asyncio.FIRST_COMPLETED, + ) + if done: + break + + # Timeout — no lifecycle event fired. 
Send a keepalive + # to exercise the connection and detect stale sockets. + if self.session: + try: + await asyncio.wait_for( + self.session.list_tools(), + timeout=30.0, + ) + except Exception as exc: + logger.warning( + "MCP server '%s' keepalive failed, " + "triggering reconnect: %s", + self.name, exc, + ) + self._reconnect_event.set() + break finally: for t in (shutdown_task, reconnect_task): if not t.done():