mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(mcp): add periodic keepalive to _wait_for_lifecycle_event
Sends a lightweight list_tools() probe every 3 minutes during idle
periods to prevent TCP connections from going stale behind LB / NAT
idle timeouts (commonly 300-600s). When the keepalive fails, the
reconnect event fires so the transport rebuilds the session cleanly.
Salvages the keepalive portion of @vominh1919's PR #17016. The
circuit-breaker half-open recovery from the same PR was independently
landed on main via @benbarclay's commit 8cc3cebca ("fix(mcp): add
half-open state to circuit breaker", Apr 21); only the keepalive is
salvaged here.
Fixes #17003.
This commit is contained in:
parent
005b2f4c5d
commit
44cf33449d
1 changed file with 33 additions and 4 deletions
|
|
@@ -1038,14 +1038,43 @@ class MCPServerTask:
|
|||
with a fresh signal.
|
||||
|
||||
Shutdown takes precedence if both events are set simultaneously.
|
||||
|
||||
Periodically sends a lightweight keepalive (``list_tools``) to
|
||||
prevent TCP connections from going stale during long idle
|
||||
periods (#17003). If the keepalive fails, triggers a reconnect.
|
||||
"""
|
||||
# Keepalive interval in seconds. Must be shorter than typical
|
||||
# LB / NAT idle-timeout (commonly 300-600s).
|
||||
_KEEPALIVE_INTERVAL = 180 # 3 minutes
|
||||
|
||||
shutdown_task = asyncio.create_task(self._shutdown_event.wait())
|
||||
reconnect_task = asyncio.create_task(self._reconnect_event.wait())
|
||||
try:
|
||||
await asyncio.wait(
|
||||
{shutdown_task, reconnect_task},
|
||||
return_when=asyncio.FIRST_COMPLETED,
|
||||
)
|
||||
while True:
|
||||
done, _pending = await asyncio.wait(
|
||||
{shutdown_task, reconnect_task},
|
||||
timeout=_KEEPALIVE_INTERVAL,
|
||||
return_when=asyncio.FIRST_COMPLETED,
|
||||
)
|
||||
if done:
|
||||
break
|
||||
|
||||
# Timeout — no lifecycle event fired. Send a keepalive
|
||||
# to exercise the connection and detect stale sockets.
|
||||
if self.session:
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self.session.list_tools(),
|
||||
timeout=30.0,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"MCP server '%s' keepalive failed, "
|
||||
"triggering reconnect: %s",
|
||||
self.name, exc,
|
||||
)
|
||||
self._reconnect_event.set()
|
||||
break
|
||||
finally:
|
||||
for t in (shutdown_task, reconnect_task):
|
||||
if not t.done():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue