mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-14 04:02:26 +00:00
fix(mcp): add periodic keepalive to _wait_for_lifecycle_event
Sends a lightweight list_tools() probe every 3 minutes during idle
periods to prevent TCP connections from going stale behind LB / NAT
idle timeouts (commonly 300-600s). When the keepalive fails, the
reconnect event fires so the transport rebuilds the session cleanly.
Salvages the keepalive portion of @vominh1919's PR #17016. The
circuit-breaker half-open recovery from the same PR was independently
landed on main via @benbarclay's commit 8cc3cebca ("fix(mcp): add
half-open state to circuit breaker", Apr 21); only the keepalive is
salvaged here.
Fixes #17003.
This commit is contained in:
parent
005b2f4c5d
commit
44cf33449d
1 changed file with 33 additions and 4 deletions
|
|
@@ -1038,14 +1038,43 @@ class MCPServerTask:
|
||||||
with a fresh signal.
|
with a fresh signal.
|
||||||
|
|
||||||
Shutdown takes precedence if both events are set simultaneously.
|
Shutdown takes precedence if both events are set simultaneously.
|
||||||
|
|
||||||
|
Periodically sends a lightweight keepalive (``list_tools``) to
|
||||||
|
prevent TCP connections from going stale during long idle
|
||||||
|
periods (#17003). If the keepalive fails, triggers a reconnect.
|
||||||
"""
|
"""
|
||||||
|
# Keepalive interval in seconds. Must be shorter than typical
|
||||||
|
# LB / NAT idle-timeout (commonly 300-600s).
|
||||||
|
_KEEPALIVE_INTERVAL = 180 # 3 minutes
|
||||||
|
|
||||||
shutdown_task = asyncio.create_task(self._shutdown_event.wait())
|
shutdown_task = asyncio.create_task(self._shutdown_event.wait())
|
||||||
reconnect_task = asyncio.create_task(self._reconnect_event.wait())
|
reconnect_task = asyncio.create_task(self._reconnect_event.wait())
|
||||||
try:
|
try:
|
||||||
await asyncio.wait(
|
while True:
|
||||||
|
done, _pending = await asyncio.wait(
|
||||||
{shutdown_task, reconnect_task},
|
{shutdown_task, reconnect_task},
|
||||||
|
timeout=_KEEPALIVE_INTERVAL,
|
||||||
return_when=asyncio.FIRST_COMPLETED,
|
return_when=asyncio.FIRST_COMPLETED,
|
||||||
)
|
)
|
||||||
|
if done:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Timeout — no lifecycle event fired. Send a keepalive
|
||||||
|
# to exercise the connection and detect stale sockets.
|
||||||
|
if self.session:
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
self.session.list_tools(),
|
||||||
|
timeout=30.0,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning(
|
||||||
|
"MCP server '%s' keepalive failed, "
|
||||||
|
"triggering reconnect: %s",
|
||||||
|
self.name, exc,
|
||||||
|
)
|
||||||
|
self._reconnect_event.set()
|
||||||
|
break
|
||||||
finally:
|
finally:
|
||||||
for t in (shutdown_task, reconnect_task):
|
for t in (shutdown_task, reconnect_task):
|
||||||
if not t.done():
|
if not t.done():
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue