mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: add vLLM/local server error patterns + MCP initial connection retry (#9281)
Port two improvements inspired by Kilo-Org/kilocode analysis: 1. Error classifier: add context overflow patterns for vLLM, Ollama, and llama.cpp/llama-server. These local inference servers return different error formats than cloud providers (e.g., 'exceeds the max_model_len', 'context length exceeded', 'slot context'). Without these patterns, context overflow errors from local servers are misclassified as format errors, causing infinite retries instead of triggering compression. 2. MCP initial connection retry: previously, if the very first connection attempt to an MCP server failed (e.g., transient DNS blip at startup), the server was permanently marked as failed with no retry. Post-connect reconnection had 5 retries with exponential backoff, but initial connection had zero. Now initial connections retry up to 3 times with backoff before giving up, matching the resilience of post-connect reconnection. (Inspired by Kilo Code's MCP server disappearing fix in v1.3.3) Tests: 6 new error classifier tests, 4 new MCP retry tests, 1 updated existing test. All 276 affected tests pass.
This commit is contained in:
parent
0a4cf5b3e1
commit
f324222b79
5 changed files with 204 additions and 8 deletions
|
|
@@ -162,6 +162,7 @@ if _MCP_AVAILABLE and not _MCP_MESSAGE_HANDLER_SUPPORTED:
|
|||
_DEFAULT_TOOL_TIMEOUT = 120 # seconds for tool calls
|
||||
_DEFAULT_CONNECT_TIMEOUT = 60 # seconds for initial connection per server
|
||||
_MAX_RECONNECT_RETRIES = 5
|
||||
_MAX_INITIAL_CONNECT_RETRIES = 3 # retries for the very first connection attempt
|
||||
_MAX_BACKOFF_SECONDS = 60
|
||||
|
||||
# Environment variables that are safe to pass to stdio subprocesses
|
||||
|
|
@@ -984,6 +985,7 @@ class MCPServerTask:
|
|||
self.name,
|
||||
)
|
||||
retries = 0
|
||||
initial_retries = 0
|
||||
backoff = 1.0
|
||||
|
||||
while True:
|
||||
|
|
@@ -997,11 +999,37 @@ class MCPServerTask:
|
|||
except Exception as exc:
|
||||
self.session = None
|
||||
|
||||
# If this is the first connection attempt, report the error
|
||||
# If this is the first connection attempt, retry with backoff
|
||||
# before giving up. A transient DNS/network blip at startup
|
||||
# should not permanently kill the server.
|
||||
# (Ported from Kilo Code's MCP resilience fix.)
|
||||
if not self._ready.is_set():
|
||||
self._error = exc
|
||||
self._ready.set()
|
||||
return
|
||||
initial_retries += 1
|
||||
if initial_retries > _MAX_INITIAL_CONNECT_RETRIES:
|
||||
logger.warning(
|
||||
"MCP server '%s' failed initial connection after "
|
||||
"%d attempts, giving up: %s",
|
||||
self.name, _MAX_INITIAL_CONNECT_RETRIES, exc,
|
||||
)
|
||||
self._error = exc
|
||||
self._ready.set()
|
||||
return
|
||||
|
||||
logger.warning(
|
||||
"MCP server '%s' initial connection failed "
|
||||
"(attempt %d/%d), retrying in %.0fs: %s",
|
||||
self.name, initial_retries,
|
||||
_MAX_INITIAL_CONNECT_RETRIES, backoff, exc,
|
||||
)
|
||||
await asyncio.sleep(backoff)
|
||||
backoff = min(backoff * 2, _MAX_BACKOFF_SECONDS)
|
||||
|
||||
# Check if shutdown was requested during the sleep
|
||||
if self._shutdown_event.is_set():
|
||||
self._error = exc
|
||||
self._ready.set()
|
||||
return
|
||||
continue
|
||||
|
||||
# If shutdown was requested, don't reconnect
|
||||
if self._shutdown_event.is_set():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue