diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 6e50a66ad..e436e5571 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -156,6 +156,18 @@ _CONTEXT_OVERFLOW_PATTERNS = [ "prompt exceeds max length", "max_tokens", "maximum number of tokens", + # vLLM / local inference server patterns + "exceeds the max_model_len", + "max_model_len", + "prompt length", # "engine prompt length X exceeds" + "input is too long", + "maximum model length", + # Ollama patterns + "context length exceeded", + "truncating input", + # llama.cpp / llama-server patterns + "slot context", # "slot context: N tokens, prompt N tokens" + "n_ctx_slot", # Chinese error messages (some providers return these) "超过最大长度", "上下文长度", diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index b4bf7c5f0..766c5475f 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -580,6 +580,48 @@ class TestClassifyApiError: result = classify_api_error(e) assert result.reason == FailoverReason.context_overflow + # ── vLLM / local inference server error messages ── + + def test_vllm_max_model_len_overflow(self): + """vLLM's 'exceeds the max_model_len' error → context_overflow.""" + e = MockAPIError( + "The engine prompt length 1327246 exceeds the max_model_len 131072. " + "Please reduce prompt.", + status_code=400, + ) + result = classify_api_error(e) + assert result.reason == FailoverReason.context_overflow + + def test_vllm_prompt_length_exceeds(self): + """vLLM prompt length error → context_overflow.""" + e = MockAPIError( + "prompt length 200000 exceeds maximum model length 131072", + status_code=400, + ) + result = classify_api_error(e) + assert result.reason == FailoverReason.context_overflow + + def test_vllm_input_too_long(self): + """vLLM 'input is too long' error → context_overflow.""" + e = MockAPIError("input is too long for model", status_code=400) + result = classify_api_error(e) + assert result.reason == FailoverReason.context_overflow + + def test_ollama_context_length_exceeded(self): + """Ollama 'context length exceeded' error → context_overflow.""" + e = MockAPIError("context length exceeded", status_code=400) + result = classify_api_error(e) + assert result.reason == FailoverReason.context_overflow + + def test_llamacpp_slot_context(self): + """llama.cpp / llama-server 'slot context' error → context_overflow.""" + e = MockAPIError( + "slot context: 4096 tokens, prompt 8192 tokens — not enough space", + status_code=400, + ) + result = classify_api_error(e) + assert result.reason == FailoverReason.context_overflow + # ── Result metadata ── def test_provider_and_model_in_result(self): diff --git a/tests/tools/test_mcp_stability.py b/tests/tools/test_mcp_stability.py index 576d053df..e3827f0a5 100644 --- a/tests/tools/test_mcp_stability.py +++ b/tests/tools/test_mcp_stability.py @@ -180,3 +180,113 @@ class TestMCPReloadTimeout: # The fix adds threading.Thread for _reload_mcp assert "Thread" in source or "thread" in source.lower(), \ "_check_config_mcp_changes should use a thread for _reload_mcp" + + +# --------------------------------------------------------------------------- +# Fix 4: MCP initial connection retry with backoff +# (Ported from Kilo Code's MCP resilience fix) +# --------------------------------------------------------------------------- + +class TestMCPInitialConnectionRetry: + """MCPServerTask.run() retries initial connection failures instead of giving up.""" + + def test_initial_connect_retries_constant_exists(self): + """_MAX_INITIAL_CONNECT_RETRIES should be defined.""" + from tools.mcp_tool import _MAX_INITIAL_CONNECT_RETRIES + assert _MAX_INITIAL_CONNECT_RETRIES >= 1 + + def test_initial_connect_retry_succeeds_on_second_attempt(self): + """Server succeeds after one transient initial failure.""" + from tools.mcp_tool import MCPServerTask, _MAX_INITIAL_CONNECT_RETRIES + + call_count = 0 + + async def _run(): + nonlocal call_count + server = MCPServerTask("test-retry") + + # Track calls via patching the method on the class + original_run_stdio = MCPServerTask._run_stdio + + async def fake_run_stdio(self_inner, config): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise ConnectionError("DNS resolution failed") + # Second attempt: success — set ready and "run" until shutdown + self_inner._ready.set() + await self_inner._shutdown_event.wait() + + with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio): + task = asyncio.ensure_future(server.run({"command": "fake"})) + await server._ready.wait() + + # It should have succeeded (no error) after retrying + assert server._error is None, f"Expected no error, got: {server._error}" + assert call_count == 2, f"Expected 2 attempts, got {call_count}" + + # Clean shutdown + server._shutdown_event.set() + await task + + asyncio.get_event_loop().run_until_complete(_run()) + + def test_initial_connect_gives_up_after_max_retries(self): + """Server gives up after _MAX_INITIAL_CONNECT_RETRIES failures.""" + from tools.mcp_tool import MCPServerTask, _MAX_INITIAL_CONNECT_RETRIES + + call_count = 0 + + async def _run(): + nonlocal call_count + server = MCPServerTask("test-exhaust") + + async def fake_run_stdio(self_inner, config): + nonlocal call_count + call_count += 1 + raise ConnectionError("DNS resolution failed") + + with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio): + task = asyncio.ensure_future(server.run({"command": "fake"})) + await server._ready.wait() + + # Should have an error after exhausting retries + assert server._error is not None + assert "DNS resolution failed" in str(server._error) + # 1 initial + N retries = _MAX_INITIAL_CONNECT_RETRIES + 1 total attempts + assert call_count == _MAX_INITIAL_CONNECT_RETRIES + 1 + + await task + + asyncio.get_event_loop().run_until_complete(_run()) + + def test_initial_connect_retry_respects_shutdown(self): + """Shutdown during initial retry backoff aborts cleanly.""" + from tools.mcp_tool import MCPServerTask + + async def _run(): + server = MCPServerTask("test-shutdown") + attempt = 0 + + async def fake_run_stdio(self_inner, config): + nonlocal attempt + attempt += 1 + if attempt == 1: + raise ConnectionError("transient failure") + # Should not reach here because shutdown fires during sleep + raise AssertionError("Should not attempt after shutdown") + + with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio): + task = asyncio.ensure_future(server.run({"command": "fake"})) + + # Give the first attempt time to fail, then set shutdown + # during the backoff sleep + await asyncio.sleep(0.1) + server._shutdown_event.set() + await server._ready.wait() + + # Should have the error set and be done + assert server._error is not None + await task + + asyncio.get_event_loop().run_until_complete(_run()) diff --git a/tests/tools/test_mcp_tool.py b/tests/tools/test_mcp_tool.py index 726c40cc9..663895c0b 100644 --- a/tests/tools/test_mcp_tool.py +++ b/tests/tools/test_mcp_tool.py @@ -1008,8 +1008,12 @@ class TestReconnection: asyncio.run(_test()) def test_no_reconnect_on_initial_failure(self): - """First connection failure reports error immediately, no retry.""" - from tools.mcp_tool import MCPServerTask + """First connection failure retries up to _MAX_INITIAL_CONNECT_RETRIES times. + + Before the MCP resilience fix, initial failures gave up immediately. + Now they retry with backoff to handle transient DNS/network blips. + """ + from tools.mcp_tool import MCPServerTask, _MAX_INITIAL_CONNECT_RETRIES run_count = 0 target_server = None @@ -1032,8 +1036,8 @@ class TestReconnection: patch("asyncio.sleep", new_callable=AsyncMock): await server.run({"command": "test"}) - # Only one attempt, no retry on initial failure - assert run_count == 1 + # Now retries up to _MAX_INITIAL_CONNECT_RETRIES before giving up + assert run_count == _MAX_INITIAL_CONNECT_RETRIES + 1 assert server._error is not None assert "cannot connect" in str(server._error) diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 035564c7b..e953998cc 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -162,6 +162,7 @@ if _MCP_AVAILABLE and not _MCP_MESSAGE_HANDLER_SUPPORTED: _DEFAULT_TOOL_TIMEOUT = 120 # seconds for tool calls _DEFAULT_CONNECT_TIMEOUT = 60 # seconds for initial connection per server _MAX_RECONNECT_RETRIES = 5 +_MAX_INITIAL_CONNECT_RETRIES = 3 # retries for the very first connection attempt _MAX_BACKOFF_SECONDS = 60 # Environment variables that are safe to pass to stdio subprocesses @@ -984,6 +985,7 @@ class MCPServerTask: self.name, ) retries = 0 + initial_retries = 0 backoff = 1.0 while True: @@ -997,11 +999,37 @@ class MCPServerTask: except Exception as exc: self.session = None - # If this is the first connection attempt, report the error + # If this is the first connection attempt, retry with backoff + # before giving up. A transient DNS/network blip at startup + # should not permanently kill the server. + # (Ported from Kilo Code's MCP resilience fix.) if not self._ready.is_set(): - self._error = exc - self._ready.set() - return + initial_retries += 1 + if initial_retries > _MAX_INITIAL_CONNECT_RETRIES: + logger.warning( + "MCP server '%s' failed initial connection after " + "%d attempts, giving up: %s", + self.name, _MAX_INITIAL_CONNECT_RETRIES, exc, + ) + self._error = exc + self._ready.set() + return + + logger.warning( + "MCP server '%s' initial connection failed " + "(attempt %d/%d), retrying in %.0fs: %s", + self.name, initial_retries, + _MAX_INITIAL_CONNECT_RETRIES, backoff, exc, + ) + await asyncio.sleep(backoff) + backoff = min(backoff * 2, _MAX_BACKOFF_SECONDS) + + # Check if shutdown was requested during the sleep + if self._shutdown_event.is_set(): + self._error = exc + self._ready.set() + return + continue # If shutdown was requested, don't reconnect if self._shutdown_event.is_set():