fix: add vLLM/local server error patterns + MCP initial connection retry (#9281)

Port two improvements inspired by Kilo-Org/kilocode analysis:

1. Error classifier: add context overflow patterns for vLLM, Ollama,
   and llama.cpp/llama-server. These local inference servers return
   different error formats than cloud providers (e.g., 'exceeds the
   max_model_len', 'context length exceeded', 'slot context'). Without
   these patterns, context overflow errors from local servers are
   misclassified as format errors, causing infinite retries instead
   of triggering compression.

2. MCP initial connection retry: previously, if the very first
   connection attempt to an MCP server failed (e.g., transient DNS
   blip at startup), the server was permanently marked as failed with
   no retry. Post-connect reconnection had 5 retries with exponential
   backoff, but initial connection had zero. Now initial connections
   retry up to 3 times with backoff before giving up, matching the
   resilience of post-connect reconnection.
   (Inspired by Kilo Code's MCP server disappearing fix in v1.3.3)

Tests: 6 new error classifier tests, 4 new MCP retry tests, 1
updated existing test. All 276 affected tests pass.
This commit is contained in:
Teknium 2026-04-13 18:46:14 -07:00 committed by GitHub
parent 0a4cf5b3e1
commit f324222b79
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 204 additions and 8 deletions

View file

@ -156,6 +156,18 @@ _CONTEXT_OVERFLOW_PATTERNS = [
"prompt exceeds max length",
"max_tokens",
"maximum number of tokens",
# vLLM / local inference server patterns
"exceeds the max_model_len",
"max_model_len",
"prompt length", # "engine prompt length X exceeds"
"input is too long",
"maximum model length",
# Ollama patterns
"context length exceeded",
"truncating input",
# llama.cpp / llama-server patterns
"slot context", # "slot context: N tokens, prompt N tokens"
"n_ctx_slot",
# Chinese error messages (some providers return these)
"超过最大长度",
"上下文长度",

View file

@ -580,6 +580,48 @@ class TestClassifyApiError:
result = classify_api_error(e)
assert result.reason == FailoverReason.context_overflow
# ── vLLM / local inference server error messages ──
def test_vllm_max_model_len_overflow(self):
    """A vLLM 'exceeds the max_model_len' message must classify as context_overflow."""
    overflow_error = MockAPIError(
        "The engine prompt length 1327246 exceeds the max_model_len 131072. "
        "Please reduce prompt.",
        status_code=400,
    )
    classified = classify_api_error(overflow_error)
    assert classified.reason == FailoverReason.context_overflow
def test_vllm_prompt_length_exceeds(self):
    """A vLLM prompt-length overrun message must classify as context_overflow."""
    overflow_error = MockAPIError(
        "prompt length 200000 exceeds maximum model length 131072",
        status_code=400,
    )
    classified = classify_api_error(overflow_error)
    assert classified.reason == FailoverReason.context_overflow
def test_vllm_input_too_long(self):
    """A vLLM 'input is too long' message must classify as context_overflow."""
    overflow_error = MockAPIError("input is too long for model", status_code=400)
    classified = classify_api_error(overflow_error)
    assert classified.reason == FailoverReason.context_overflow
def test_ollama_context_length_exceeded(self):
    """An Ollama 'context length exceeded' message must classify as context_overflow."""
    overflow_error = MockAPIError("context length exceeded", status_code=400)
    classified = classify_api_error(overflow_error)
    assert classified.reason == FailoverReason.context_overflow
def test_llamacpp_slot_context(self):
    """A llama.cpp / llama-server 'slot context' message must classify as context_overflow."""
    overflow_error = MockAPIError(
        "slot context: 4096 tokens, prompt 8192 tokens — not enough space",
        status_code=400,
    )
    classified = classify_api_error(overflow_error)
    assert classified.reason == FailoverReason.context_overflow
# ── Result metadata ──
def test_provider_and_model_in_result(self):

View file

@ -180,3 +180,113 @@ class TestMCPReloadTimeout:
# The fix adds threading.Thread for _reload_mcp
assert "Thread" in source or "thread" in source.lower(), \
"_check_config_mcp_changes should use a thread for _reload_mcp"
# ---------------------------------------------------------------------------
# Fix 4: MCP initial connection retry with backoff
# (Ported from Kilo Code's MCP resilience fix)
# ---------------------------------------------------------------------------
class TestMCPInitialConnectionRetry:
    """MCPServerTask.run() retries initial connection failures instead of giving up.

    Fixes applied in review:
    - removed the unused ``original_run_stdio`` local and an unused import;
    - replaced deprecated ``asyncio.get_event_loop().run_until_complete(...)``
      with ``asyncio.run(...)``;
    - stubbed ``asyncio.sleep`` in the two pure-retry tests so they do not
      wait through the real exponential backoff (1 + 2 + 4 s).
    """

    def test_initial_connect_retries_constant_exists(self):
        """_MAX_INITIAL_CONNECT_RETRIES should be defined."""
        from tools.mcp_tool import _MAX_INITIAL_CONNECT_RETRIES
        assert _MAX_INITIAL_CONNECT_RETRIES >= 1

    def test_initial_connect_retry_succeeds_on_second_attempt(self):
        """Server succeeds after one transient initial failure."""
        from tools.mcp_tool import MCPServerTask

        call_count = 0

        async def _run():
            nonlocal call_count
            server = MCPServerTask("test-retry")

            async def fake_run_stdio(self_inner, config):
                nonlocal call_count
                call_count += 1
                if call_count == 1:
                    raise ConnectionError("DNS resolution failed")
                # Second attempt: success — set ready and "run" until shutdown
                self_inner._ready.set()
                await self_inner._shutdown_event.wait()

            # Stub the backoff sleep so the retry fires instantly; only the
            # waiting is skipped, the retry logic under test is unchanged.
            async def _instant_sleep(_delay, *_args, **_kwargs):
                pass

            with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio), \
                    patch("asyncio.sleep", _instant_sleep):
                task = asyncio.ensure_future(server.run({"command": "fake"}))
                await server._ready.wait()
                # It should have succeeded (no error) after retrying
                assert server._error is None, f"Expected no error, got: {server._error}"
                assert call_count == 2, f"Expected 2 attempts, got {call_count}"
                # Clean shutdown
                server._shutdown_event.set()
                await task

        asyncio.run(_run())

    def test_initial_connect_gives_up_after_max_retries(self):
        """Server gives up after _MAX_INITIAL_CONNECT_RETRIES failures."""
        from tools.mcp_tool import MCPServerTask, _MAX_INITIAL_CONNECT_RETRIES

        call_count = 0

        async def _run():
            nonlocal call_count
            server = MCPServerTask("test-exhaust")

            async def fake_run_stdio(self_inner, config):
                nonlocal call_count
                call_count += 1
                raise ConnectionError("DNS resolution failed")

            # Skip the real exponential backoff between attempts so the
            # test completes in milliseconds rather than several seconds.
            async def _instant_sleep(_delay, *_args, **_kwargs):
                pass

            with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio), \
                    patch("asyncio.sleep", _instant_sleep):
                task = asyncio.ensure_future(server.run({"command": "fake"}))
                await server._ready.wait()
                # Should have an error after exhausting retries
                assert server._error is not None
                assert "DNS resolution failed" in str(server._error)
                # 1 initial + N retries = _MAX_INITIAL_CONNECT_RETRIES + 1 total attempts
                assert call_count == _MAX_INITIAL_CONNECT_RETRIES + 1
                await task

        asyncio.run(_run())

    def test_initial_connect_retry_respects_shutdown(self):
        """Shutdown during initial retry backoff aborts cleanly."""
        from tools.mcp_tool import MCPServerTask

        async def _run():
            server = MCPServerTask("test-shutdown")
            attempt = 0

            async def fake_run_stdio(self_inner, config):
                nonlocal attempt
                attempt += 1
                if attempt == 1:
                    raise ConnectionError("transient failure")
                # Should not reach here because shutdown fires during sleep
                raise AssertionError("Should not attempt after shutdown")

            # NOTE: real sleeps are intentionally kept in this test — it
            # depends on shutdown being set while the server is inside its
            # backoff sleep.
            with patch.object(MCPServerTask, '_run_stdio', fake_run_stdio):
                task = asyncio.ensure_future(server.run({"command": "fake"}))
                # Give the first attempt time to fail, then set shutdown
                # during the backoff sleep
                await asyncio.sleep(0.1)
                server._shutdown_event.set()
                await server._ready.wait()
                # Should have the error set and be done
                assert server._error is not None
                await task

        asyncio.run(_run())

View file

@ -1008,8 +1008,12 @@ class TestReconnection:
asyncio.run(_test())
def test_no_reconnect_on_initial_failure(self):
"""First connection failure reports error immediately, no retry."""
from tools.mcp_tool import MCPServerTask
"""First connection failure retries up to _MAX_INITIAL_CONNECT_RETRIES times.
Before the MCP resilience fix, initial failures gave up immediately.
Now they retry with backoff to handle transient DNS/network blips.
"""
from tools.mcp_tool import MCPServerTask, _MAX_INITIAL_CONNECT_RETRIES
run_count = 0
target_server = None
@ -1032,8 +1036,8 @@ class TestReconnection:
patch("asyncio.sleep", new_callable=AsyncMock):
await server.run({"command": "test"})
# Only one attempt, no retry on initial failure
assert run_count == 1
# Now retries up to _MAX_INITIAL_CONNECT_RETRIES before giving up
assert run_count == _MAX_INITIAL_CONNECT_RETRIES + 1
assert server._error is not None
assert "cannot connect" in str(server._error)

View file

@ -162,6 +162,7 @@ if _MCP_AVAILABLE and not _MCP_MESSAGE_HANDLER_SUPPORTED:
# --- Connection/retry tuning knobs for MCP server management ---
_DEFAULT_TOOL_TIMEOUT = 120 # seconds for tool calls
_DEFAULT_CONNECT_TIMEOUT = 60 # seconds for initial connection per server
_MAX_RECONNECT_RETRIES = 5 # retries for post-connect reconnection
_MAX_INITIAL_CONNECT_RETRIES = 3 # retries for the very first connection attempt
_MAX_BACKOFF_SECONDS = 60 # upper cap for the exponential backoff delay
# Environment variables that are safe to pass to stdio subprocesses
@ -984,6 +985,7 @@ class MCPServerTask:
self.name,
)
retries = 0
initial_retries = 0
backoff = 1.0
while True:
@ -997,11 +999,37 @@ class MCPServerTask:
except Exception as exc:
self.session = None
# If this is the first connection attempt, report the error
# If this is the first connection attempt, retry with backoff
# before giving up. A transient DNS/network blip at startup
# should not permanently kill the server.
# (Ported from Kilo Code's MCP resilience fix.)
if not self._ready.is_set():
self._error = exc
self._ready.set()
return
initial_retries += 1
if initial_retries > _MAX_INITIAL_CONNECT_RETRIES:
logger.warning(
"MCP server '%s' failed initial connection after "
"%d attempts, giving up: %s",
self.name, _MAX_INITIAL_CONNECT_RETRIES, exc,
)
self._error = exc
self._ready.set()
return
logger.warning(
"MCP server '%s' initial connection failed "
"(attempt %d/%d), retrying in %.0fs: %s",
self.name, initial_retries,
_MAX_INITIAL_CONNECT_RETRIES, backoff, exc,
)
await asyncio.sleep(backoff)
backoff = min(backoff * 2, _MAX_BACKOFF_SECONDS)
# Check if shutdown was requested during the sleep
if self._shutdown_event.is_set():
self._error = exc
self._ready.set()
return
continue
# If shutdown was requested, don't reconnect
if self._shutdown_event.is_set():