From 4f8d8ad912452069e708fd1edac91f5f12a82147 Mon Sep 17 00:00:00 2001 From: Wesley Simplicio Date: Sat, 9 May 2026 14:58:35 -0700 Subject: [PATCH] fix(error_classifier): classify generic-typed timeout messages as transient (carve-out of #22664) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RuntimeError('claude CLI turn timed out') from a local OpenAI-compatible shim was falling through to FailoverReason.unknown, surfacing as 'Empty response from model' and burning 3 retry slots on the same failing endpoint. _classify_by_message had no timeout-message branch — only billing/rate_limit/auth/context_overflow/model_not_found patterns. The type-based check at line 565 also requires isinstance(error, (TimeoutError, ConnectionError, OSError)) — a plain RuntimeError doesn't match. Add _TIMEOUT_MESSAGE_PATTERNS for 'timed out', 'deadline exceeded', 'request timed out', 'operation timed out', 'upstream timed out', 'turn timed out'. _classify_by_message returns FailoverReason.timeout (retryable=True) when any pattern matches. Salvage of #22664's classifier portion. The original PR also bundled a fallback self-selection guard which is now redundant (already on main via #22780) plus DeepSeek thinking and session_search fixes that are their own separate concerns. Follow-up to #22780 — fixes the still-broken classification of generic-typed provider-shim timeouts that #22780's dedup didn't cover. --- agent/error_classifier.py | 22 ++++++++++++++++++++++ tests/agent/test_error_classifier.py | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 419a984b75e..1a42a9589ee 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -254,6 +254,20 @@ _THINKING_SIG_PATTERNS = [ "signature", # Combined with "thinking" check ] +# Message-string patterns that indicate a provider-side timeout even when +# the exception type is generic (e.g. RuntimeError from a local shim that +# wraps a subprocess timeout). Checked before the type-based transport +# heuristics so custom-provider "timed out" errors don't fall through to +# the unknown bucket and get misreported as empty responses. +_TIMEOUT_MESSAGE_PATTERNS = [ + "timed out", + "turn timed out", + "request timed out", + "deadline exceeded", + "operation timed out", + "upstream timed out", +] + # Transport error type names _TRANSPORT_ERROR_TYPES = frozenset({ "ReadTimeout", "ConnectTimeout", "PoolTimeout", @@ -963,6 +977,14 @@ def _classify_by_message( should_fallback=True, ) + # Timeout message patterns — generic exception types (e.g. RuntimeError) + # raised by local shims or custom providers that internally wrap a + # subprocess/HTTP timeout. Classified as transport timeout so the retry + # loop rebuilds the client instead of treating the turn as an empty + # model response. + if any(p in error_msg for p in _TIMEOUT_MESSAGE_PATTERNS): + return result_fn(FailoverReason.timeout, retryable=True) + return None diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index d3f62c847c7..a6fb56a7075 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -587,6 +587,28 @@ class TestClassifyApiError: result = classify_api_error(e) assert result.reason == FailoverReason.timeout + def test_runtime_error_cli_turn_timed_out_classifies_as_timeout(self): + # RuntimeError from a local claude-cli shim that wraps a subprocess + # timeout must classify as FailoverReason.timeout, not unknown, so + # the retry loop rebuilds the client instead of treating the turn as + # an empty model response (#22548). + e = RuntimeError("claude CLI turn timed out") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + + def test_runtime_error_request_timed_out_classifies_as_timeout(self): + e = RuntimeError("request timed out after 120s") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + + def test_runtime_error_deadline_exceeded_classifies_as_timeout(self): + e = RuntimeError("deadline exceeded") + result = classify_api_error(e) + assert result.reason == FailoverReason.timeout + assert result.retryable is True + # ── Error code classification ── def test_error_code_resource_exhausted(self):