diff --git a/agent/error_classifier.py b/agent/error_classifier.py
index 419a984b75e..1a42a9589ee 100644
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -254,6 +254,20 @@ _THINKING_SIG_PATTERNS = [
     "signature",  # Combined with "thinking" check
 ]
 
+# Message-string patterns that indicate a provider-side timeout even when
+# the exception type is generic (e.g. RuntimeError from a local shim that
+# wraps a subprocess timeout). Checked before the type-based transport
+# heuristics so custom-provider "timed out" errors don't fall through to
+# the unknown bucket and get misreported as empty responses.
+#
+# Matching is by substring (`p in error_msg`), so "timed out" alone also
+# catches the "turn timed out" / "request timed out" / "operation timed out"
+# / "upstream timed out" variants; they need no entries of their own.
+_TIMEOUT_MESSAGE_PATTERNS = [
+    "timed out",
+    "deadline exceeded",
+]
+
 # Transport error type names
 _TRANSPORT_ERROR_TYPES = frozenset({
     "ReadTimeout", "ConnectTimeout", "PoolTimeout",
@@ -963,6 +977,14 @@ def _classify_by_message(
             should_fallback=True,
         )
 
+    # Timeout message patterns — generic exception types (e.g. RuntimeError)
+    # raised by local shims or custom providers that internally wrap a
+    # subprocess/HTTP timeout. Classified as transport timeout so the retry
+    # loop rebuilds the client instead of treating the turn as an empty
+    # model response.
+    if any(p in error_msg for p in _TIMEOUT_MESSAGE_PATTERNS):
+        return result_fn(FailoverReason.timeout, retryable=True)
+
     return None
 
 
diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py
index d3f62c847c7..a6fb56a7075 100644
--- a/tests/agent/test_error_classifier.py
+++ b/tests/agent/test_error_classifier.py
@@ -587,6 +587,28 @@ class TestClassifyApiError:
         result = classify_api_error(e)
         assert result.reason == FailoverReason.timeout
 
+    def test_runtime_error_cli_turn_timed_out_classifies_as_timeout(self):
+        # RuntimeError from a local claude-cli shim that wraps a subprocess
+        # timeout must classify as FailoverReason.timeout, not unknown, so
+        # the retry loop rebuilds the client instead of treating the turn as
+        # an empty model response (#22548).
+        e = RuntimeError("claude CLI turn timed out")
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.timeout
+        assert result.retryable is True
+
+    def test_runtime_error_request_timed_out_classifies_as_timeout(self):
+        e = RuntimeError("request timed out after 120s")
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.timeout
+        assert result.retryable is True
+
+    def test_runtime_error_deadline_exceeded_classifies_as_timeout(self):
+        e = RuntimeError("deadline exceeded")
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.timeout
+        assert result.retryable is True
+
     # ── Error code classification ──
 
     def test_error_code_resource_exhausted(self):