diff --git a/agent/error_classifier.py b/agent/error_classifier.py
index c3b356d4e45..2412098cbae 100644
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -1214,6 +1214,21 @@ def _classify_by_message(
             should_fallback=True,
         )
 
+    # Overloaded / server-busy patterns — must come BEFORE the rate_limit and
+    # billing checks so that a message-only "overloaded" (no 503/529 status,
+    # e.g. some Anthropic-compatible proxies) classifies as a transient
+    # overload (backoff + retry) instead of falling through to `unknown` or
+    # incorrectly triggering credential rotation.
+    #
+    # One substring test covers every phrasing: "temporarily overloaded" and
+    # "service is temporarily overloaded" both contain "overloaded", so
+    # listing them separately could never change the result.
+    if "overloaded" in error_msg:
+        return result_fn(
+            FailoverReason.overloaded,
+            retryable=True,
+        )
+
     # Billing patterns
     if any(p in error_msg for p in _BILLING_PATTERNS):
         return result_fn(
diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py
index 9708d7aadc3..e90d86885fd 100644
--- a/tests/agent/test_error_classifier.py
+++ b/tests/agent/test_error_classifier.py
@@ -347,6 +347,18 @@ class TestClassifyApiError:
         result = classify_api_error(e)
         assert result.reason == FailoverReason.overloaded
 
+    def test_message_only_overloaded_without_status_is_overloaded(self):
+        """Some Anthropic-compatible proxies surface 'overloaded' in the
+        message with no 503/529 status_code. It must classify as overloaded
+        (transient backoff+retry), not unknown / credential rotation. (#14261)"""
+        e = MockAPIError(
+            "Anthropic API error: Overloaded - the service is temporarily overloaded"
+        )  # no status_code
+        result = classify_api_error(e, provider="anthropic")
+        assert result.reason == FailoverReason.overloaded
+        assert result.retryable is True
+        assert result.should_rotate_credential is False
+
     # ── 5xx that are actually request-validation errors ──
     # Some OpenAI-compatible gateways (e.g. codex.nekos.me) return
     # request-validation failures with a 5xx status. These are