fix(agent): classify message-only 'overloaded' as server overload

Salvage of #14261 by @ms-alan — rebased onto current main, scoped to the
overloaded-classification fix, with a regression test that fails without it.
This commit is contained in:
Bartok9 2026-06-26 01:43:39 -04:00 committed by Teknium
parent 151ae1e937
commit 45ce35ed72
2 changed files with 26 additions and 0 deletions

View file

@ -1214,6 +1214,20 @@ def _classify_by_message(
should_fallback=True,
)
# Overloaded / server-busy patterns — must come BEFORE the rate_limit and
# billing checks so that a message-only "overloaded" (no 503/529 status,
# e.g. some Anthropic-compatible proxies) classifies as a transient
# overload (backoff + retry) instead of falling through to `unknown` or
# incorrectly triggering credential rotation.
if any(p in error_msg for p in (
"overloaded", "temporarily overloaded",
"service is temporarily overloaded",
)):
return result_fn(
FailoverReason.overloaded,
retryable=True,
)
# Billing patterns
if any(p in error_msg for p in _BILLING_PATTERNS):
return result_fn(

View file

@ -347,6 +347,18 @@ class TestClassifyApiError:
result = classify_api_error(e)
assert result.reason == FailoverReason.overloaded
def test_message_only_overloaded_without_status_is_overloaded(self):
    """Regression test for #14261: 'overloaded' in the message, no status.

    The mock error is built from message text alone (no ``status_code``),
    which is how some Anthropic-compatible proxies report an overload.
    The classifier is expected to report ``FailoverReason.overloaded``,
    flag the failure as retryable, and leave credential rotation off.
    """
    overloaded_text = (
        "Anthropic API error: Overloaded - the service is temporarily overloaded"
    )
    # Deliberately constructed without a status_code.
    overloaded_error = MockAPIError(overloaded_text)

    classification = classify_api_error(overloaded_error, provider="anthropic")

    assert classification.reason == FailoverReason.overloaded
    assert classification.retryable is True
    assert classification.should_rotate_credential is False
# ── 5xx that are actually request-validation errors ──
# Some OpenAI-compatible gateways (e.g. codex.nekos.me) return
# request-validation failures with a 5xx status. These are