fix(agent): classify 429 'overloaded' bodies as overloaded, not rate_limit

Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The 429 status
path classified these unconditionally as rate_limit with
should_rotate_credential=True, so an overloaded provider exhausted the
credential pool after two errors — fatal for a single-key user, who has
nothing to rotate to.

The credential is valid; the server is just busy. Disambiguate the 429
body against a shared _OVERLOADED_PATTERNS list and route overload
language to FailoverReason.overloaded (retryable, no rotation), matching
the existing 503/529 path and the message-only path (#52890). Genuine
rate limits (no overload language) still rotate.

Extracted the inline tuple of overload phrases that #52890 added into the
shared _OVERLOADED_PATTERNS constant, so the status-code and message paths
use one list.

Closes #14038.
This commit is contained in:
teknium1 2026-06-27 03:55:17 -07:00 committed by Teknium
parent 16192103f4
commit 38e7bd8a08
2 changed files with 63 additions and 5 deletions

View file

@ -133,6 +133,31 @@ _RATE_LIMIT_PATTERNS = [
"servicequotaexceededexception",
]
# Patterns that indicate provider-side overload, NOT a per-credential rate
# limit or billing problem. The credential is valid — the server is just
# busy — so the correct recovery is "back off and retry the same key", never
# "rotate the credential" (rotating exhausts the pool while the endpoint is
# still busy; a single-key user has nothing to rotate to). Some providers
# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429
# status path matches the body against this list before falling through to
# the rate_limit default. Phrases are kept narrow and overload-flavoured so a
# normal rate-limit message ("you have been rate-limited") doesn't hit this
# bucket. (#14038, #15297)
#
# NOTE(review): the call sites visible in this change test each entry with
# `p in error_msg` (substring match), so the bare "overloaded" entry already
# matches every longer "... overloaded" phrase below; only "at capacity" and
# "over capacity" add coverage beyond it. Confirm the bare entry is intended,
# since it is broader than the "kept narrow" wording above suggests.
# NOTE(review): every entry is lowercase — presumably error_msg is lowercased
# before matching; verify against the callers.
_OVERLOADED_PATTERNS = [
"overloaded",
"temporarily overloaded",
"service is temporarily overloaded",
"service may be temporarily overloaded",
"server is overloaded",
"server overloaded",
"service overloaded",
"service is overloaded",
"upstream overloaded",
"currently overloaded",
"at capacity",
"over capacity",
]
# Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
_USAGE_LIMIT_PATTERNS = [
"usage limit",
@ -863,7 +888,19 @@ def _classify_by_status(
)
if status_code == 429:
# Already checked long_context_tier above; this is a normal rate limit
# Already checked long_context_tier above. Some providers (notably
# Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status
# code as a true per-credential rate limit, but the credential is
# valid and the correct recovery is "back off and retry the same key",
# NOT "rotate the credential" (which exhausts the pool while the
# endpoint is still busy, and does nothing for a single-key user).
# Disambiguate on the error body so an overload 429 takes the
# transient-overload path instead of burning the pool. (#14038)
if any(p in error_msg for p in _OVERLOADED_PATTERNS):
return result_fn(
FailoverReason.overloaded,
retryable=True,
)
return result_fn(
FailoverReason.rate_limit,
retryable=True,
@ -1219,10 +1256,7 @@ def _classify_by_message(
# e.g. some Anthropic-compatible proxies) classifies as a transient
# overload (backoff + retry) instead of falling through to `unknown` or
# incorrectly triggering credential rotation.
if any(p in error_msg for p in (
"overloaded", "temporarily overloaded",
"service is temporarily overloaded",
)):
if any(p in error_msg for p in _OVERLOADED_PATTERNS):
return result_fn(
FailoverReason.overloaded,
retryable=True,

View file

@ -386,6 +386,30 @@ class TestClassifyApiError:
assert result.retryable is True
assert result.should_rotate_credential is False
def test_429_with_overloaded_body_is_overloaded_not_rate_limit(self):
"""Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The credential
is valid — the server is just busy — so it must classify as overloaded
(back off + retry the same key), NOT rate_limit (which would rotate and
exhaust the pool, doing nothing for a single-key user). (#14038)"""
# 429 status combined with overload wording in the body: the message
# contains "service may be temporarily overloaded", one of the shared
# overload phrases.
e = MockAPIError(
"The service may be temporarily overloaded, please try again later",
status_code=429,
)
result = classify_api_error(e, provider="zai")
assert result.reason == FailoverReason.overloaded
assert result.retryable is True
# The key point of the fix: an overload 429 must not rotate the credential.
assert result.should_rotate_credential is False
def test_429_normal_rate_limit_still_rotates(self):
"""Guard: a genuine 429 rate limit (no overload language) must still
classify as rate_limit and rotate the credential. (#14038)"""
# The message contains none of the overload phrases, so the 429 branch is
# expected to fall through to its rate_limit default.
e = MockAPIError(
"Rate limit exceeded: too many requests", status_code=429
)
result = classify_api_error(e, provider="zai")
assert result.reason == FailoverReason.rate_limit
assert result.should_rotate_credential is True
# ── 5xx that are actually request-validation errors ──
# Some OpenAI-compatible gateways (e.g. codex.nekos.me) return
# request-validation failures with a 5xx status. These are