From 38e7bd8a08a9df450b7d8661778ecc47f25323c2 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 27 Jun 2026 03:55:17 -0700 Subject: [PATCH] fix(agent): classify 429 'overloaded' bodies as overloaded, not rate_limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The 429 status path classified these unconditionally as rate_limit with should_rotate_credential=True, so an overloaded provider exhausted the credential pool after two errors — fatal for a single-key user, who has nothing to rotate to. The credential is valid; the server is just busy. Disambiguate the 429 body against a shared _OVERLOADED_PATTERNS list and route overload language to FailoverReason.overloaded (retryable, no rotation), matching the existing 503/529 path and the message-only path (#52890). Genuine rate limits (no overload language) still rotate. Extracted the inline overloaded tuple #52890 added into the shared _OVERLOADED_PATTERNS constant so the status-code and message paths use one list. Closes #14038. --- agent/error_classifier.py | 44 ++++++++++++++++++++++++---- tests/agent/test_error_classifier.py | 24 +++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/agent/error_classifier.py b/agent/error_classifier.py index a54738befb7..2d43806bc07 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -133,6 +133,31 @@ _RATE_LIMIT_PATTERNS = [ "servicequotaexceededexception", ] +# Patterns that indicate provider-side overload, NOT a per-credential rate +# limit or billing problem. The credential is valid — the server is just +# busy — so the correct recovery is "back off and retry the same key", never +# "rotate the credential" (rotating exhausts the pool while the endpoint is +# still busy; a single-key user has nothing to rotate to). Some providers +# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429 +# status path matches the body against this list before falling through to +# the rate_limit default. Phrases are kept narrow and overload-flavoured so a +# normal rate-limit message ("you have been rate-limited") doesn't hit this +# bucket. (#14038, #15297) +_OVERLOADED_PATTERNS = [ + "overloaded", + "temporarily overloaded", + "service is temporarily overloaded", + "service may be temporarily overloaded", + "server is overloaded", + "server overloaded", + "service overloaded", + "service is overloaded", + "upstream overloaded", + "currently overloaded", + "at capacity", + "over capacity", +] + # Usage-limit patterns that need disambiguation (could be billing OR rate_limit) _USAGE_LIMIT_PATTERNS = [ "usage limit", @@ -863,7 +888,19 @@ def _classify_by_status( ) if status_code == 429: - # Already checked long_context_tier above; this is a normal rate limit + # Already checked long_context_tier above. Some providers (notably + # Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status + # code as a true per-credential rate limit, but the credential is + # valid and the correct recovery is "back off and retry the same key", + # NOT "rotate the credential" (which exhausts the pool while the + # endpoint is still busy, and does nothing for a single-key user). + # Disambiguate on the error body so an overload 429 takes the + # transient-overload path instead of burning the pool. (#14038) + if any(p in error_msg for p in _OVERLOADED_PATTERNS): + return result_fn( + FailoverReason.overloaded, + retryable=True, + ) return result_fn( FailoverReason.rate_limit, retryable=True, @@ -1219,10 +1256,7 @@ def _classify_by_message( # e.g. some Anthropic-compatible proxies) classifies as a transient # overload (backoff + retry) instead of falling through to `unknown` or # incorrectly triggering credential rotation. - if any(p in error_msg for p in ( - "overloaded", "temporarily overloaded", - "service is temporarily overloaded", - )): + if any(p in error_msg for p in _OVERLOADED_PATTERNS): return result_fn( FailoverReason.overloaded, retryable=True, diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index 5d72bc2f142..270c0d8b8d1 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -386,6 +386,30 @@ class TestClassifyApiError: assert result.retryable is True assert result.should_rotate_credential is False + def test_429_with_overloaded_body_is_overloaded_not_rate_limit(self): + """Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The credential + is valid — the server is just busy — so it must classify as overloaded + (back off + retry the same key), NOT rate_limit (which would rotate and + exhaust the pool, doing nothing for a single-key user). (#14038)""" + e = MockAPIError( + "The service may be temporarily overloaded, please try again later", + status_code=429, + ) + result = classify_api_error(e, provider="zai") + assert result.reason == FailoverReason.overloaded + assert result.retryable is True + assert result.should_rotate_credential is False + + def test_429_normal_rate_limit_still_rotates(self): + """Guard: a genuine 429 rate limit (no overload language) must still + classify as rate_limit and rotate the credential. (#14038)""" + e = MockAPIError( + "Rate limit exceeded: too many requests", status_code=429 + ) + result = classify_api_error(e, provider="zai") + assert result.reason == FailoverReason.rate_limit + assert result.should_rotate_credential is True + # ── 5xx that are actually request-validation errors ── # Some OpenAI-compatible gateways (e.g. codex.nekos.me) return # request-validation failures with a 5xx status. These are