fix(agent): classify 429 'overloaded' bodies as overloaded, not rate_limit

Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The 429 status path classified these unconditionally as rate_limit with should_rotate_credential=True, so an overloaded provider exhausted the credential pool after two errors — fatal for a single-key user, who has nothing to rotate to. The credential is valid; the server is just busy. Disambiguate the 429 body against a shared _OVERLOADED_PATTERNS list and route overload language to FailoverReason.overloaded (retryable, no rotation), matching the existing 503/529 path and the message-only path (#52890). Genuine rate limits (no overload language) still rotate. Extract the inline tuple of overload phrases that #52890 added into the shared _OVERLOADED_PATTERNS constant, so that the status-code path and the message path use one list. Closes #14038.
2026-07-01 12:02:05 +00:00 · 2026-06-27 03:55:17 -07:00 · 2026-06-27 03:55:17 -07:00 · 38e7bd8a08
commit 38e7bd8a08
parent 16192103f4
2 changed files with 63 additions and 5 deletions
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@ -133,6 +133,31 @@ _RATE_LIMIT_PATTERNS = [
    "servicequotaexceededexception",
 ]

+# Patterns that indicate provider-side overload, NOT a per-credential rate
+# limit or billing problem.  The credential is valid — the server is just
+# busy — so the correct recovery is "back off and retry the same key", never
+# "rotate the credential" (rotating exhausts the pool while the endpoint is
+# still busy; a single-key user has nothing to rotate to).  Some providers
+# (notably Z.AI / Zhipu) reuse HTTP 429 for server-wide overload, so the 429
+# status path matches the body against this list before falling through to
+# the rate_limit default.  Phrases are kept narrow and overload-flavoured so a
+# normal rate-limit message ("you have been rate-limited") doesn't hit this
+# bucket. (#14038, #15297)
+_OVERLOADED_PATTERNS = [
+    "overloaded",
+    "temporarily overloaded",
+    "service is temporarily overloaded",
+    "service may be temporarily overloaded",
+    "server is overloaded",
+    "server overloaded",
+    "service overloaded",
+    "service is overloaded",
+    "upstream overloaded",
+    "currently overloaded",
+    "at capacity",
+    "over capacity",
+]
+
 # Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
 _USAGE_LIMIT_PATTERNS = [
    "usage limit",
@ -863,7 +888,19 @@ def _classify_by_status(
        )

    if status_code == 429:
-        # Already checked long_context_tier above; this is a normal rate limit
+        # Already checked long_context_tier above. Some providers (notably
+        # Z.AI / Zhipu) reuse HTTP 429 for server-wide overload — same status
+        # code as a true per-credential rate limit, but the credential is
+        # valid and the correct recovery is "back off and retry the same key",
+        # NOT "rotate the credential" (which exhausts the pool while the
+        # endpoint is still busy, and does nothing for a single-key user).
+        # Disambiguate on the error body so an overload 429 takes the
+        # transient-overload path instead of burning the pool. (#14038)
+        if any(p in error_msg for p in _OVERLOADED_PATTERNS):
+            return result_fn(
+                FailoverReason.overloaded,
+                retryable=True,
+            )
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
@ -1219,10 +1256,7 @@ def _classify_by_message(
    # e.g. some Anthropic-compatible proxies) classifies as a transient
    # overload (backoff + retry) instead of falling through to `unknown` or
    # incorrectly triggering credential rotation.
-    if any(p in error_msg for p in (
-        "overloaded", "temporarily overloaded",
-        "service is temporarily overloaded",
-    )):
+    if any(p in error_msg for p in _OVERLOADED_PATTERNS):
        return result_fn(
            FailoverReason.overloaded,
            retryable=True,
--- a/tests/agent/test_error_classifier.py
+++ b/tests/agent/test_error_classifier.py
@ -386,6 +386,30 @@ class TestClassifyApiError:
        assert result.retryable is True
        assert result.should_rotate_credential is False

+    def test_429_with_overloaded_body_is_overloaded_not_rate_limit(self):
+        """Z.AI / Zhipu reuse HTTP 429 for server-wide overload. The credential
+        is valid — the server is just busy — so it must classify as overloaded
+        (back off + retry the same key), NOT rate_limit (which would rotate and
+        exhaust the pool, doing nothing for a single-key user). (#14038)"""
+        e = MockAPIError(
+            "The service may be temporarily overloaded, please try again later",
+            status_code=429,
+        )
+        result = classify_api_error(e, provider="zai")
+        assert result.reason == FailoverReason.overloaded
+        assert result.retryable is True
+        assert result.should_rotate_credential is False
+
+    def test_429_normal_rate_limit_still_rotates(self):
+        """Guard: a genuine 429 rate limit (no overload language) must still
+        classify as rate_limit and rotate the credential. (#14038)"""
+        e = MockAPIError(
+            "Rate limit exceeded: too many requests", status_code=429
+        )
+        result = classify_api_error(e, provider="zai")
+        assert result.reason == FailoverReason.rate_limit
+        assert result.should_rotate_credential is True
+
    # ── 5xx that are actually request-validation errors ──
    # Some OpenAI-compatible gateways (e.g. codex.nekos.me) return
    # request-validation failures with a 5xx status. These are