From 24461ec0bc6c86611337d621f0ef2ec7d90eba6c Mon Sep 17 00:00:00 2001
From: "Brian D. Evans" <252620095+briandevans@users.noreply.github.com>
Date: Fri, 24 Apr 2026 16:39:48 -0700
Subject: [PATCH 1/2] fix(error-classifier): route 429 'overloaded' messages to
 FailoverReason.overloaded (#15297)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some providers — notably Z.AI / Zhipu — return HTTP 429 with messages
like "The service may be temporarily overloaded, please try again later"
to signal **server-wide overload**, not per-credential rate limiting.
The two conditions need different recovery strategies:

| Reason | Correct strategy |
|---|---|
| ``rate_limit`` | Retry same credential once (the limit may have reset), then rotate |
| ``overloaded`` | Skip retry, rotate immediately (the whole endpoint is saturated) |

Before this fix:

- ``error_classifier.py`` mapped every 429 to ``FailoverReason.rate_limit``
  regardless of message body.
- ``FailoverReason.overloaded`` already existed as an enum value (and was
  produced for 503/529) but no production path emitted it for 429.
- ``_recover_with_credential_pool`` had no handler for ``overloaded`` —
  an ``overloaded`` classification fell through to the default no-op
  ``return False, has_retried_429`` line.

Net effect: every overload-language 429 burned a ``has_retried_429``
slot on the same saturated credential before the rotation happened, and
cron jobs (one turn each) often used their entire execution on that
wasted retry.

### Fix

Two narrow, additive changes:

1. ``agent/error_classifier.py`` — new ``_OVERLOADED_PATTERNS`` list
   containing provider-language overload phrases (``"temporarily
   overloaded"``, ``"server is overloaded"``, ``"at capacity"``, …).
   The 429 branch now checks the error body against this list and
   emits ``FailoverReason.overloaded`` when matched, preserving
   ``rate_limit`` for everything else.  Kept phrases narrow and
   provider-language-flavoured so a normal rate-limit message
   ("you have been rate-limited") doesn't hit this bucket.

2. ``run_agent.py::_recover_with_credential_pool`` — new branch for
   ``effective_reason == FailoverReason.overloaded``.  Same shape as
   the existing ``billing`` handler: rotate immediately via
   ``mark_exhausted_and_rotate(...)``, no retry-on-same-credential
   first.  The 503/529 ``overloaded`` classifications produced by the
   existing code now also flow through this branch.

### Tests (15 new, all passing on py3.11 venv)

``tests/agent/test_error_classifier.py`` — 7 cases:

- The exact #15297 Z.AI message → ``overloaded``
- 9-phrase parametrised matrix (server/service/upstream overloaded,
  at/over capacity, "currently overloaded", …) → all ``overloaded``
- Plain "Rate limit reached for requests per minute" 429 → still
  ``rate_limit`` (regression guard against the disambiguation
  silently broadening)
- Plain "Too Many Requests" 429 → still ``rate_limit``
- 503 path unchanged → still ``overloaded`` (negative control)

``tests/agent/test_credential_pool_routing.py`` —
``TestPoolOverloadedRotation`` (4 cases):

- Overloaded on first failure → rotates immediately (the fix)
- Overloaded with ``has_retried_429=True`` → still rotates (no retry-
  flag dependence — the rate-limit gate is specific to ``rate_limit``)
- Single-entry pool overload → ``recovered=False`` so outer fallback
  takes over rather than spinning
- ``rate_limit`` on first failure → still uses retry-first path
  (regression guard against the new branch broadening)

**Verified regression guards**: temporarily reverted the classifier's
overload branch; 10 of the 11 new classifier tests correctly failed
with clear assertion messages.  Restored fix → all 145 tests pass
(``test_error_classifier.py`` + ``test_credential_pool_routing.py``,
existing + new).

Closes #15297

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 agent/error_classifier.py                   |  38 +++++-
 run_agent.py                                |  18 +++
 tests/agent/test_credential_pool_routing.py | 131 ++++++++++++++++++++
 tests/agent/test_error_classifier.py        |  79 ++++++++++++
 4 files changed, 265 insertions(+), 1 deletion(-)

diff --git a/agent/error_classifier.py b/agent/error_classifier.py
index 87324d676..a6ccba061 100644
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -147,6 +147,29 @@ _PAYLOAD_TOO_LARGE_PATTERNS = [
     "error code: 413",
 ]
 
+# Patterns that indicate provider-side overload, NOT per-credential rate
+# limiting.  Some providers (e.g. Z.AI / Zhipu) return HTTP 429 for
+# server-wide overload — same status code as a true rate limit but the
+# correct recovery is "rotate immediately, the whole endpoint is down"
+# rather than "back off and retry the same key".  Match against the
+# message body alongside the 429 status code in the classifier.  Keep
+# phrases narrow and provider-language-flavoured so a normal rate-limit
+# message ("you have been rate-limited") doesn't accidentally hit this
+# bucket — only true overload language does (#15297).
+_OVERLOADED_PATTERNS = [
+    "temporarily overloaded",
+    "server is overloaded",
+    "server overloaded",
+    "service overloaded",
+    "service is overloaded",
+    "service may be temporarily overloaded",
+    "upstream overloaded",
+    "is overloaded, please try again",
+    "at capacity",
+    "over capacity",
+    "currently overloaded",
+]
+
 # Context overflow patterns
 _CONTEXT_OVERFLOW_PATTERNS = [
     "context length",
@@ -589,7 +612,20 @@ def _classify_by_status(
         )
 
     if status_code == 429:
-        # Already checked long_context_tier above; this is a normal rate limit
+        # Already checked long_context_tier above.  Some providers (notably
+        # Z.AI / Zhipu) reuse 429 for server-wide overload — correct recovery
+        # is "skip retry, rotate immediately" rather than "back off, then
+        # rotate" (which is the right answer for a per-credential rate
+        # limit).  Disambiguate on the error body so the retry loop picks
+        # the matching strategy via the corresponding ``FailoverReason``
+        # handler (#15297).
+        if any(p in error_msg for p in _OVERLOADED_PATTERNS):
+            return result_fn(
+                FailoverReason.overloaded,
+                retryable=True,
+                should_rotate_credential=True,
+                should_fallback=True,
+            )
         return result_fn(
             FailoverReason.rate_limit,
             retryable=True,
diff --git a/run_agent.py b/run_agent.py
index 6770f568c..0e86b5228 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -5588,6 +5588,24 @@ class AIAgent:
                 return True, False
             return False, True
 
+        if effective_reason == FailoverReason.overloaded:
+            # Provider-side overload (e.g. Z.AI 429 "service is temporarily
+            # overloaded"): the whole endpoint is struggling, not just this
+            # API key — retrying the same credential burns a slot and
+            # delays recovery.  Rotate immediately, same shape as
+            # ``billing`` does, no retry-on-same-credential first (#15297).
+            rotate_status = status_code if status_code is not None else 429
+            next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
+            if next_entry is not None:
+                logger.info(
+                    "Credential %s (provider overloaded) — rotated to pool entry %s",
+                    rotate_status,
+                    getattr(next_entry, "id", "?"),
+                )
+                self._swap_credential(next_entry)
+                return True, False
+            return False, has_retried_429
+
         if effective_reason == FailoverReason.auth:
             refreshed = pool.try_refresh_current()
             if refreshed is not None:
diff --git a/tests/agent/test_credential_pool_routing.py b/tests/agent/test_credential_pool_routing.py
index 8477fdb64..3978fe7ed 100644
--- a/tests/agent/test_credential_pool_routing.py
+++ b/tests/agent/test_credential_pool_routing.py
@@ -232,3 +232,134 @@ class TestPoolRotationCycle:
         )
         assert recovered is False
         assert has_retried is False
+
+
+# ---------------------------------------------------------------------------
+# 6. FailoverReason.overloaded handler (#15297)
+#
+# When the classifier produces ``overloaded`` (e.g. Z.AI 429 "service is
+# temporarily overloaded"), the recovery path must rotate immediately —
+# the whole endpoint is down, retrying the same credential burns a slot
+# and delays recovery.  Same shape as ``billing``, distinct from
+# ``rate_limit`` which retries-first.
+# ---------------------------------------------------------------------------
+
+
+class TestPoolOverloadedRotation:
+    """Verify ``FailoverReason.overloaded`` rotates immediately, no retry-first."""
+
+    def _make_agent_with_pool(self, pool_entries=3):
+        """Same factory shape as TestPoolRotationCycle — keep them parallel
+        so a future refactor of the agent's pool seam updates both
+        classes uniformly."""
+        from run_agent import AIAgent
+
+        with patch.object(AIAgent, "__init__", lambda self, **kw: None):
+            agent = AIAgent()
+
+        entries = []
+        for i in range(pool_entries):
+            e = MagicMock(name=f"entry_{i}")
+            e.id = f"cred-{i}"
+            entries.append(e)
+
+        pool = MagicMock()
+        pool.has_credentials.return_value = True
+
+        self._rotation_index = 0
+
+        def rotate(status_code=None, error_context=None):
+            self._rotation_index += 1
+            if self._rotation_index < pool_entries:
+                return entries[self._rotation_index]
+            pool.has_credentials.return_value = False
+            return None
+
+        pool.mark_exhausted_and_rotate = MagicMock(side_effect=rotate)
+        agent._credential_pool = pool
+        agent._swap_credential = MagicMock()
+        agent.log_prefix = ""
+
+        return agent, pool, entries
+
+    def test_overloaded_rotates_immediately_on_first_failure(self):
+        """The #15297 fix: ``overloaded`` must NOT use the rate-limit
+        retry-first path.  First failure rotates straight away because
+        retrying the same credential against a saturated provider just
+        burns the next ``has_retried_429`` slot too."""
+        from agent.error_classifier import FailoverReason
+
+        agent, pool, entries = self._make_agent_with_pool(3)
+        recovered, has_retried = agent._recover_with_credential_pool(
+            status_code=429,
+            has_retried_429=False,  # first failure
+            classified_reason=FailoverReason.overloaded,
+        )
+        assert recovered is True, (
+            "overloaded must rotate on first failure, like billing — "
+            "not wait for has_retried_429 like rate_limit does"
+        )
+        assert has_retried is False
+        pool.mark_exhausted_and_rotate.assert_called_once_with(
+            status_code=429, error_context=None,
+        )
+        agent._swap_credential.assert_called_once_with(entries[1])
+
+    def test_overloaded_does_not_block_on_has_retried_flag(self):
+        """The retry-first gate is specific to rate_limit.  An
+        ``overloaded`` reason must rotate regardless of whether the
+        retry flag is currently set or unset — same behaviour both
+        ways, no state leak from a previous rate-limit cycle."""
+        from agent.error_classifier import FailoverReason
+
+        agent, pool, entries = self._make_agent_with_pool(3)
+        recovered, has_retried = agent._recover_with_credential_pool(
+            status_code=429,
+            has_retried_429=True,  # already retried — should still rotate
+            classified_reason=FailoverReason.overloaded,
+        )
+        assert recovered is True
+        assert has_retried is False  # reset after rotation
+        pool.mark_exhausted_and_rotate.assert_called_once()
+
+    def test_overloaded_pool_exhaustion_returns_false(self):
+        """When the pool only has one credential and it overloads, the
+        rotation step returns no next entry — the recovery call must
+        report ``recovered=False`` so the caller falls through to its
+        fallback chain rather than spinning."""
+        from agent.error_classifier import FailoverReason
+
+        agent, _pool, _ = self._make_agent_with_pool(1)
+        recovered, has_retried = agent._recover_with_credential_pool(
+            status_code=429,
+            has_retried_429=False,
+            classified_reason=FailoverReason.overloaded,
+        )
+        assert recovered is False, (
+            "single-entry pool that overloads must return recovered=False "
+            "so the outer fallback chain takes over"
+        )
+        # has_retried_429 is propagated unchanged in the no-rotation
+        # branch — pinned so a future tweak to the contract surfaces
+        # via this test.
+        assert has_retried is False
+
+    def test_rate_limit_still_uses_retry_first(self):
+        """Negative control: ``rate_limit`` must KEEP its retry-first
+        semantics.  A regression that broadened the overloaded handler
+        to also catch rate_limit would silently double-rotate every
+        per-key throttle and burn the pool fast."""
+        from agent.error_classifier import FailoverReason
+
+        agent, pool, _ = self._make_agent_with_pool(3)
+        recovered, has_retried = agent._recover_with_credential_pool(
+            status_code=429,
+            has_retried_429=False,
+            classified_reason=FailoverReason.rate_limit,
+        )
+        assert recovered is False, (
+            "rate_limit on first failure must NOT rotate — the retry-"
+            "first path is the contract"
+        )
+        assert has_retried is True
+        pool.mark_exhausted_and_rotate.assert_not_called()
diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py
index e8a92774b..4bf3f8c8a 100644
--- a/tests/agent/test_error_classifier.py
+++ b/tests/agent/test_error_classifier.py
@@ -250,6 +250,85 @@ class TestClassifyApiError:
         assert result.reason == FailoverReason.rate_limit
         assert result.should_fallback is True
 
+    # ── 429 overload disambiguation (#15297) ──
+    #
+    # Some providers (notably Z.AI / Zhipu) reuse 429 for server-side
+    # overload rather than per-credential rate limiting.  Recovery for
+    # these is "rotate immediately, the whole endpoint is down" — not
+    # the rate-limit "back off, retry same key once, then rotate" path.
+    # The classifier disambiguates on the message body so the retry loop
+    # picks the right ``FailoverReason`` handler.
+
+    def test_429_zai_temporarily_overloaded(self):
+        """The exact message from #15297's Z.AI / Zhipu reproduction."""
+        msg = (
+            "HTTP 429: The service may be temporarily overloaded, "
+            "please try again later"
+        )
+        e = MockAPIError(msg, status_code=429)
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.overloaded, (
+            f"Z.AI overload message must classify as overloaded, got "
+            f"{result.reason!r} — recovery would burn a same-credential "
+            f"retry instead of rotating immediately"
+        )
+        assert result.should_rotate_credential is True
+        assert result.should_fallback is True
+        assert result.retryable is True
+
+    @pytest.mark.parametrize("phrase", [
+        "Service is overloaded, please try again later",
+        "server is overloaded",
+        "server overloaded — retry",
+        "Service overloaded",
+        "Upstream overloaded",
+        "API is currently overloaded",
+        "is overloaded, please try again",
+        "The endpoint is at capacity",
+        "Provider over capacity",
+    ])
+    def test_429_overload_phrases_route_to_overloaded(self, phrase):
+        """Phrase matrix: any provider-language overload phrase on a 429
+        must route to ``overloaded`` not ``rate_limit``.  Parametrised
+        so a future provider quirk that reads "overloaded" but is really
+        rate limiting can be surfaced and reclassified narrowly."""
+        e = MockAPIError(phrase, status_code=429)
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.overloaded, (
+            f"phrase {phrase!r} should map to overloaded; got {result.reason!r}"
+        )
+
+    def test_429_plain_rate_limit_remains_rate_limit(self):
+        """Regression guard: a plain 429 with rate-limit-flavoured text
+        (no overload phrase) must STILL classify as rate_limit so the
+        backoff-then-rotate path keeps working for normal per-key
+        throttling.  Without this guard, the disambiguation could
+        silently broaden and break every 429 recovery."""
+        e = MockAPIError(
+            "Rate limit reached for requests per minute", status_code=429,
+        )
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.rate_limit, (
+            "vanilla rate-limit message must NOT route to overloaded — "
+            "the recovery strategies are different (retry-then-rotate vs "
+            "rotate-immediately)"
+        )
+
+    def test_429_too_many_requests_remains_rate_limit(self):
+        """The literal HTTP 429 reason phrase ('Too Many Requests') must
+        route to rate_limit too — it carries no overload language."""
+        e = MockAPIError("Too Many Requests", status_code=429)
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.rate_limit
+
+    def test_503_overloaded_path_unchanged(self):
+        """Sanity check: HTTP 503 already mapped to ``overloaded`` and
+        must keep doing so.  Adding the 429 overload branch must not
+        accidentally regress the 503 path."""
+        e = MockAPIError("Service Unavailable", status_code=503)
+        result = classify_api_error(e)
+        assert result.reason == FailoverReason.overloaded
+
     def test_alibaba_rate_increased_too_quickly(self):
         """Alibaba/DashScope returns a unique throttling message.
 

From b0d5d63d9da4e5b14a91178504dffa6fef2410d9 Mon Sep 17 00:00:00 2001
From: "Brian D. Evans" <252620095+briandevans@users.noreply.github.com>
Date: Fri, 24 Apr 2026 17:47:31 -0700
Subject: [PATCH 2/2] =?UTF-8?q?log:=20clarify=20credential-rotation=20mess?=
 =?UTF-8?q?ages=20=E2=80=94=20HTTP=20%s=20instead=20of=20"Credential=20%s"?=
 =?UTF-8?q?=20(Copilot=20#15414)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot's review on #15414 flagged that the rotation log lines read
``"Credential %s (billing) — rotated to pool entry %s"`` where the
first ``%s`` is bound to ``rotate_status`` (an HTTP status code, e.g.
429), not a credential ID.  Reading the literal log line —
``"Credential 429 (billing) — rotated to pool entry cred-2"`` — would
suggest "credential 429" was rotated, which is meaningless and
actively confusing during incident debugging.

Same issue existed on the three sibling branches (billing,
rate_limit, auth refresh-failed) plus the new overloaded branch this
PR adds.  Drive-by fix all four with the same ``HTTP %s (reason) —
rotated credential to pool entry %s`` shape so:

  HTTP 429 (provider overloaded) — rotated credential to pool entry cred-2

reads correctly as "we got HTTP 429, classified it as provider
overloaded, rotated to credential cred-2".

No behaviour change.  145/145 tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 run_agent.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/run_agent.py b/run_agent.py
index 0e86b5228..26bfd3f4d 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -5565,7 +5565,7 @@ class AIAgent:
             next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
             if next_entry is not None:
                 logger.info(
-                    "Credential %s (billing) — rotated to pool entry %s",
+                    "HTTP %s (billing) — rotated credential to pool entry %s",
                     rotate_status,
                     getattr(next_entry, "id", "?"),
                 )
@@ -5580,7 +5580,7 @@ class AIAgent:
             next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
             if next_entry is not None:
                 logger.info(
-                    "Credential %s (rate limit) — rotated to pool entry %s",
+                    "HTTP %s (rate limit) — rotated credential to pool entry %s",
                     rotate_status,
                     getattr(next_entry, "id", "?"),
                 )
@@ -5598,7 +5598,7 @@ class AIAgent:
             next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
             if next_entry is not None:
                 logger.info(
-                    "Credential %s (provider overloaded) — rotated to pool entry %s",
+                    "HTTP %s (provider overloaded) — rotated credential to pool entry %s",
                     rotate_status,
                     getattr(next_entry, "id", "?"),
                 )
@@ -5618,7 +5618,7 @@ class AIAgent:
             next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
             if next_entry is not None:
                 logger.info(
-                    "Credential %s (auth refresh failed) — rotated to pool entry %s",
+                    "HTTP %s (auth refresh failed) — rotated credential to pool entry %s",
                     rotate_status,
                     getattr(next_entry, "id", "?"),
                 )