fix(aux): trigger fallback on 429 rate-limit errors in auxiliary client

When a provider returns a 429 rate-limit error (not billing-related), the auxiliary client's call_llm/async_call_llm previously did NOT trigger the fallback chain. This caused auxiliary tasks like session_search to exhaust all 3 retries against the same rate-limited endpoint, losing session metadata that depended on the summarization completing. Root cause: `_is_payment_error()` only matched 429s containing billing keywords ("credits", "insufficient funds", etc.). Provider-specific rate-limit messages like Nous's "Hold up for a bit, you've exceeded the rate limit on your API key" didn't match, so `_is_payment_error` returned False, `_is_connection_error` returned False, and `should_fallback` was False — all retries hit the same rate-limited provider. Fix: - New `_is_rate_limit_error()` function that detects 429 + rate-limit keywords, generic 429 without billing keywords, and OpenAI SDK `RateLimitError` class instances (which may omit .status_code). - Updated `should_fallback` in both `call_llm` and `async_call_llm` to include `_is_rate_limit_error`. - Updated the max_tokens retry path to also check for rate-limit errors. - Updated the reason string to include "rate limit". This complements the Nous rate guard (PR #10568) which prevents new calls to Nous when already rate-limited — this fix handles the case where a request is already in flight when the 429 arrives. Related: #8023, #12554, #11034 Co-authored-by: Zeejay <zjtan1@gmail.com>
2026-05-08 03:01:47 +00:00 · 2026-04-22 01:14:32 +10:00 · 2026-04-22 01:14:32 +10:00 · f8ba265340
commit f8ba265340
parent 8c0f254c06
2 changed files with 155 additions and 8 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -1650,6 +1650,39 @@ def _is_payment_error(exc: Exception) -> bool:
    return False


+def _is_rate_limit_error(exc: Exception) -> bool:
+    """Detect rate-limit errors that warrant provider fallback.
+
+    Returns True for HTTP 429 errors whose message indicates rate limiting
+    (as opposed to billing/quota exhaustion, which _is_payment_error handles).
+    Also catches OpenAI SDK RateLimitError instances that may not set
+    .status_code on the exception object.
+    """
+    status = getattr(exc, "status_code", None)
+    err_lower = str(exc).lower()
+
+    # OpenAI SDK's RateLimitError sometimes omits .status_code —
+    # detect by class name so we don't miss these.  (PR #8023 pattern)
+    if type(exc).__name__ == "RateLimitError":
+        return True
+
+    if status == 429:
+        # Distinguish rate-limit from billing: billing keywords are handled
+        # by _is_payment_error, everything else on 429 is a rate limit.
+        if any(kw in err_lower for kw in (
+            "rate limit", "rate_limit", "too many requests",
+            "try again", "retry after", "resets in",
+        )):
+            return True
+        # Generic 429 without billing keywords = likely a rate limit
+        if not any(kw in err_lower for kw in (
+            "credits", "insufficient funds", "billing",
+            "payment required", "can only afford",
+        )):
+            return True
+    return False
+
+
 def _is_connection_error(exc: Exception) -> bool:
    """Detect connection/network errors that warrant provider fallback.

@ -3542,7 +3575,7 @@ def call_llm(
            except Exception as retry_err:
                # If the max_tokens retry also hits a payment or connection
                # error, fall through to the fallback chain below.
-                if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)):
+                if not (_is_payment_error(retry_err) or _is_connection_error(retry_err) or _is_rate_limit_error(retry_err)):
                    raise
                first_err = retry_err

@ -3625,13 +3658,27 @@ def call_llm(
        # Codex/OAuth tokens that authenticate but whose endpoint is down,
        # and providers the user never configured that got picked up by
        # the auto-detection chain.
-        should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err)
+        #
+        # ── Rate-limit fallback (#13579) ─────────────────────────────
+        # When the provider returns a 429 rate-limit (not billing), fall
+        # back to an alternative provider instead of exhausting retries
+        # against the same rate-limited endpoint.
+        should_fallback = (
+            _is_payment_error(first_err)
+            or _is_connection_error(first_err)
+            or _is_rate_limit_error(first_err)
+        )
        # Only try alternative providers when the user didn't explicitly
        # configure this task's provider.  Explicit provider = hard constraint;
        # auto (the default) = best-effort fallback chain.  (#7559)
        is_auto = resolved_provider in ("auto", "", None)
        if should_fallback and is_auto:
-            reason = "payment error" if _is_payment_error(first_err) else "connection error"
+            if _is_payment_error(first_err):
+                reason = "payment error"
+            elif _is_rate_limit_error(first_err):
+                reason = "rate limit"
+            else:
+                reason = "connection error"
            logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
                        task or "call", reason, resolved_provider, first_err)
            fb_client, fb_model, fb_label = _try_payment_fallback(
@ -3834,7 +3881,7 @@ async def async_call_llm(
            except Exception as retry_err:
                # If the max_tokens retry also hits a payment or connection
                # error, fall through to the fallback chain below.
-                if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)):
+                if not (_is_payment_error(retry_err) or _is_connection_error(retry_err) or _is_rate_limit_error(retry_err)):
                    raise
                first_err = retry_err

@ -3903,11 +3950,20 @@ async def async_call_llm(
                    return _validate_llm_response(
                        await retry_client.chat.completions.create(**retry_kwargs), task)

-        # ── Payment / connection fallback (mirrors sync call_llm) ─────
-        should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err)
+        # ── Payment / connection / rate-limit fallback (mirrors sync call_llm) ──
+        should_fallback = (
+            _is_payment_error(first_err)
+            or _is_connection_error(first_err)
+            or _is_rate_limit_error(first_err)
+        )
        is_auto = resolved_provider in ("auto", "", None)
        if should_fallback and is_auto:
-            reason = "payment error" if _is_payment_error(first_err) else "connection error"
+            if _is_payment_error(first_err):
+                reason = "payment error"
+            elif _is_rate_limit_error(first_err):
+                reason = "rate limit"
+            else:
+                reason = "connection error"
            logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
                        task or "call", reason, resolved_provider, first_err)
            fb_client, fb_model, fb_label = _try_payment_fallback(