From f8ba265340e1cb218f1e2e8f3820d2746af4ee5d Mon Sep 17 00:00:00 2001 From: Zeejay Date: Wed, 22 Apr 2026 01:14:32 +1000 Subject: [PATCH] fix(aux): trigger fallback on 429 rate-limit errors in auxiliary client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a provider returns a 429 rate-limit error (not billing-related), the auxiliary client's call_llm/async_call_llm previously did NOT trigger the fallback chain. This caused auxiliary tasks like session_search to exhaust all 3 retries against the same rate-limited endpoint, losing session metadata that depended on the summarization completing. Root cause: `_is_payment_error()` only matched 429s containing billing keywords ("credits", "insufficient funds", etc.). Provider-specific rate-limit messages like Nous's "Hold up for a bit, you've exceeded the rate limit on your API key" didn't match, so `_is_payment_error` returned False, `_is_connection_error` returned False, and `should_fallback` was False — all retries hit the same rate-limited provider. Fix: - New `_is_rate_limit_error()` function that detects 429 + rate-limit keywords, generic 429 without billing keywords, and OpenAI SDK `RateLimitError` class instances (which may omit .status_code). - Updated `should_fallback` in both `call_llm` and `async_call_llm` to include `_is_rate_limit_error`. - Updated the max_tokens retry path to also check for rate-limit errors. - Updated the reason string to include "rate limit". This complements the Nous rate guard (PR #10568) which prevents new calls to Nous when already rate-limited — this fix handles the case where a request is already in flight when the 429 arrives. Related: #8023, #12554, #11034 Co-authored-by: Zeejay --- agent/auxiliary_client.py | 70 ++++++++++++++++++--- tests/agent/test_auxiliary_client.py | 93 +++++++++++++++++++++++++++- 2 files changed, 155 insertions(+), 8 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 50f6f31fa5..54a1d63a7f 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -1650,6 +1650,39 @@ def _is_payment_error(exc: Exception) -> bool: return False +def _is_rate_limit_error(exc: Exception) -> bool: + """Detect rate-limit errors that warrant provider fallback. + + Returns True for HTTP 429 errors whose message indicates rate limiting + (as opposed to billing/quota exhaustion, which _is_payment_error handles). + Also catches OpenAI SDK RateLimitError instances that may not set + .status_code on the exception object. + """ + status = getattr(exc, "status_code", None) + err_lower = str(exc).lower() + + # OpenAI SDK's RateLimitError sometimes omits .status_code — + # detect by class name so we don't miss these. (PR #8023 pattern) + if type(exc).__name__ == "RateLimitError": + return True + + if status == 429: + # Distinguish rate-limit from billing: billing keywords are handled + # by _is_payment_error, everything else on 429 is a rate limit. + if any(kw in err_lower for kw in ( + "rate limit", "rate_limit", "too many requests", + "try again", "retry after", "resets in", + )): + return True + # Generic 429 without billing keywords = likely a rate limit + if not any(kw in err_lower for kw in ( + "credits", "insufficient funds", "billing", + "payment required", "can only afford", + )): + return True + return False + + def _is_connection_error(exc: Exception) -> bool: """Detect connection/network errors that warrant provider fallback. @@ -3542,7 +3575,7 @@ def call_llm( except Exception as retry_err: # If the max_tokens retry also hits a payment or connection # error, fall through to the fallback chain below. - if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)): + if not (_is_payment_error(retry_err) or _is_connection_error(retry_err) or _is_rate_limit_error(retry_err)): raise first_err = retry_err @@ -3625,13 +3658,27 @@ def call_llm( # Codex/OAuth tokens that authenticate but whose endpoint is down, # and providers the user never configured that got picked up by # the auto-detection chain. - should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err) + # + # ── Rate-limit fallback (#13579) ───────────────────────────── + # When the provider returns a 429 rate-limit (not billing), fall + # back to an alternative provider instead of exhausting retries + # against the same rate-limited endpoint. + should_fallback = ( + _is_payment_error(first_err) + or _is_connection_error(first_err) + or _is_rate_limit_error(first_err) + ) # Only try alternative providers when the user didn't explicitly # configure this task's provider. Explicit provider = hard constraint; # auto (the default) = best-effort fallback chain. (#7559) is_auto = resolved_provider in ("auto", "", None) if should_fallback and is_auto: - reason = "payment error" if _is_payment_error(first_err) else "connection error" + if _is_payment_error(first_err): + reason = "payment error" + elif _is_rate_limit_error(first_err): + reason = "rate limit" + else: + reason = "connection error" logger.info("Auxiliary %s: %s on %s (%s), trying fallback", task or "call", reason, resolved_provider, first_err) fb_client, fb_model, fb_label = _try_payment_fallback( @@ -3834,7 +3881,7 @@ async def async_call_llm( except Exception as retry_err: # If the max_tokens retry also hits a payment or connection # error, fall through to the fallback chain below. - if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)): + if not (_is_payment_error(retry_err) or _is_connection_error(retry_err) or _is_rate_limit_error(retry_err)): raise first_err = retry_err @@ -3903,11 +3950,20 @@ async def async_call_llm( return _validate_llm_response( await retry_client.chat.completions.create(**retry_kwargs), task) - # ── Payment / connection fallback (mirrors sync call_llm) ───── - should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err) + # ── Payment / connection / rate-limit fallback (mirrors sync call_llm) ── + should_fallback = ( + _is_payment_error(first_err) + or _is_connection_error(first_err) + or _is_rate_limit_error(first_err) + ) is_auto = resolved_provider in ("auto", "", None) if should_fallback and is_auto: - reason = "payment error" if _is_payment_error(first_err) else "connection error" + if _is_payment_error(first_err): + reason = "payment error" + elif _is_rate_limit_error(first_err): + reason = "rate limit" + else: + reason = "connection error" logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback", task or "call", reason, resolved_provider, first_err) fb_client, fb_model, fb_label = _try_payment_fallback( diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index cf75d0b1f8..55a7e969e1 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -20,6 +20,7 @@ from agent.auxiliary_client import ( _read_codex_access_token, _get_provider_chain, _is_payment_error, + _is_rate_limit_error, _normalize_aux_provider, _try_payment_fallback, _resolve_auto, @@ -789,6 +790,65 @@ class TestIsPaymentError: assert _is_payment_error(exc) is False +class TestIsRateLimitError: + """_is_rate_limit_error detects 429 rate-limit errors warranting fallback.""" + + def test_429_with_rate_limit_message(self): + exc = Exception("Rate limit exceeded, try again in 2 seconds") + exc.status_code = 429 + assert _is_rate_limit_error(exc) is True + + def test_429_with_resets_in_message(self): + """Nous-style 429: 'resets in 3508s'.""" + exc = Exception("Hold up for a bit, you've exceeded the rate limit on your API key") + exc.status_code = 429 + assert _is_rate_limit_error(exc) is True + + def test_429_with_too_many_requests(self): + exc = Exception("Too many requests") + exc.status_code = 429 + assert _is_rate_limit_error(exc) is True + + def test_429_without_billing_keywords_is_rate_limit(self): + """Generic 429 without billing keywords = likely a rate limit.""" + exc = Exception("Something went wrong") + exc.status_code = 429 + assert _is_rate_limit_error(exc) is True + + def test_429_with_credits_message_is_not_rate_limit(self): + """Billing-related 429 should NOT be classified as rate limit.""" + exc = Exception("insufficient credits remaining") + exc.status_code = 429 + assert _is_rate_limit_error(exc) is False + + def test_429_with_billing_message_is_not_rate_limit(self): + exc = Exception("you can only afford 1000 tokens") + exc.status_code = 429 + assert _is_rate_limit_error(exc) is False + + def test_402_is_not_rate_limit(self): + exc = Exception("Payment Required") + exc.status_code = 402 + assert _is_rate_limit_error(exc) is False + + def test_500_is_not_rate_limit(self): + exc = Exception("Internal Server Error") + exc.status_code = 500 + assert _is_rate_limit_error(exc) is False + + def test_openai_ratelimiterror_classname(self): + """OpenAI SDK RateLimitError may omit .status_code — detect by class name.""" + class RateLimitError(Exception): + pass + exc = RateLimitError("rate limit exceeded") + # No status_code set, but class name matches + assert _is_rate_limit_error(exc) is True + + def test_no_status_code_no_keywords_is_not_rate_limit(self): + exc = Exception("connection reset") + assert _is_rate_limit_error(exc) is False + + class TestGetProviderChain: """_get_provider_chain() resolves functions at call time (testable).""" @@ -860,13 +920,18 @@ class TestTryPaymentFallback: class TestCallLlmPaymentFallback: - """call_llm() retries with a different provider on 402 / payment errors.""" + """call_llm() retries with a different provider on 402 / payment / rate-limit errors.""" def _make_402_error(self, msg="Payment Required: insufficient credits"): exc = Exception(msg) exc.status_code = 402 return exc + def _make_429_rate_limit_error(self, msg="Rate limit exceeded, try again in 60 seconds"): + exc = Exception(msg) + exc.status_code = 429 + return exc + def test_non_payment_error_not_caught(self, monkeypatch): """Non-payment/non-connection errors (500) should NOT trigger fallback.""" monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") @@ -886,6 +951,32 @@ class TestCallLlmPaymentFallback: messages=[{"role": "user", "content": "hello"}], ) + def test_429_rate_limit_triggers_fallback(self, monkeypatch): + """429 rate-limit errors should trigger fallback to next provider.""" + monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") + + primary_client = MagicMock() + rate_err = self._make_429_rate_limit_error() + primary_client.chat.completions.create.side_effect = rate_err + + fallback_client = MagicMock() + fallback_client.chat.completions.create.return_value = MagicMock(choices=[ + MagicMock(message=MagicMock(content="fallback response")) + ]) + + with patch("agent.auxiliary_client._get_cached_client", + return_value=(primary_client, "xiaomi/mimo-v2-pro")), \ + patch("agent.auxiliary_client._resolve_task_provider_model", + return_value=("auto", "xiaomi/mimo-v2-pro", None, None, None)), \ + patch("agent.auxiliary_client._try_payment_fallback", + return_value=(fallback_client, "fallback-model", "openrouter")): + result = call_llm( + task="session_search", + messages=[{"role": "user", "content": "hello"}], + ) + # Fallback client should have been used + assert fallback_client.chat.completions.create.called + # --------------------------------------------------------------------------- # Gate: _resolve_api_key_provider must skip anthropic when not configured # ---------------------------------------------------------------------------