fix(auxiliary): detect quota exhaustion as payment error; allow capacity-error fallback for explicit providers

Closes #26803

Root causes:
1. _is_payment_error() checked for billing keywords (credits, insufficient
   funds, billing, payment required) but missed daily token quota exhaustion
   phrases used by Bedrock, Vertex AI, and LiteLLM proxies — e.g.
   'Too many tokens per day', 'quota exceeded', 'resource exhausted',
   'daily limit'. These are functionally identical to credit exhaustion
   (provider cannot serve the request) but don't trigger fallback.

2. The call_llm() fallback chain was gated on resolved_provider == 'auto'.
   When a task resolves to a specific provider (e.g. 'custom' for a LiteLLM
   proxy, or 'openrouter'), capacity failures (payment/quota/connection)
   silently raise instead of trying alternatives. This is overly conservative:
   capacity errors mean the provider *cannot* serve the request regardless of
   user intent, so alternatives should always be tried.

Fixes:
- Add quota-related keywords to _is_payment_error(): quota exceeded,
  quota_exceeded, too many tokens per day, daily limit, tokens per day,
  daily quota, resource exhausted (Vertex AI gRPC code).
- Allow fallback for capacity errors (payment + connection) even when
  resolved_provider is not 'auto'. Rate-limit fallback stays gated on
  is_auto to honour explicit provider constraints for transient limits.
- Apply both fixes to the sync call_llm() and async async_call_llm() paths.
- Add 6 targeted tests for the new quota-error detection cases.
This commit is contained in:
Bartok9 2026-05-16 03:36:36 -04:00 committed by Teknium
parent 569bc94b59
commit 24c209f112
2 changed files with 72 additions and 9 deletions

View file

@ -2096,7 +2096,13 @@ def _is_payment_error(exc: Exception) -> bool:
"""Detect payment/credit/quota exhaustion errors.
Returns True for HTTP 402 (Payment Required) and for 429/other errors
whose message indicates billing exhaustion rather than rate limiting.
whose message indicates billing exhaustion or daily quota exhaustion
rather than transient rate limiting.
Daily token quota errors (e.g. Bedrock "Too many tokens per day",
Vertex AI "quota exceeded") are functionally equivalent to credit
exhaustion the provider cannot serve the request until the quota
resets and should trigger the same provider-fallback logic.
"""
status = getattr(exc, "status_code", None)
if status == 402:
@ -2104,10 +2110,19 @@ def _is_payment_error(exc: Exception) -> bool:
err_lower = str(exc).lower()
# OpenRouter and other providers include "credits" or "afford" in 402 bodies,
# but sometimes wrap them in 429 or other codes.
# Daily quota exhaustion from Bedrock, Vertex AI, and similar providers
# uses different language but is semantically identical to credit exhaustion.
if status in {402, 429, None}:
if any(kw in err_lower for kw in ("credits", "insufficient funds",
"can only afford", "billing",
"payment required")):
if any(kw in err_lower for kw in (
"credits", "insufficient funds",
"can only afford", "billing",
"payment required",
# Daily / monthly quota exhaustion keywords
"quota exceeded", "quota_exceeded",
"too many tokens per day", "daily limit",
"tokens per day", "daily quota",
"resource exhausted", # Vertex AI / gRPC quota errors
)):
return True
return False
@ -4538,11 +4553,17 @@ def call_llm(
or _is_connection_error(first_err)
or _is_rate_limit_error(first_err)
)
# Only try alternative providers when the user didn't explicitly
# configure this task's provider. Explicit provider = hard constraint;
# auto (the default) = best-effort fallback chain. (#7559)
# Respect explicit provider choice for transient rate limits, but allow
# fallback when the provider clearly cannot serve the request due to
# capacity: payment/quota exhaustion and connection failures are capacity
# problems, not request constraints.
# See #26803: daily token quota (429 + "too many tokens per day") must
# fall back just like a 402 credit error.
is_auto = resolved_provider in {"auto", "", None}
if should_fallback and is_auto:
# Capacity errors bypass the explicit-provider gate: the provider
# literally cannot serve this request regardless of user intent.
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
if should_fallback and (is_auto or is_capacity_error):
if _is_payment_error(first_err):
reason = "payment error"
# Resolve the actual provider label (resolved_provider may be
@ -4870,8 +4891,12 @@ async def async_call_llm(
or _is_connection_error(first_err)
or _is_rate_limit_error(first_err)
)
# Capacity errors (payment/quota/connection) bypass the explicit-provider
# gate — the provider cannot serve the request regardless of user intent.
# See #26803: daily token quota must fall back like a 402 credit error.
is_auto = resolved_provider in {"auto", "", None}
if should_fallback and is_auto:
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
if should_fallback and (is_auto or is_capacity_error):
if _is_payment_error(first_err):
reason = "payment error"
_mark_provider_unhealthy(

View file

@ -925,6 +925,44 @@ class TestIsPaymentError:
exc = Exception("connection reset")
assert _is_payment_error(exc) is False
# ── Daily / monthly quota exhaustion (#26803) ────────────────────────────
def test_429_quota_exceeded(self):
    """A 429 whose message reports an exceeded quota is a payment error."""
    err = Exception("RESOURCE_EXHAUSTED: quota exceeded for project")
    err.status_code = 429
    verdict = _is_payment_error(err)
    assert verdict is True
def test_429_too_many_tokens_per_day(self):
    """A 429 carrying a 'Too many tokens per day' message is a payment error."""
    err = Exception("Too many tokens per day: 1000000 used, 1000000 limit")
    err.status_code = 429
    verdict = _is_payment_error(err)
    assert verdict is True
def test_429_daily_limit_phrase(self):
    """A 429 whose message mentions a 'daily limit' is a payment error."""
    err = Exception("You have exceeded your daily limit.")
    err.status_code = 429
    verdict = _is_payment_error(err)
    assert verdict is True
def test_429_resource_exhausted_grpc(self):
    """A 429 whose message reads 'resource exhausted' is a payment error."""
    err = Exception("resource exhausted")
    err.status_code = 429
    verdict = _is_payment_error(err)
    assert verdict is True
def test_429_daily_quota_phrase(self):
    """A 429 whose message mentions a 'daily quota' is a payment error."""
    err = Exception("Daily quota of 500 requests reached.")
    err.status_code = 429
    verdict = _is_payment_error(err)
    assert verdict is True
def test_429_transient_rate_limit_not_quota(self):
    """A plain 429 rate limit with no quota wording is not a payment error."""
    err = Exception("Rate limit exceeded. Retry after 10s.")
    err.status_code = 429
    verdict = _is_payment_error(err)
    assert verdict is False
class TestIsRateLimitError:
"""_is_rate_limit_error detects 429 rate-limit errors warranting fallback."""