fix(auxiliary): detect quota exhaustion as payment error; allow capacity-error fallback for explicit providers

Closes #26803 Root causes: 1. _is_payment_error() checked for billing keywords (credits, insufficient funds, billing, payment required) but missed daily token quota exhaustion phrases used by Bedrock, Vertex AI, and LiteLLM proxies — e.g. 'Too many tokens per day', 'quota exceeded', 'resource exhausted', 'daily limit'. These are functionally identical to credit exhaustion (provider cannot serve the request) but don't trigger fallback. 2. The call_llm() fallback chain was gated on resolved_provider == 'auto'. When a task resolves to a specific provider (e.g. 'custom' for a LiteLLM proxy, or 'openrouter'), capacity failures (payment/quota/connection) silently raise instead of trying alternatives. This is overly conservative: capacity errors mean the provider *cannot* serve the request regardless of user intent, so alternatives should always be tried. Fixes: - Add quota-related keywords to _is_payment_error(): quota_exceeded, too many tokens per day, daily limit, tokens per day, daily quota, resource exhausted (Vertex AI gRPC code). - Allow fallback for capacity errors (payment + connection) even when resolved_provider is not 'auto'. Rate-limit fallback stays gated on is_auto to honour explicit provider constraints for transient limits. - Apply both fixes to sync call_llm() and async acall_llm() paths. - Add 6 targeted tests for the new quota-error detection cases.
2026-07-15 14:22:43 +00:00 · 2026-05-16 03:36:36 -04:00 · 2026-05-16 03:36:36 -04:00 · 24c209f112
commit 24c209f112
parent 569bc94b59
2 changed files with 72 additions and 9 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -2096,7 +2096,13 @@ def _is_payment_error(exc: Exception) -> bool:
    """Detect payment/credit/quota exhaustion errors.

    Returns True for HTTP 402 (Payment Required) and for 429/other errors
-    whose message indicates billing exhaustion rather than rate limiting.
+    whose message indicates billing exhaustion or daily quota exhaustion
+    rather than transient rate limiting.
+
+    Daily token quota errors (e.g. Bedrock "Too many tokens per day",
+    Vertex AI "quota exceeded") are functionally equivalent to credit
+    exhaustion — the provider cannot serve the request until the quota
+    resets — and should trigger the same provider-fallback logic.
    """
    status = getattr(exc, "status_code", None)
    if status == 402:
@ -2104,10 +2110,19 @@ def _is_payment_error(exc: Exception) -> bool:
    err_lower = str(exc).lower()
    # OpenRouter and other providers include "credits" or "afford" in 402 bodies,
    # but sometimes wrap them in 429 or other codes.
+    # Daily quota exhaustion from Bedrock, Vertex AI, and similar providers
+    # uses different language but is semantically identical to credit exhaustion.
    if status in {402, 429, None}:
-        if any(kw in err_lower for kw in ("credits", "insufficient funds",
-                                           "can only afford", "billing",
-                                           "payment required")):
+        if any(kw in err_lower for kw in (
+            "credits", "insufficient funds",
+            "can only afford", "billing",
+            "payment required",
+            # Daily / monthly quota exhaustion keywords
+            "quota exceeded", "quota_exceeded",
+            "too many tokens per day", "daily limit",
+            "tokens per day", "daily quota",
+            "resource exhausted",  # Vertex AI / gRPC quota errors
+        )):
            return True
    return False

@ -4538,11 +4553,17 @@ def call_llm(
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
        )
-        # Only try alternative providers when the user didn't explicitly
-        # configure this task's provider.  Explicit provider = hard constraint;
-        # auto (the default) = best-effort fallback chain.  (#7559)
+        # Respect explicit provider choice for transient errors (auth, request
+        # validation, etc.) but allow fallback when the provider clearly cannot
+        # serve the request due to capacity: payment/quota exhaustion and
+        # connection failures are capacity problems, not request constraints.
+        # See #26803: daily token quota (429 + "too many tokens per day") must
+        # fall back just like a 402 credit error.
        is_auto = resolved_provider in {"auto", "", None}
-        if should_fallback and is_auto:
+        # Capacity errors bypass the explicit-provider gate: the provider
+        # literally cannot serve this request regardless of user intent.
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
+        if should_fallback and (is_auto or is_capacity_error):
            if _is_payment_error(first_err):
                reason = "payment error"
                # Resolve the actual provider label (resolved_provider may be
@ -4870,8 +4891,12 @@ async def async_call_llm(
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
        )
+        # Capacity errors (payment/quota/connection) bypass the explicit-provider
+        # gate — the provider cannot serve the request regardless of user intent.
+        # See #26803: daily token quota must fall back like a 402 credit error.
        is_auto = resolved_provider in {"auto", "", None}
-        if should_fallback and is_auto:
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
+        if should_fallback and (is_auto or is_capacity_error):
            if _is_payment_error(first_err):
                reason = "payment error"
                _mark_provider_unhealthy(
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@ -925,6 +925,44 @@ class TestIsPaymentError:
        exc = Exception("connection reset")
        assert _is_payment_error(exc) is False

+    # ── Daily / monthly quota exhaustion (#26803) ────────────────────────────
+
+    def test_429_quota_exceeded(self):
+        """Cloud provider quota exhaustion (e.g. Vertex AI) is a payment error."""
+        exc = Exception("RESOURCE_EXHAUSTED: quota exceeded for project")
+        exc.status_code = 429
+        assert _is_payment_error(exc) is True
+
+    def test_429_too_many_tokens_per_day(self):
+        """Bedrock / LiteLLM daily token limit is a payment error."""
+        exc = Exception("Too many tokens per day: 1000000 used, 1000000 limit")
+        exc.status_code = 429
+        assert _is_payment_error(exc) is True
+
+    def test_429_daily_limit_phrase(self):
+        """Generic 'daily limit' phrasing is a payment error."""
+        exc = Exception("You have exceeded your daily limit.")
+        exc.status_code = 429
+        assert _is_payment_error(exc) is True
+
+    def test_429_resource_exhausted_grpc(self):
+        """Vertex AI gRPC RESOURCE_EXHAUSTED maps to payment error."""
+        exc = Exception("resource exhausted")
+        exc.status_code = 429
+        assert _is_payment_error(exc) is True
+
+    def test_429_daily_quota_phrase(self):
+        """'daily quota' phrasing is a payment error."""
+        exc = Exception("Daily quota of 500 requests reached.")
+        exc.status_code = 429
+        assert _is_payment_error(exc) is True
+
+    def test_429_transient_rate_limit_not_quota(self):
+        """Transient 429 rate limit without quota keywords is NOT a payment error."""
+        exc = Exception("Rate limit exceeded. Retry after 10s.")
+        exc.status_code = 429
+        assert _is_payment_error(exc) is False
+

 class TestIsRateLimitError:
    """_is_rate_limit_error detects 429 rate-limit errors warranting fallback."""