mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(auxiliary): detect quota exhaustion as payment error; allow capacity-error fallback for explicit providers
Closes #26803. Root causes: 1. _is_payment_error() checked for billing keywords (credits, insufficient funds, billing, payment required) but missed daily token quota exhaustion phrases used by Bedrock, Vertex AI, and LiteLLM proxies — e.g. 'Too many tokens per day', 'quota exceeded', 'resource exhausted', 'daily limit'. These are functionally identical to credit exhaustion (the provider cannot serve the request) but did not trigger fallback. 2. The call_llm() fallback chain was gated on resolved_provider being 'auto' (or empty/None). When a task resolved to a specific provider (e.g. 'custom' for a LiteLLM proxy, or 'openrouter'), capacity failures (payment/quota/connection) were raised without trying alternatives. This is overly conservative: capacity errors mean the provider *cannot* serve the request regardless of user intent, so alternatives should always be tried. Fixes: - Add quota-related keywords to _is_payment_error(): quota exceeded, quota_exceeded, too many tokens per day, daily limit, tokens per day, daily quota, resource exhausted (Vertex AI / gRPC quota errors). - Allow fallback for capacity errors (payment/quota + connection) even when resolved_provider is not 'auto'. Rate-limit fallback stays gated on is_auto to honour explicit provider constraints for transient limits. - Apply both fixes to the sync call_llm() and async async_call_llm() paths. - Add 6 targeted tests for the new quota-error detection cases.
This commit is contained in:
parent
569bc94b59
commit
24c209f112
2 changed files with 72 additions and 9 deletions
|
|
@ -2096,7 +2096,13 @@ def _is_payment_error(exc: Exception) -> bool:
|
|||
"""Detect payment/credit/quota exhaustion errors.
|
||||
|
||||
Returns True for HTTP 402 (Payment Required) and for 429/other errors
|
||||
whose message indicates billing exhaustion rather than rate limiting.
|
||||
whose message indicates billing exhaustion or daily quota exhaustion
|
||||
rather than transient rate limiting.
|
||||
|
||||
Daily token quota errors (e.g. Bedrock "Too many tokens per day",
|
||||
Vertex AI "quota exceeded") are functionally equivalent to credit
|
||||
exhaustion — the provider cannot serve the request until the quota
|
||||
resets — and should trigger the same provider-fallback logic.
|
||||
"""
|
||||
status = getattr(exc, "status_code", None)
|
||||
if status == 402:
|
||||
|
|
@ -2104,10 +2110,19 @@ def _is_payment_error(exc: Exception) -> bool:
|
|||
err_lower = str(exc).lower()
|
||||
# OpenRouter and other providers include "credits" or "afford" in 402 bodies,
|
||||
# but sometimes wrap them in 429 or other codes.
|
||||
# Daily quota exhaustion from Bedrock, Vertex AI, and similar providers
|
||||
# uses different language but is semantically identical to credit exhaustion.
|
||||
if status in {402, 429, None}:
|
||||
if any(kw in err_lower for kw in ("credits", "insufficient funds",
|
||||
"can only afford", "billing",
|
||||
"payment required")):
|
||||
if any(kw in err_lower for kw in (
|
||||
"credits", "insufficient funds",
|
||||
"can only afford", "billing",
|
||||
"payment required",
|
||||
# Daily / monthly quota exhaustion keywords
|
||||
"quota exceeded", "quota_exceeded",
|
||||
"too many tokens per day", "daily limit",
|
||||
"tokens per day", "daily quota",
|
||||
"resource exhausted", # Vertex AI / gRPC quota errors
|
||||
)):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
|
@ -4538,11 +4553,17 @@ def call_llm(
|
|||
or _is_connection_error(first_err)
|
||||
or _is_rate_limit_error(first_err)
|
||||
)
|
||||
# Only try alternative providers when the user didn't explicitly
|
||||
# configure this task's provider. Explicit provider = hard constraint;
|
||||
# auto (the default) = best-effort fallback chain. (#7559)
|
||||
# Respect explicit provider choice for transient rate limits and for non-capacity errors (auth, request
|
||||
# validation, etc.) but allow fallback when the provider clearly cannot
|
||||
# serve the request due to capacity: payment/quota exhaustion and
|
||||
# connection failures are capacity problems, not request constraints.
|
||||
# See #26803: daily token quota (429 + "too many tokens per day") must
|
||||
# fall back just like a 402 credit error.
|
||||
is_auto = resolved_provider in {"auto", "", None}
|
||||
if should_fallback and is_auto:
|
||||
# Capacity errors bypass the explicit-provider gate: the provider
|
||||
# literally cannot serve this request regardless of user intent.
|
||||
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
|
||||
if should_fallback and (is_auto or is_capacity_error):
|
||||
if _is_payment_error(first_err):
|
||||
reason = "payment error"
|
||||
# Resolve the actual provider label (resolved_provider may be
|
||||
|
|
@ -4870,8 +4891,12 @@ async def async_call_llm(
|
|||
or _is_connection_error(first_err)
|
||||
or _is_rate_limit_error(first_err)
|
||||
)
|
||||
# Capacity errors (payment/quota/connection) bypass the explicit-provider
|
||||
# gate — the provider cannot serve the request regardless of user intent.
|
||||
# See #26803: daily token quota must fall back like a 402 credit error.
|
||||
is_auto = resolved_provider in {"auto", "", None}
|
||||
if should_fallback and is_auto:
|
||||
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
|
||||
if should_fallback and (is_auto or is_capacity_error):
|
||||
if _is_payment_error(first_err):
|
||||
reason = "payment error"
|
||||
_mark_provider_unhealthy(
|
||||
|
|
|
|||
|
|
@ -925,6 +925,44 @@ class TestIsPaymentError:
|
|||
exc = Exception("connection reset")
|
||||
assert _is_payment_error(exc) is False
|
||||
|
||||
# ── Daily / monthly quota exhaustion (#26803) ────────────────────────────
|
||||
|
||||
def test_429_quota_exceeded(self):
|
||||
"""Cloud provider quota exhaustion (e.g. Vertex AI) is a payment error."""
|
||||
exc = Exception("RESOURCE_EXHAUSTED: quota exceeded for project")
|
||||
exc.status_code = 429
|
||||
assert _is_payment_error(exc) is True
|
||||
|
||||
def test_429_too_many_tokens_per_day(self):
|
||||
"""Bedrock / LiteLLM daily token limit is a payment error."""
|
||||
exc = Exception("Too many tokens per day: 1000000 used, 1000000 limit")
|
||||
exc.status_code = 429
|
||||
assert _is_payment_error(exc) is True
|
||||
|
||||
def test_429_daily_limit_phrase(self):
|
||||
"""Generic 'daily limit' phrasing is a payment error."""
|
||||
exc = Exception("You have exceeded your daily limit.")
|
||||
exc.status_code = 429
|
||||
assert _is_payment_error(exc) is True
|
||||
|
||||
def test_429_resource_exhausted_grpc(self):
|
||||
"""Vertex AI gRPC RESOURCE_EXHAUSTED maps to payment error."""
|
||||
exc = Exception("resource exhausted")
|
||||
exc.status_code = 429
|
||||
assert _is_payment_error(exc) is True
|
||||
|
||||
def test_429_daily_quota_phrase(self):
|
||||
"""'daily quota' phrasing is a payment error."""
|
||||
exc = Exception("Daily quota of 500 requests reached.")
|
||||
exc.status_code = 429
|
||||
assert _is_payment_error(exc) is True
|
||||
|
||||
def test_429_transient_rate_limit_not_quota(self):
|
||||
"""Transient 429 rate limit without quota keywords is NOT a payment error."""
|
||||
exc = Exception("Rate limit exceeded. Retry after 10s.")
|
||||
exc.status_code = 429
|
||||
assert _is_payment_error(exc) is False
|
||||
|
||||
|
||||
class TestIsRateLimitError:
|
||||
"""_is_rate_limit_error detects 429 rate-limit errors warranting fallback."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue