mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-14 04:02:26 +00:00
fix(aux): trigger fallback on 429 rate-limit errors in auxiliary client
When a provider returns a 429 rate-limit error (not billing-related),
the auxiliary client's call_llm/async_call_llm previously did NOT trigger
the fallback chain. This caused auxiliary tasks like session_search to
exhaust all 3 retries against the same rate-limited endpoint, losing
session metadata that depended on the summarization completing.
Root cause: `_is_payment_error()` only matched 429s containing billing
keywords ("credits", "insufficient funds", etc.). Provider-specific
rate-limit messages like Nous's "Hold up for a bit, you've exceeded the
rate limit on your API key" didn't match, so `_is_payment_error` returned
False, `_is_connection_error` returned False, and `should_fallback` was
False — all retries hit the same rate-limited provider.
Fix:
- New `_is_rate_limit_error()` function that detects 429 + rate-limit
keywords, generic 429 without billing keywords, and OpenAI SDK
`RateLimitError` class instances (which may omit .status_code).
- Updated `should_fallback` in both `call_llm` and `async_call_llm` to
include `_is_rate_limit_error`.
- Updated the max_tokens retry path to also check for rate-limit errors.
- Updated the reason string to include "rate limit".
This complements the Nous rate guard (PR #10568) which prevents new calls
to Nous when already rate-limited — this fix handles the case where a
request is already in flight when the 429 arrives.
Related: #8023, #12554, #11034
Co-authored-by: Zeejay <zjtan1@gmail.com>
This commit is contained in:
parent
8c0f254c06
commit
f8ba265340
2 changed files with 155 additions and 8 deletions
|
|
@ -1650,6 +1650,39 @@ def _is_payment_error(exc: Exception) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _is_rate_limit_error(exc: Exception) -> bool:
|
||||||
|
"""Detect rate-limit errors that warrant provider fallback.
|
||||||
|
|
||||||
|
Returns True for HTTP 429 errors whose message indicates rate limiting
|
||||||
|
(as opposed to billing/quota exhaustion, which _is_payment_error handles).
|
||||||
|
Also catches OpenAI SDK RateLimitError instances that may not set
|
||||||
|
.status_code on the exception object.
|
||||||
|
"""
|
||||||
|
status = getattr(exc, "status_code", None)
|
||||||
|
err_lower = str(exc).lower()
|
||||||
|
|
||||||
|
# OpenAI SDK's RateLimitError sometimes omits .status_code —
|
||||||
|
# detect by class name so we don't miss these. (PR #8023 pattern)
|
||||||
|
if type(exc).__name__ == "RateLimitError":
|
||||||
|
return True
|
||||||
|
|
||||||
|
if status == 429:
|
||||||
|
# Distinguish rate-limit from billing: billing keywords are handled
|
||||||
|
# by _is_payment_error, everything else on 429 is a rate limit.
|
||||||
|
if any(kw in err_lower for kw in (
|
||||||
|
"rate limit", "rate_limit", "too many requests",
|
||||||
|
"try again", "retry after", "resets in",
|
||||||
|
)):
|
||||||
|
return True
|
||||||
|
# Generic 429 without billing keywords = likely a rate limit
|
||||||
|
if not any(kw in err_lower for kw in (
|
||||||
|
"credits", "insufficient funds", "billing",
|
||||||
|
"payment required", "can only afford",
|
||||||
|
)):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _is_connection_error(exc: Exception) -> bool:
|
def _is_connection_error(exc: Exception) -> bool:
|
||||||
"""Detect connection/network errors that warrant provider fallback.
|
"""Detect connection/network errors that warrant provider fallback.
|
||||||
|
|
||||||
|
|
@ -3542,7 +3575,7 @@ def call_llm(
|
||||||
except Exception as retry_err:
|
except Exception as retry_err:
|
||||||
# If the max_tokens retry also hits a payment or connection
|
# If the max_tokens retry also hits a payment or connection
|
||||||
# error, fall through to the fallback chain below.
|
# error, fall through to the fallback chain below.
|
||||||
if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)):
|
if not (_is_payment_error(retry_err) or _is_connection_error(retry_err) or _is_rate_limit_error(retry_err)):
|
||||||
raise
|
raise
|
||||||
first_err = retry_err
|
first_err = retry_err
|
||||||
|
|
||||||
|
|
@ -3625,13 +3658,27 @@ def call_llm(
|
||||||
# Codex/OAuth tokens that authenticate but whose endpoint is down,
|
# Codex/OAuth tokens that authenticate but whose endpoint is down,
|
||||||
# and providers the user never configured that got picked up by
|
# and providers the user never configured that got picked up by
|
||||||
# the auto-detection chain.
|
# the auto-detection chain.
|
||||||
should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err)
|
#
|
||||||
|
# ── Rate-limit fallback (#13579) ─────────────────────────────
|
||||||
|
# When the provider returns a 429 rate-limit (not billing), fall
|
||||||
|
# back to an alternative provider instead of exhausting retries
|
||||||
|
# against the same rate-limited endpoint.
|
||||||
|
should_fallback = (
|
||||||
|
_is_payment_error(first_err)
|
||||||
|
or _is_connection_error(first_err)
|
||||||
|
or _is_rate_limit_error(first_err)
|
||||||
|
)
|
||||||
# Only try alternative providers when the user didn't explicitly
|
# Only try alternative providers when the user didn't explicitly
|
||||||
# configure this task's provider. Explicit provider = hard constraint;
|
# configure this task's provider. Explicit provider = hard constraint;
|
||||||
# auto (the default) = best-effort fallback chain. (#7559)
|
# auto (the default) = best-effort fallback chain. (#7559)
|
||||||
is_auto = resolved_provider in ("auto", "", None)
|
is_auto = resolved_provider in ("auto", "", None)
|
||||||
if should_fallback and is_auto:
|
if should_fallback and is_auto:
|
||||||
reason = "payment error" if _is_payment_error(first_err) else "connection error"
|
if _is_payment_error(first_err):
|
||||||
|
reason = "payment error"
|
||||||
|
elif _is_rate_limit_error(first_err):
|
||||||
|
reason = "rate limit"
|
||||||
|
else:
|
||||||
|
reason = "connection error"
|
||||||
logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
|
logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
|
||||||
task or "call", reason, resolved_provider, first_err)
|
task or "call", reason, resolved_provider, first_err)
|
||||||
fb_client, fb_model, fb_label = _try_payment_fallback(
|
fb_client, fb_model, fb_label = _try_payment_fallback(
|
||||||
|
|
@ -3834,7 +3881,7 @@ async def async_call_llm(
|
||||||
except Exception as retry_err:
|
except Exception as retry_err:
|
||||||
# If the max_tokens retry also hits a payment or connection
|
# If the max_tokens retry also hits a payment or connection
|
||||||
# error, fall through to the fallback chain below.
|
# error, fall through to the fallback chain below.
|
||||||
if not (_is_payment_error(retry_err) or _is_connection_error(retry_err)):
|
if not (_is_payment_error(retry_err) or _is_connection_error(retry_err) or _is_rate_limit_error(retry_err)):
|
||||||
raise
|
raise
|
||||||
first_err = retry_err
|
first_err = retry_err
|
||||||
|
|
||||||
|
|
@ -3903,11 +3950,20 @@ async def async_call_llm(
|
||||||
return _validate_llm_response(
|
return _validate_llm_response(
|
||||||
await retry_client.chat.completions.create(**retry_kwargs), task)
|
await retry_client.chat.completions.create(**retry_kwargs), task)
|
||||||
|
|
||||||
# ── Payment / connection fallback (mirrors sync call_llm) ─────
|
# ── Payment / connection / rate-limit fallback (mirrors sync call_llm) ──
|
||||||
should_fallback = _is_payment_error(first_err) or _is_connection_error(first_err)
|
should_fallback = (
|
||||||
|
_is_payment_error(first_err)
|
||||||
|
or _is_connection_error(first_err)
|
||||||
|
or _is_rate_limit_error(first_err)
|
||||||
|
)
|
||||||
is_auto = resolved_provider in ("auto", "", None)
|
is_auto = resolved_provider in ("auto", "", None)
|
||||||
if should_fallback and is_auto:
|
if should_fallback and is_auto:
|
||||||
reason = "payment error" if _is_payment_error(first_err) else "connection error"
|
if _is_payment_error(first_err):
|
||||||
|
reason = "payment error"
|
||||||
|
elif _is_rate_limit_error(first_err):
|
||||||
|
reason = "rate limit"
|
||||||
|
else:
|
||||||
|
reason = "connection error"
|
||||||
logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
|
logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
|
||||||
task or "call", reason, resolved_provider, first_err)
|
task or "call", reason, resolved_provider, first_err)
|
||||||
fb_client, fb_model, fb_label = _try_payment_fallback(
|
fb_client, fb_model, fb_label = _try_payment_fallback(
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ from agent.auxiliary_client import (
|
||||||
_read_codex_access_token,
|
_read_codex_access_token,
|
||||||
_get_provider_chain,
|
_get_provider_chain,
|
||||||
_is_payment_error,
|
_is_payment_error,
|
||||||
|
_is_rate_limit_error,
|
||||||
_normalize_aux_provider,
|
_normalize_aux_provider,
|
||||||
_try_payment_fallback,
|
_try_payment_fallback,
|
||||||
_resolve_auto,
|
_resolve_auto,
|
||||||
|
|
@ -789,6 +790,65 @@ class TestIsPaymentError:
|
||||||
assert _is_payment_error(exc) is False
|
assert _is_payment_error(exc) is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestIsRateLimitError:
|
||||||
|
"""_is_rate_limit_error detects 429 rate-limit errors warranting fallback."""
|
||||||
|
|
||||||
|
def test_429_with_rate_limit_message(self):
|
||||||
|
exc = Exception("Rate limit exceeded, try again in 2 seconds")
|
||||||
|
exc.status_code = 429
|
||||||
|
assert _is_rate_limit_error(exc) is True
|
||||||
|
|
||||||
|
def test_429_with_resets_in_message(self):
|
||||||
|
"""Nous-style 429: 'resets in 3508s'."""
|
||||||
|
exc = Exception("Hold up for a bit, you've exceeded the rate limit on your API key")
|
||||||
|
exc.status_code = 429
|
||||||
|
assert _is_rate_limit_error(exc) is True
|
||||||
|
|
||||||
|
def test_429_with_too_many_requests(self):
|
||||||
|
exc = Exception("Too many requests")
|
||||||
|
exc.status_code = 429
|
||||||
|
assert _is_rate_limit_error(exc) is True
|
||||||
|
|
||||||
|
def test_429_without_billing_keywords_is_rate_limit(self):
|
||||||
|
"""Generic 429 without billing keywords = likely a rate limit."""
|
||||||
|
exc = Exception("Something went wrong")
|
||||||
|
exc.status_code = 429
|
||||||
|
assert _is_rate_limit_error(exc) is True
|
||||||
|
|
||||||
|
def test_429_with_credits_message_is_not_rate_limit(self):
|
||||||
|
"""Billing-related 429 should NOT be classified as rate limit."""
|
||||||
|
exc = Exception("insufficient credits remaining")
|
||||||
|
exc.status_code = 429
|
||||||
|
assert _is_rate_limit_error(exc) is False
|
||||||
|
|
||||||
|
def test_429_with_billing_message_is_not_rate_limit(self):
|
||||||
|
exc = Exception("you can only afford 1000 tokens")
|
||||||
|
exc.status_code = 429
|
||||||
|
assert _is_rate_limit_error(exc) is False
|
||||||
|
|
||||||
|
def test_402_is_not_rate_limit(self):
|
||||||
|
exc = Exception("Payment Required")
|
||||||
|
exc.status_code = 402
|
||||||
|
assert _is_rate_limit_error(exc) is False
|
||||||
|
|
||||||
|
def test_500_is_not_rate_limit(self):
|
||||||
|
exc = Exception("Internal Server Error")
|
||||||
|
exc.status_code = 500
|
||||||
|
assert _is_rate_limit_error(exc) is False
|
||||||
|
|
||||||
|
def test_openai_ratelimiterror_classname(self):
|
||||||
|
"""OpenAI SDK RateLimitError may omit .status_code — detect by class name."""
|
||||||
|
class RateLimitError(Exception):
|
||||||
|
pass
|
||||||
|
exc = RateLimitError("rate limit exceeded")
|
||||||
|
# No status_code set, but class name matches
|
||||||
|
assert _is_rate_limit_error(exc) is True
|
||||||
|
|
||||||
|
def test_no_status_code_no_keywords_is_not_rate_limit(self):
|
||||||
|
exc = Exception("connection reset")
|
||||||
|
assert _is_rate_limit_error(exc) is False
|
||||||
|
|
||||||
|
|
||||||
class TestGetProviderChain:
|
class TestGetProviderChain:
|
||||||
"""_get_provider_chain() resolves functions at call time (testable)."""
|
"""_get_provider_chain() resolves functions at call time (testable)."""
|
||||||
|
|
||||||
|
|
@ -860,13 +920,18 @@ class TestTryPaymentFallback:
|
||||||
|
|
||||||
|
|
||||||
class TestCallLlmPaymentFallback:
|
class TestCallLlmPaymentFallback:
|
||||||
"""call_llm() retries with a different provider on 402 / payment errors."""
|
"""call_llm() retries with a different provider on 402 / payment / rate-limit errors."""
|
||||||
|
|
||||||
def _make_402_error(self, msg="Payment Required: insufficient credits"):
|
def _make_402_error(self, msg="Payment Required: insufficient credits"):
|
||||||
exc = Exception(msg)
|
exc = Exception(msg)
|
||||||
exc.status_code = 402
|
exc.status_code = 402
|
||||||
return exc
|
return exc
|
||||||
|
|
||||||
|
def _make_429_rate_limit_error(self, msg="Rate limit exceeded, try again in 60 seconds"):
|
||||||
|
exc = Exception(msg)
|
||||||
|
exc.status_code = 429
|
||||||
|
return exc
|
||||||
|
|
||||||
def test_non_payment_error_not_caught(self, monkeypatch):
|
def test_non_payment_error_not_caught(self, monkeypatch):
|
||||||
"""Non-payment/non-connection errors (500) should NOT trigger fallback."""
|
"""Non-payment/non-connection errors (500) should NOT trigger fallback."""
|
||||||
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
|
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
|
||||||
|
|
@ -886,6 +951,32 @@ class TestCallLlmPaymentFallback:
|
||||||
messages=[{"role": "user", "content": "hello"}],
|
messages=[{"role": "user", "content": "hello"}],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_429_rate_limit_triggers_fallback(self, monkeypatch):
|
||||||
|
"""429 rate-limit errors should trigger fallback to next provider."""
|
||||||
|
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
|
||||||
|
|
||||||
|
primary_client = MagicMock()
|
||||||
|
rate_err = self._make_429_rate_limit_error()
|
||||||
|
primary_client.chat.completions.create.side_effect = rate_err
|
||||||
|
|
||||||
|
fallback_client = MagicMock()
|
||||||
|
fallback_client.chat.completions.create.return_value = MagicMock(choices=[
|
||||||
|
MagicMock(message=MagicMock(content="fallback response"))
|
||||||
|
])
|
||||||
|
|
||||||
|
with patch("agent.auxiliary_client._get_cached_client",
|
||||||
|
return_value=(primary_client, "xiaomi/mimo-v2-pro")), \
|
||||||
|
patch("agent.auxiliary_client._resolve_task_provider_model",
|
||||||
|
return_value=("auto", "xiaomi/mimo-v2-pro", None, None, None)), \
|
||||||
|
patch("agent.auxiliary_client._try_payment_fallback",
|
||||||
|
return_value=(fallback_client, "fallback-model", "openrouter")):
|
||||||
|
result = call_llm(
|
||||||
|
task="session_search",
|
||||||
|
messages=[{"role": "user", "content": "hello"}],
|
||||||
|
)
|
||||||
|
# Fallback client should have been used
|
||||||
|
assert fallback_client.chat.completions.create.called
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Gate: _resolve_api_key_provider must skip anthropic when not configured
|
# Gate: _resolve_api_key_provider must skip anthropic when not configured
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue