mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
fix(agent): fail over to fallback provider on persistent auth failure (401/403)
When the active provider returns a 401/403 that survives its per-provider
credential-refresh attempt (revoked OAuth, blocked/expired key, or an
account pinned to a dead/staging inference endpoint), the conversation
loop now escalates to the configured fallback chain instead of dead-ending.
Before: the generic failover dispatch fired only for {rate_limit, billing};
auth/auth_permanent fell through to 'switch providers manually' advice and
never called _try_activate_fallback(). A user whose primary credential was
broken kept thrashing on the same dead credential every turn — the main
agent appeared 'stuck in fallback mode' while never actually failing over.
This also affected auxiliary tasks (compression, vision, title-gen), since
auto-resolved aux follows the main provider.
After: a persistent auth failure with a configured fallback chain switches
to the next provider (mirroring the rate-limit/billing failover path),
guarded one-shot per attempt by TurnRetryState.auth_failover_attempted.
When no fallback is configured the behavior is unchanged — it falls through
to the existing terminal handling and provider-specific troubleshooting
guidance.
Tests: test_auth_provider_failover.py — 401/403 classify as auth, the
gating condition fires only with a chain present + guard unset, the guard
blocks repeats, and non-auth (500) errors do not trigger auth failover.
This commit is contained in:
parent
ea8a8b4af8
commit
f22dd8a75a
4 changed files with 166 additions and 0 deletions
|
|
@ -2824,6 +2824,39 @@ def run_conversation(
|
|||
_retry.primary_recovery_attempted = False
|
||||
continue
|
||||
|
||||
# ── Auth-failure provider failover ───────────────────────
|
||||
# A 401/403 that survives the per-provider credential-refresh
|
||||
# attempt above (each guarded by its own
|
||||
# ``*_auth_retry_attempted`` flag) means the active provider's
|
||||
# credential or endpoint is broken in a way refreshing can't
|
||||
# fix (revoked OAuth, blocked/expired key, an account pinned to
|
||||
# a dead/staging endpoint). Previously the loop only printed
|
||||
# "switch providers manually" advice and fell through, so a
|
||||
# user with a configured fallback chain kept thrashing on the
|
||||
# same dead credential every turn instead of failing over.
|
||||
# Escalate to the fallback chain here, mirroring the rate-
|
||||
# limit/billing failover above. When no fallback is configured
|
||||
# (or the chain is exhausted), _try_activate_fallback returns
|
||||
# False and we fall through to the existing terminal handling
|
||||
# + provider-specific troubleshooting guidance unchanged.
|
||||
if (
|
||||
classified.is_auth
|
||||
and not _retry.auth_failover_attempted
|
||||
and agent._fallback_index < len(agent._fallback_chain)
|
||||
):
|
||||
_retry.auth_failover_attempted = True
|
||||
agent._buffer_status(
|
||||
"🔐 Authentication failed and could not be refreshed — "
|
||||
"switching to fallback provider..."
|
||||
)
|
||||
if agent._try_activate_fallback(reason=classified.reason):
|
||||
active_system_prompt = _sync_failover_system_message(
|
||||
agent, api_messages, active_system_prompt)
|
||||
retry_count = 0
|
||||
compression_attempts = 0
|
||||
_retry.primary_recovery_attempted = False
|
||||
continue
|
||||
|
||||
# ── Nous Portal: record rate limit & skip retries ─────
|
||||
# When Nous returns a 429 that is a genuine account-
|
||||
# level rate limit, record the reset time to a shared
|
||||
|
|
|
|||
|
|
@ -58,6 +58,12 @@ class TurnRetryState:
|
|||
primary_recovery_attempted: bool = False
|
||||
has_retried_429: bool = False
|
||||
|
||||
# ── Auth-failure provider failover ───────────────────────────────────
|
||||
# Set once we've escalated a persistent 401/403 (after the per-provider
|
||||
# credential-refresh attempt above failed) to the fallback chain, so we
|
||||
# don't loop on the same auth failover within one attempt.
|
||||
auth_failover_attempted: bool = False
|
||||
|
||||
# ── Restart signals (read by the outer loop after the attempt) ───────
|
||||
restart_with_compressed_messages: bool = False
|
||||
restart_with_length_continuation: bool = False
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ EXPECTED_FIELDS = {
|
|||
"llama_cpp_grammar_retry_attempted",
|
||||
"primary_recovery_attempted",
|
||||
"has_retried_429",
|
||||
"auth_failover_attempted",
|
||||
"restart_with_compressed_messages",
|
||||
"restart_with_length_continuation",
|
||||
}
|
||||
|
|
|
|||
126
tests/run_agent/test_auth_provider_failover.py
Normal file
126
tests/run_agent/test_auth_provider_failover.py
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
"""Auth-failure provider failover (conversation loop).
|
||||
|
||||
A 401/403 that survives the per-provider credential-refresh attempt
|
||||
(revoked OAuth, blocked/expired key, an account pinned to a dead/staging
|
||||
endpoint) must escalate to the configured fallback chain instead of
|
||||
thrashing on the same dead credential every turn.
|
||||
|
||||
Before the fix, the conversation loop's generic failover dispatch only
|
||||
fired for ``{rate_limit, billing}`` reasons; ``auth`` / ``auth_permanent``
|
||||
fell through to "switch providers manually" advice and never called
|
||||
``_try_activate_fallback()``. These tests pin:
|
||||
|
||||
1. 401/403 classify as auth (``classified.is_auth`` True).
|
||||
2. ``_try_activate_fallback`` advances the chain on an auth reason.
|
||||
3. The one-shot guard flag exists on TurnRetryState.
|
||||
"""
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from run_agent import AIAgent
|
||||
from agent.error_classifier import classify_api_error, FailoverReason
|
||||
from agent.turn_retry_state import TurnRetryState
|
||||
|
||||
|
||||
def _make_agent(fallback_model=None):
|
||||
with (
|
||||
patch("run_agent.get_tool_definitions", return_value=[]),
|
||||
patch("run_agent.check_toolset_requirements", return_value={}),
|
||||
patch("run_agent.OpenAI"),
|
||||
):
|
||||
agent = AIAgent(
|
||||
api_key="test-key",
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
quiet_mode=True,
|
||||
skip_context_files=True,
|
||||
skip_memory=True,
|
||||
fallback_model=fallback_model,
|
||||
)
|
||||
agent.client = MagicMock()
|
||||
return agent
|
||||
|
||||
|
||||
def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
|
||||
mock = MagicMock()
|
||||
mock.base_url = base_url
|
||||
mock.api_key = api_key
|
||||
return mock
|
||||
|
||||
|
||||
def _auth_error(status=401, msg="Your API key is invalid, blocked or out of funds."):
|
||||
err = Exception(f"Error code: {status} - {msg}")
|
||||
err.status_code = status
|
||||
return err
|
||||
|
||||
|
||||
class TestAuthErrorClassification:
|
||||
def test_401_is_auth(self):
|
||||
c = classify_api_error(_auth_error(401))
|
||||
assert c.reason in {FailoverReason.auth, FailoverReason.auth_permanent}
|
||||
assert c.is_auth is True
|
||||
|
||||
def test_403_is_auth(self):
|
||||
c = classify_api_error(_auth_error(403, "forbidden"))
|
||||
assert c.is_auth is True
|
||||
|
||||
def test_500_is_not_auth(self):
|
||||
err = Exception("Error code: 500 - internal server error")
|
||||
err.status_code = 500
|
||||
c = classify_api_error(err)
|
||||
assert c.is_auth is False
|
||||
|
||||
|
||||
class TestAuthFailoverGuardFlag:
|
||||
def test_flag_defaults_false(self):
|
||||
assert TurnRetryState().auth_failover_attempted is False
|
||||
|
||||
|
||||
class TestAuthFailoverActivation:
|
||||
"""The decision the loop makes on a persistent auth failure: when a
|
||||
fallback chain exists and the guard hasn't fired, escalate to it."""
|
||||
|
||||
def _should_failover(self, agent, classified, retry):
|
||||
# Mirror the exact gating condition added to conversation_loop.py.
|
||||
return (
|
||||
classified.is_auth
|
||||
and not retry.auth_failover_attempted
|
||||
and agent._fallback_index < len(agent._fallback_chain)
|
||||
)
|
||||
|
||||
def test_auth_failover_fires_when_chain_present(self):
|
||||
agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
|
||||
retry = TurnRetryState()
|
||||
classified = classify_api_error(_auth_error(401))
|
||||
assert self._should_failover(agent, classified, retry) is True
|
||||
# And the activation primitive actually advances on an auth reason.
|
||||
with patch(
|
||||
"agent.auxiliary_client.resolve_provider_client",
|
||||
return_value=(_mock_client(), "gpt-4o"),
|
||||
):
|
||||
advanced = agent._try_activate_fallback(reason=classified.reason)
|
||||
assert advanced is True
|
||||
assert agent._fallback_index == 1
|
||||
|
||||
def test_no_failover_without_chain(self):
|
||||
"""A user with no fallback configured (the common case for the
|
||||
original incident) does NOT failover — falls through to the
|
||||
existing terminal handling + troubleshooting advice."""
|
||||
agent = _make_agent(fallback_model=None)
|
||||
retry = TurnRetryState()
|
||||
classified = classify_api_error(_auth_error(401))
|
||||
assert self._should_failover(agent, classified, retry) is False
|
||||
|
||||
def test_guard_blocks_repeat_failover(self):
|
||||
agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
|
||||
retry = TurnRetryState()
|
||||
retry.auth_failover_attempted = True # already escalated this attempt
|
||||
classified = classify_api_error(_auth_error(401))
|
||||
assert self._should_failover(agent, classified, retry) is False
|
||||
|
||||
def test_non_auth_error_does_not_trigger_auth_failover(self):
|
||||
agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
|
||||
retry = TurnRetryState()
|
||||
err = Exception("Error code: 500 - internal server error")
|
||||
err.status_code = 500
|
||||
classified = classify_api_error(err)
|
||||
assert self._should_failover(agent, classified, retry) is False
|
||||
Loading…
Add table
Add a link
Reference in a new issue