mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
When the active provider returns a 401/403 that survives its per-provider
credential-refresh attempt (revoked OAuth, blocked/expired key, or an
account pinned to a dead/staging inference endpoint), the conversation
loop now escalates to the configured fallback chain instead of dead-ending.
Before: the generic failover dispatch fired only for {rate_limit, billing};
auth/auth_permanent fell through to 'switch providers manually' advice and
never called _try_activate_fallback(). A user whose primary credential was
broken kept thrashing on the same dead credential every turn — the main
agent appeared 'stuck in fallback mode' while never actually failing over.
This also affected auxiliary tasks (compression, vision, title-gen), since
auto-resolved aux follows the main provider.
After: a persistent auth failure with a configured fallback chain switches
to the next provider (mirroring the rate-limit/billing failover path),
guarded one-shot per attempt by TurnRetryState.auth_failover_attempted.
When no fallback is configured the behavior is unchanged — it falls through
to the existing terminal handling and provider-specific troubleshooting
guidance.
Tests: test_auth_provider_failover.py — 401/403 classify as auth, the
gating condition fires only with a chain present + guard unset, the guard
blocks repeats, and non-auth (500) errors do not trigger auth failover.
126 lines
4.9 KiB
Python
126 lines
4.9 KiB
Python
"""Auth-failure provider failover (conversation loop).
|
|
|
|
A 401/403 that survives the per-provider credential-refresh attempt
|
|
(revoked OAuth, blocked/expired key, an account pinned to a dead/staging
|
|
endpoint) must escalate to the configured fallback chain instead of
|
|
thrashing on the same dead credential every turn.
|
|
|
|
Before the fix, the conversation loop's generic failover dispatch only
|
|
fired for ``{rate_limit, billing}`` reasons; ``auth`` / ``auth_permanent``
|
|
fell through to "switch providers manually" advice and never called
|
|
``_try_activate_fallback()``. These tests pin:
|
|
|
|
1. 401/403 classify as auth (``classified.is_auth`` True).
|
|
2. ``_try_activate_fallback`` advances the chain on an auth reason.
|
|
3. The one-shot guard flag exists on TurnRetryState.
|
|
"""
|
|
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from run_agent import AIAgent
|
|
from agent.error_classifier import classify_api_error, FailoverReason
|
|
from agent.turn_retry_state import TurnRetryState
|
|
|
|
|
|
def _make_agent(fallback_model=None):
|
|
with (
|
|
patch("run_agent.get_tool_definitions", return_value=[]),
|
|
patch("run_agent.check_toolset_requirements", return_value={}),
|
|
patch("run_agent.OpenAI"),
|
|
):
|
|
agent = AIAgent(
|
|
api_key="test-key",
|
|
base_url="https://openrouter.ai/api/v1",
|
|
quiet_mode=True,
|
|
skip_context_files=True,
|
|
skip_memory=True,
|
|
fallback_model=fallback_model,
|
|
)
|
|
agent.client = MagicMock()
|
|
return agent
|
|
|
|
|
|
def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
|
|
mock = MagicMock()
|
|
mock.base_url = base_url
|
|
mock.api_key = api_key
|
|
return mock
|
|
|
|
|
|
def _auth_error(status=401, msg="Your API key is invalid, blocked or out of funds."):
|
|
err = Exception(f"Error code: {status} - {msg}")
|
|
err.status_code = status
|
|
return err
|
|
|
|
|
|
class TestAuthErrorClassification:
|
|
def test_401_is_auth(self):
|
|
c = classify_api_error(_auth_error(401))
|
|
assert c.reason in {FailoverReason.auth, FailoverReason.auth_permanent}
|
|
assert c.is_auth is True
|
|
|
|
def test_403_is_auth(self):
|
|
c = classify_api_error(_auth_error(403, "forbidden"))
|
|
assert c.is_auth is True
|
|
|
|
def test_500_is_not_auth(self):
|
|
err = Exception("Error code: 500 - internal server error")
|
|
err.status_code = 500
|
|
c = classify_api_error(err)
|
|
assert c.is_auth is False
|
|
|
|
|
|
class TestAuthFailoverGuardFlag:
|
|
def test_flag_defaults_false(self):
|
|
assert TurnRetryState().auth_failover_attempted is False
|
|
|
|
|
|
class TestAuthFailoverActivation:
|
|
"""The decision the loop makes on a persistent auth failure: when a
|
|
fallback chain exists and the guard hasn't fired, escalate to it."""
|
|
|
|
def _should_failover(self, agent, classified, retry):
|
|
# Mirror the exact gating condition added to conversation_loop.py.
|
|
return (
|
|
classified.is_auth
|
|
and not retry.auth_failover_attempted
|
|
and agent._fallback_index < len(agent._fallback_chain)
|
|
)
|
|
|
|
def test_auth_failover_fires_when_chain_present(self):
|
|
agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
|
|
retry = TurnRetryState()
|
|
classified = classify_api_error(_auth_error(401))
|
|
assert self._should_failover(agent, classified, retry) is True
|
|
# And the activation primitive actually advances on an auth reason.
|
|
with patch(
|
|
"agent.auxiliary_client.resolve_provider_client",
|
|
return_value=(_mock_client(), "gpt-4o"),
|
|
):
|
|
advanced = agent._try_activate_fallback(reason=classified.reason)
|
|
assert advanced is True
|
|
assert agent._fallback_index == 1
|
|
|
|
def test_no_failover_without_chain(self):
|
|
"""A user with no fallback configured (the common case for the
|
|
original incident) does NOT failover — falls through to the
|
|
existing terminal handling + troubleshooting advice."""
|
|
agent = _make_agent(fallback_model=None)
|
|
retry = TurnRetryState()
|
|
classified = classify_api_error(_auth_error(401))
|
|
assert self._should_failover(agent, classified, retry) is False
|
|
|
|
def test_guard_blocks_repeat_failover(self):
|
|
agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
|
|
retry = TurnRetryState()
|
|
retry.auth_failover_attempted = True # already escalated this attempt
|
|
classified = classify_api_error(_auth_error(401))
|
|
assert self._should_failover(agent, classified, retry) is False
|
|
|
|
def test_non_auth_error_does_not_trigger_auth_failover(self):
|
|
agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
|
|
retry = TurnRetryState()
|
|
err = Exception("Error code: 500 - internal server error")
|
|
err.status_code = 500
|
|
classified = classify_api_error(err)
|
|
assert self._should_failover(agent, classified, retry) is False
|