fix(agent): fail over to fallback provider on persistent auth failure (401/403)

When the active provider returns a 401/403 that survives its per-provider
credential-refresh attempt (revoked OAuth, blocked/expired key, or an
account pinned to a dead/staging inference endpoint), the conversation
loop now escalates to the configured fallback chain instead of dead-ending.

Before: the generic failover dispatch fired only for {rate_limit, billing};
auth/auth_permanent fell through to 'switch providers manually' advice and
never called _try_activate_fallback(). A user whose primary credential was
broken kept thrashing on the same dead credential every turn — the main
agent appeared 'stuck in fallback mode' while never actually failing over.
This also affected auxiliary tasks (compression, vision, title-gen), since
auto-resolved aux follows the main provider.

After: a persistent auth failure with a configured fallback chain switches
to the next provider (mirroring the rate-limit/billing failover path),
guarded one-shot per attempt by TurnRetryState.auth_failover_attempted.
When no fallback is configured the behavior is unchanged — it falls through
to the existing terminal handling and provider-specific troubleshooting
guidance.

Tests: test_auth_provider_failover.py — 401/403 classify as auth, the
gating condition fires only with a chain present + guard unset, the guard
blocks repeats, and non-auth (500) errors do not trigger auth failover.
This commit is contained in:
teknium1 2026-06-20 11:13:02 -07:00 committed by Teknium
parent ea8a8b4af8
commit f22dd8a75a
4 changed files with 166 additions and 0 deletions

View file

@ -2824,6 +2824,39 @@ def run_conversation(
_retry.primary_recovery_attempted = False
continue
# ── Auth-failure provider failover ───────────────────────
# A 401/403 that survives the per-provider credential-refresh
# attempt above (each guarded by its own
# ``*_auth_retry_attempted`` flag) means the active provider's
# credential or endpoint is broken in a way refreshing can't
# fix (revoked OAuth, blocked/expired key, an account pinned to
# a dead/staging endpoint). Previously the loop only printed
# "switch providers manually" advice and fell through, so a
# user with a configured fallback chain kept thrashing on the
# same dead credential every turn instead of failing over.
# Escalate to the fallback chain here, mirroring the rate-
# limit/billing failover above. When no fallback is configured
# (or the chain is exhausted), the index check below skips this
# branch entirely; if _try_activate_fallback itself returns
# False we likewise fall through to the existing terminal
# handling + provider-specific troubleshooting guidance unchanged.
if (
classified.is_auth
and not _retry.auth_failover_attempted
and agent._fallback_index < len(agent._fallback_chain)
):
_retry.auth_failover_attempted = True
agent._buffer_status(
"🔐 Authentication failed and could not be refreshed — "
"switching to fallback provider..."
)
if agent._try_activate_fallback(reason=classified.reason):
active_system_prompt = _sync_failover_system_message(
agent, api_messages, active_system_prompt)
retry_count = 0
compression_attempts = 0
_retry.primary_recovery_attempted = False
continue
# ── Nous Portal: record rate limit & skip retries ─────
# When Nous returns a 429 that is a genuine account-
# level rate limit, record the reset time to a shared

View file

@ -58,6 +58,12 @@ class TurnRetryState:
primary_recovery_attempted: bool = False
has_retried_429: bool = False
# ── Auth-failure provider failover ───────────────────────────────────
# Set once we've escalated a persistent 401/403 (after the per-provider
# credential-refresh attempt above failed) to the fallback chain, so we
# don't loop on the same auth failover within one attempt.
auth_failover_attempted: bool = False
# ── Restart signals (read by the outer loop after the attempt) ───────
restart_with_compressed_messages: bool = False
restart_with_length_continuation: bool = False

View file

@ -27,6 +27,7 @@ EXPECTED_FIELDS = {
"llama_cpp_grammar_retry_attempted",
"primary_recovery_attempted",
"has_retried_429",
"auth_failover_attempted",
"restart_with_compressed_messages",
"restart_with_length_continuation",
}

View file

@ -0,0 +1,126 @@
"""Auth-failure provider failover (conversation loop).
A 401/403 that survives the per-provider credential-refresh attempt
(revoked OAuth, blocked/expired key, an account pinned to a dead/staging
endpoint) must escalate to the configured fallback chain instead of
thrashing on the same dead credential every turn.
Before the fix, the conversation loop's generic failover dispatch only
fired for ``{rate_limit, billing}`` reasons; ``auth`` / ``auth_permanent``
fell through to "switch providers manually" advice and never called
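``_try_activate_fallback()``. These tests pin:
1. 401/403 classify as auth (``classified.is_auth`` True); a 500 does not.
2. ``_try_activate_fallback`` advances the chain on an auth reason.
3. The one-shot guard flag exists on TurnRetryState and defaults to False.
4. The loop's gating condition fires only when the error is auth, a
fallback chain is present, and the guard flag is unset.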
"""
from unittest.mock import MagicMock, patch
from run_agent import AIAgent
from agent.error_classifier import classify_api_error, FailoverReason
from agent.turn_retry_state import TurnRetryState
def _make_agent(fallback_model=None):
    """Build an ``AIAgent`` for tests with its collaborators patched out.

    While the agent is constructed, ``run_agent.get_tool_definitions``,
    ``run_agent.check_toolset_requirements`` and ``run_agent.OpenAI`` are
    replaced by mocks, and the resulting agent's ``client`` attribute is
    overwritten with a fresh ``MagicMock``.

    Args:
        fallback_model: Forwarded unchanged as ``AIAgent(fallback_model=...)``.
            The tests in this module pass either ``None`` or a list of
            ``{"provider": ..., "model": ...}`` dicts.

    Returns:
        The constructed agent, with ``agent.client`` set to a ``MagicMock``.
    """
    with (
        patch("run_agent.get_tool_definitions", return_value=[]),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("run_agent.OpenAI"),
    ):
        agent = AIAgent(
            api_key="test-key",
            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
            fallback_model=fallback_model,
        )
        agent.client = MagicMock()
        return agent
def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
mock = MagicMock()
mock.base_url = base_url
mock.api_key = api_key
return mock
def _auth_error(status=401, msg="Your API key is invalid, blocked or out of funds."):
err = Exception(f"Error code: {status} - {msg}")
err.status_code = status
return err
class TestAuthErrorClassification:
    """``classify_api_error`` flags 401/403 as auth errors and 500 as not."""

    def test_401_is_auth(self):
        """A 401 gets an auth reason and ``is_auth`` is True."""
        c = classify_api_error(_auth_error(401))
        assert c.reason in {FailoverReason.auth, FailoverReason.auth_permanent}
        assert c.is_auth is True

    def test_403_is_auth(self):
        """A 403 is reported with ``is_auth`` True."""
        c = classify_api_error(_auth_error(403, "forbidden"))
        assert c.is_auth is True

    def test_500_is_not_auth(self):
        """A 500 server error is reported with ``is_auth`` False."""
        err = Exception("Error code: 500 - internal server error")
        err.status_code = 500
        c = classify_api_error(err)
        assert c.is_auth is False
class TestAuthFailoverGuardFlag:
    """The one-shot auth-failover guard exists on ``TurnRetryState``."""

    def test_flag_defaults_false(self):
        """A fresh ``TurnRetryState`` has ``auth_failover_attempted`` False."""
        assert TurnRetryState().auth_failover_attempted is False
class TestAuthFailoverActivation:
    """The decision the loop makes on a persistent auth failure: when a
    fallback chain exists and the guard hasn't fired, escalate to it."""

    def _should_failover(self, agent, classified, retry):
        """Evaluate the auth-failover gate for the given agent/error/state.

        Returns True only when the error is classified as auth, the one-shot
        guard on ``retry`` has not been set, and the agent still has an
        unused entry in its fallback chain.
        """
        # Mirror the exact gating condition added to conversation_loop.py.
        # This is a hand-written copy, so it has to be kept in sync with the
        # loop whenever that condition changes.
        return (
            classified.is_auth
            and not retry.auth_failover_attempted
            and agent._fallback_index < len(agent._fallback_chain)
        )

    def test_auth_failover_fires_when_chain_present(self):
        """With a one-entry chain and a 401, the gate opens and activation
        advances ``_fallback_index`` to 1."""
        agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
        retry = TurnRetryState()
        classified = classify_api_error(_auth_error(401))
        assert self._should_failover(agent, classified, retry) is True
        # And the activation primitive actually advances on an auth reason.
        # resolve_provider_client is patched to hand back a mock client and
        # a model name for the duration of the call.
        with patch(
            "agent.auxiliary_client.resolve_provider_client",
            return_value=(_mock_client(), "gpt-4o"),
        ):
            advanced = agent._try_activate_fallback(reason=classified.reason)
        assert advanced is True
        assert agent._fallback_index == 1

    def test_no_failover_without_chain(self):
        """A user with no fallback configured (the common case for the
        original incident) does NOT fail over; the loop falls through to
        the existing terminal handling + troubleshooting advice."""
        agent = _make_agent(fallback_model=None)
        retry = TurnRetryState()
        classified = classify_api_error(_auth_error(401))
        assert self._should_failover(agent, classified, retry) is False

    def test_guard_blocks_repeat_failover(self):
        """Once the guard flag is set, a further 401 does not open the gate."""
        agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
        retry = TurnRetryState()
        retry.auth_failover_attempted = True  # already escalated this attempt
        classified = classify_api_error(_auth_error(401))
        assert self._should_failover(agent, classified, retry) is False

    def test_non_auth_error_does_not_trigger_auth_failover(self):
        """A 500 does not open the auth gate even with a chain configured."""
        agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
        retry = TurnRetryState()
        err = Exception("Error code: 500 - internal server error")
        err.status_code = 500
        classified = classify_api_error(err)
        assert self._should_failover(agent, classified, retry) is False