fix(agent): fail over to fallback provider on persistent auth failure (401/403)

When the active provider returns a 401/403 that survives its per-provider credential-refresh attempt (revoked OAuth, blocked/expired key, or an account pinned to a dead/staging inference endpoint), the conversation loop now escalates to the configured fallback chain instead of dead-ending. Before: the generic failover dispatch fired only for {rate_limit, billing}; auth/auth_permanent fell through to 'switch providers manually' advice and never called _try_activate_fallback(). A user whose primary credential was broken kept thrashing on the same dead credential every turn — the main agent appeared 'stuck in fallback mode' while never actually failing over. This also affected auxiliary tasks (compression, vision, title-gen), since auto-resolved aux follows the main provider. After: a persistent auth failure with a configured fallback chain switches to the next provider (mirroring the rate-limit/billing failover path), guarded one-shot per attempt by TurnRetryState.auth_failover_attempted. When no fallback is configured the behavior is unchanged — it falls through to the existing terminal handling and provider-specific troubleshooting guidance. Tests: test_auth_provider_failover.py — 401/403 classify as auth, the gating condition fires only with a chain present + guard unset, the guard blocks repeats, and non-auth (500) errors do not trigger auth failover.
2026-06-21 10:22:18 +00:00 · 2026-06-20 11:13:02 -07:00 · 2026-06-20 11:13:02 -07:00 · f22dd8a75a
commit f22dd8a75a
parent ea8a8b4af8
4 changed files with 166 additions and 0 deletions
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -2824,6 +2824,39 @@ def run_conversation(
                            _retry.primary_recovery_attempted = False
                            continue

+                # ── Auth-failure provider failover ───────────────────────
+                # A 401/403 that survives the per-provider credential-refresh
+                # attempt above (each guarded by its own
+                # ``*_auth_retry_attempted`` flag) means the active provider's
+                # credential or endpoint is broken in a way refreshing can't
+                # fix (revoked OAuth, blocked/expired key, an account pinned to
+                # a dead/staging endpoint). Previously the loop only printed
+                # "switch providers manually" advice and fell through, so a
+                # user with a configured fallback chain kept thrashing on the
+                # same dead credential every turn instead of failing over.
+                # Escalate to the fallback chain here, mirroring the rate-
+                # limit/billing failover above. When no fallback is configured
+                # (or the chain is exhausted) the gate below never fires; if
+                # _try_activate_fallback returns False we likewise fall through
+                # to terminal handling + provider-specific troubleshooting guidance.
+                if (
+                    classified.is_auth
+                    and not _retry.auth_failover_attempted
+                    and agent._fallback_index < len(agent._fallback_chain)
+                ):
+                    _retry.auth_failover_attempted = True
+                    agent._buffer_status(
+                        "🔐 Authentication failed and could not be refreshed — "
+                        "switching to fallback provider..."
+                    )
+                    if agent._try_activate_fallback(reason=classified.reason):
+                        active_system_prompt = _sync_failover_system_message(
+                            agent, api_messages, active_system_prompt)
+                        retry_count = 0
+                        compression_attempts = 0
+                        _retry.primary_recovery_attempted = False
+                        continue
+
                # ── Nous Portal: record rate limit & skip retries ─────
                # When Nous returns a 429 that is a genuine account-
                # level rate limit, record the reset time to a shared
--- a/agent/turn_retry_state.py
+++ b/agent/turn_retry_state.py
@@ -58,6 +58,12 @@ class TurnRetryState:
    primary_recovery_attempted: bool = False
    has_retried_429: bool = False

+    # ── Auth-failure provider failover ───────────────────────────────────
+    # Set once a persistent 401/403 (one that survived the per-provider
+    # credential-refresh retry) has been escalated to the fallback chain.
+    # One-shot: a later auth failure in the same attempt won't hop again.
+    auth_failover_attempted: bool = False
+
    # ── Restart signals (read by the outer loop after the attempt) ───────
    restart_with_compressed_messages: bool = False
    restart_with_length_continuation: bool = False
--- a/tests/agent/test_turn_retry_state.py
+++ b/tests/agent/test_turn_retry_state.py
@@ -27,6 +27,7 @@ EXPECTED_FIELDS = {
    "llama_cpp_grammar_retry_attempted",
    "primary_recovery_attempted",
    "has_retried_429",
+    "auth_failover_attempted",
    "restart_with_compressed_messages",
    "restart_with_length_continuation",
 }
--- a/tests/run_agent/test_auth_provider_failover.py
+++ b/tests/run_agent/test_auth_provider_failover.py
@@ -0,0 +1,126 @@
+"""Auth-failure provider failover (conversation loop).
+
+A 401/403 that survives the per-provider credential-refresh attempt
+(revoked OAuth, blocked/expired key, an account pinned to a dead/staging
+endpoint) must escalate to the configured fallback chain instead of
+thrashing on the same dead credential every turn.
+
+Before the fix, the conversation loop's generic failover dispatch only
+fired for ``{rate_limit, billing}`` reasons; ``auth`` / ``auth_permanent``
+fell through to "switch providers manually" advice and never called
+``_try_activate_fallback()``. These tests pin:
+
+  1. 401/403 classify as auth (``classified.is_auth`` True); a 500 does not.
+  2. A mirror of the loop's gate, and that ``_try_activate_fallback`` advances.
+  3. The one-shot guard flag on TurnRetryState defaults to False.
+"""
+
+from unittest.mock import MagicMock, patch
+
+from run_agent import AIAgent
+from agent.error_classifier import classify_api_error, FailoverReason
+from agent.turn_retry_state import TurnRetryState
+
+
+def _make_agent(fallback_model=None):
+    with (
+        patch("run_agent.get_tool_definitions", return_value=[]),
+        patch("run_agent.check_toolset_requirements", return_value={}),
+        patch("run_agent.OpenAI"),
+    ):
+        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
+            quiet_mode=True,
+            skip_context_files=True,
+            skip_memory=True,
+            fallback_model=fallback_model,
+        )
+        agent.client = MagicMock()
+        return agent
+
+
+def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
+    mock = MagicMock()
+    mock.base_url = base_url
+    mock.api_key = api_key
+    return mock
+
+
+def _auth_error(status=401, msg="Your API key is invalid, blocked or out of funds."):
+    err = Exception(f"Error code: {status} - {msg}")
+    err.status_code = status
+    return err
+
+
+class TestAuthErrorClassification:
+    def test_401_is_auth(self):
+        c = classify_api_error(_auth_error(401))
+        assert c.reason in {FailoverReason.auth, FailoverReason.auth_permanent}
+        assert c.is_auth is True
+
+    def test_403_is_auth(self):
+        c = classify_api_error(_auth_error(403, "forbidden"))
+        assert c.is_auth is True
+
+    def test_500_is_not_auth(self):
+        err = Exception("Error code: 500 - internal server error")
+        err.status_code = 500
+        c = classify_api_error(err)
+        assert c.is_auth is False
+
+
+class TestAuthFailoverGuardFlag:
+    def test_flag_defaults_false(self):
+        assert TurnRetryState().auth_failover_attempted is False
+
+
+class TestAuthFailoverActivation:
+    """The decision the loop makes on a persistent auth failure: when a
+    fallback chain exists and the guard hasn't fired, escalate to it."""
+
+    def _should_failover(self, agent, classified, retry):
+        # Mirrors the gating condition in conversation_loop.py; keep in sync.
+        return (
+            classified.is_auth
+            and not retry.auth_failover_attempted
+            and agent._fallback_index < len(agent._fallback_chain)
+        )
+
+    def test_auth_failover_fires_when_chain_present(self):
+        agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
+        retry = TurnRetryState()
+        classified = classify_api_error(_auth_error(401))
+        assert self._should_failover(agent, classified, retry) is True
+        # And the activation primitive actually advances on an auth reason.
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(_mock_client(), "gpt-4o"),
+        ):
+            advanced = agent._try_activate_fallback(reason=classified.reason)
+        assert advanced is True
+        assert agent._fallback_index == 1
+
+    def test_no_failover_without_chain(self):
+        """A user with no fallback configured (the common case for the
+        original incident) does NOT fail over — the loop falls through to
+        the existing terminal handling + troubleshooting advice."""
+        agent = _make_agent(fallback_model=None)
+        retry = TurnRetryState()
+        classified = classify_api_error(_auth_error(401))
+        assert self._should_failover(agent, classified, retry) is False
+
+    def test_guard_blocks_repeat_failover(self):
+        agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
+        retry = TurnRetryState()
+        retry.auth_failover_attempted = True  # already escalated this attempt
+        classified = classify_api_error(_auth_error(401))
+        assert self._should_failover(agent, classified, retry) is False
+
+    def test_non_auth_error_does_not_trigger_auth_failover(self):
+        agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}])
+        retry = TurnRetryState()
+        err = Exception("Error code: 500 - internal server error")
+        err.status_code = 500
+        classified = classify_api_error(err)
+        assert self._should_failover(agent, classified, retry) is False