From f22dd8a75ac0f7c2f78a3174cdf89bc915ac30c5 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 20 Jun 2026 11:13:02 -0700 Subject: [PATCH] fix(agent): fail over to fallback provider on persistent auth failure (401/403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the active provider returns a 401/403 that survives its per-provider credential-refresh attempt (revoked OAuth, blocked/expired key, or an account pinned to a dead/staging inference endpoint), the conversation loop now escalates to the configured fallback chain instead of dead-ending. Before: the generic failover dispatch fired only for {rate_limit, billing}; auth/auth_permanent fell through to 'switch providers manually' advice and never called _try_activate_fallback(). A user whose primary credential was broken kept thrashing on the same dead credential every turn — the main agent appeared 'stuck in fallback mode' while never actually failing over. This also affected auxiliary tasks (compression, vision, title-gen), since auto-resolved aux follows the main provider. After: a persistent auth failure with a configured fallback chain switches to the next provider (mirroring the rate-limit/billing failover path), guarded one-shot per attempt by TurnRetryState.auth_failover_attempted. When no fallback is configured the behavior is unchanged — it falls through to the existing terminal handling and provider-specific troubleshooting guidance. Tests: test_auth_provider_failover.py — 401/403 classify as auth, the gating condition fires only with a chain present + guard unset, the guard blocks repeats, and non-auth (500) errors do not trigger auth failover. 
--- agent/conversation_loop.py | 33 +++++ agent/turn_retry_state.py | 6 + tests/agent/test_turn_retry_state.py | 1 + .../run_agent/test_auth_provider_failover.py | 126 ++++++++++++++++++ 4 files changed, 166 insertions(+) create mode 100644 tests/run_agent/test_auth_provider_failover.py diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 157762f1a1b..8726ba9bd26 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -2824,6 +2824,39 @@ def run_conversation( _retry.primary_recovery_attempted = False continue + # ── Auth-failure provider failover ─────────────────────── + # A 401/403 that survives the per-provider credential-refresh + # attempt above (each guarded by its own + # ``*_auth_retry_attempted`` flag) means the active provider's + # credential or endpoint is broken in a way refreshing can't + # fix (revoked OAuth, blocked/expired key, an account pinned to + # a dead/staging endpoint). Previously the loop only printed + # "switch providers manually" advice and fell through, so a + # user with a configured fallback chain kept thrashing on the + # same dead credential every turn instead of failing over. + # Escalate to the fallback chain here, mirroring the rate- + # limit/billing failover above. With no fallback configured (or + # the chain exhausted) the guard below skips this block; if + # activation fails we fall through too. Either way the existing + # terminal handling + provider-specific guidance is unchanged. + if ( + classified.is_auth + and not _retry.auth_failover_attempted + and agent._fallback_index < len(agent._fallback_chain) + ): + _retry.auth_failover_attempted = True + agent._buffer_status( + "🔐 Authentication failed and could not be refreshed — " + "switching to fallback provider..." 
+ ) + if agent._try_activate_fallback(reason=classified.reason): + active_system_prompt = _sync_failover_system_message( + agent, api_messages, active_system_prompt) + retry_count = 0 + compression_attempts = 0 + _retry.primary_recovery_attempted = False + continue + # ── Nous Portal: record rate limit & skip retries ───── # When Nous returns a 429 that is a genuine account- # level rate limit, record the reset time to a shared diff --git a/agent/turn_retry_state.py b/agent/turn_retry_state.py index 188fe3f1c16..34183bd06be 100644 --- a/agent/turn_retry_state.py +++ b/agent/turn_retry_state.py @@ -58,6 +58,12 @@ class TurnRetryState: primary_recovery_attempted: bool = False has_retried_429: bool = False + # ── Auth-failure provider failover ─────────────────────────────────── + # Set once we've escalated a persistent 401/403 (after the per-provider + # credential-refresh attempt above failed) to the fallback chain, so we + # don't loop on the same auth failover within one attempt. + auth_failover_attempted: bool = False + # ── Restart signals (read by the outer loop after the attempt) ─────── restart_with_compressed_messages: bool = False restart_with_length_continuation: bool = False diff --git a/tests/agent/test_turn_retry_state.py b/tests/agent/test_turn_retry_state.py index 138cca12a64..21b772d6801 100644 --- a/tests/agent/test_turn_retry_state.py +++ b/tests/agent/test_turn_retry_state.py @@ -27,6 +27,7 @@ EXPECTED_FIELDS = { "llama_cpp_grammar_retry_attempted", "primary_recovery_attempted", "has_retried_429", + "auth_failover_attempted", "restart_with_compressed_messages", "restart_with_length_continuation", } diff --git a/tests/run_agent/test_auth_provider_failover.py b/tests/run_agent/test_auth_provider_failover.py new file mode 100644 index 00000000000..1576ef40887 --- /dev/null +++ b/tests/run_agent/test_auth_provider_failover.py @@ -0,0 +1,126 @@ +"""Auth-failure provider failover (conversation loop). 
+ +A 401/403 that survives the per-provider credential-refresh attempt +(revoked OAuth, blocked/expired key, an account pinned to a dead/staging +endpoint) must escalate to the configured fallback chain instead of +thrashing on the same dead credential every turn. + +Before the fix, the conversation loop's generic failover dispatch only +fired for ``{rate_limit, billing}`` reasons; ``auth`` / ``auth_permanent`` +fell through to "switch providers manually" advice and never called +``_try_activate_fallback()``. These tests pin: + + 1. 401/403 classify as auth (``classified.is_auth`` True). + 2. ``_try_activate_fallback`` advances the chain on an auth reason. + 3. The one-shot guard flag exists on TurnRetryState. +""" + +from unittest.mock import MagicMock, patch + +from run_agent import AIAgent +from agent.error_classifier import classify_api_error, FailoverReason +from agent.turn_retry_state import TurnRetryState + + +def _make_agent(fallback_model=None): + with ( + patch("run_agent.get_tool_definitions", return_value=[]), + patch("run_agent.check_toolset_requirements", return_value={}), + patch("run_agent.OpenAI"), + ): + agent = AIAgent( + api_key="test-key", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + fallback_model=fallback_model, + ) + agent.client = MagicMock() + return agent + + +def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"): + mock = MagicMock() + mock.base_url = base_url + mock.api_key = api_key + return mock + + +def _auth_error(status=401, msg="Your API key is invalid, blocked or out of funds."): + err = Exception(f"Error code: {status} - {msg}") + err.status_code = status + return err + + +class TestAuthErrorClassification: + def test_401_is_auth(self): + c = classify_api_error(_auth_error(401)) + assert c.reason in {FailoverReason.auth, FailoverReason.auth_permanent} + assert c.is_auth is True + + def test_403_is_auth(self): + c = 
classify_api_error(_auth_error(403, "forbidden")) + assert c.is_auth is True + + def test_500_is_not_auth(self): + err = Exception("Error code: 500 - internal server error") + err.status_code = 500 + c = classify_api_error(err) + assert c.is_auth is False + + +class TestAuthFailoverGuardFlag: + def test_flag_defaults_false(self): + assert TurnRetryState().auth_failover_attempted is False + + +class TestAuthFailoverActivation: + """The decision the loop makes on a persistent auth failure: when a + fallback chain exists and the guard hasn't fired, escalate to it.""" + + def _should_failover(self, agent, classified, retry): + # Hand-copied mirror of the conversation_loop.py gate; keep in sync. + return ( + classified.is_auth + and not retry.auth_failover_attempted + and agent._fallback_index < len(agent._fallback_chain) + ) + + def test_auth_failover_fires_when_chain_present(self): + agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}]) + retry = TurnRetryState() + classified = classify_api_error(_auth_error(401)) + assert self._should_failover(agent, classified, retry) is True + # And the activation primitive actually advances on an auth reason. 
+ with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(_mock_client(), "gpt-4o"), + ): + advanced = agent._try_activate_fallback(reason=classified.reason) + assert advanced is True + assert agent._fallback_index == 1 + + def test_no_failover_without_chain(self): + """A user with no fallback configured (the common case for the + original incident) does NOT fail over — falls through to the + existing terminal handling + troubleshooting advice.""" + agent = _make_agent(fallback_model=None) + retry = TurnRetryState() + classified = classify_api_error(_auth_error(401)) + assert self._should_failover(agent, classified, retry) is False + + def test_guard_blocks_repeat_failover(self): + agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}]) + retry = TurnRetryState() + retry.auth_failover_attempted = True # already escalated this attempt + classified = classify_api_error(_auth_error(401)) + assert self._should_failover(agent, classified, retry) is False + + def test_non_auth_error_does_not_trigger_auth_failover(self): + agent = _make_agent(fallback_model=[{"provider": "openai", "model": "gpt-4o"}]) + retry = TurnRetryState() + err = Exception("Error code: 500 - internal server error") + err.status_code = 500 + classified = classify_api_error(err) + assert self._should_failover(agent, classified, retry) is False