From 973bb124a415be6dd9948d4a0c2c7690c30f2061 Mon Sep 17 00:00:00 2001 From: zccyman <16263913+zccyman@users.noreply.github.com> Date: Mon, 18 May 2026 19:55:12 +0800 Subject: [PATCH] fix(credential-pool): rotate immediately when credential already exhausted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #26145. When the user interrupts the retry loop between two 429s (Ctrl-C in interactive mode, /new, gateway disconnect), the local has_retried_429 flag dies with the recovery function. On the next user prompt the agent restarts with has_retried_429=False, hits 429 on the exhausted credential, sets the flag, returns 'retry once'. Repeat forever — the second 429 that would trigger rotation is never reached, and healthy entries (priority>0 free/paid accounts) are never tried. Fix: in recover_with_credential_pool's rate_limit branch, pre-check pool.current().last_status before running the retry-once dance. If the current entry is already STATUS_EXHAUSTED, rotate immediately. Uses getattr() for the attribute read so existing tests with SimpleNamespace mocks (which only set 'label') keep working. Co-authored-by: zccyman <16263913+zccyman@users.noreply.github.com> --- agent/agent_runtime_helpers.py | 24 +++++ .../test_credential_pool_interrupt.py | 100 ++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 tests/run_agent/test_credential_pool_interrupt.py diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index f7c8819eb5e..f2d9112ccd3 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -41,6 +41,7 @@ from agent.message_sanitization import ( ) from agent.tool_dispatch_helpers import _trajectory_normalize_msg, make_tool_result_message from agent.trajectory import convert_scratchpad_to_think +from agent.credential_pool import STATUS_EXHAUSTED from agent.error_classifier import classify_api_error, FailoverReason from utils import base_url_host_matches, base_url_hostname, env_var_enabled, atomic_json_write @@ -582,6 +583,29 @@ def recover_with_credential_pool( return False, has_retried_429 if effective_reason == FailoverReason.rate_limit: + # If current credential is already marked exhausted, skip retry and + # rotate immediately. This prevents the "cancel-between-429s" trap + # where has_retried_429 (a local var) gets reset on each new prompt, + # causing the pool to retry the same exhausted credential forever. + current_entry = pool.current() + current_last_status = getattr(current_entry, "last_status", None) if current_entry else None + if current_last_status == STATUS_EXHAUSTED: + _ra().logger.info( + "Credential already exhausted (last_status=%s) — rotating immediately instead of retrying", + current_last_status, + ) + rotate_status = status_code if status_code is not None else 429 + next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context) + if next_entry is not None: + _ra().logger.info( + "Credential %s (rate limit, pre-exhausted) — rotated to pool entry %s", + rotate_status, + getattr(next_entry, "id", "?"), + ) + agent._swap_credential(next_entry) + return True, False + return False, True + usage_limit_reached = False if error_context: context_reason = str(error_context.get("reason") or "").lower() diff --git a/tests/run_agent/test_credential_pool_interrupt.py b/tests/run_agent/test_credential_pool_interrupt.py new file mode 100644 index 00000000000..8484fa003e9 --- /dev/null +++ b/tests/run_agent/test_credential_pool_interrupt.py @@ -0,0 +1,100 @@ +"""Regression test for #26145: credential pool rotation after interrupt-resume. + +When has_retried_429 is lost (user cancels between 429s), the pool should +still rotate if the current credential is already marked exhausted. +""" +import pytest +from unittest.mock import MagicMock, patch + +from agent.credential_pool import PooledCredential, STATUS_EXHAUSTED +from agent.error_classifier import FailoverReason + + +def _make_entry(idx, **overrides): + defaults = dict( + provider="test-provider", + id=f"cred-{idx}", + label=f"Credential {idx}", + auth_type="api_key", + priority=idx, + source="manual", + access_token=f"key-{idx}", + ) + defaults.update(overrides) + return PooledCredential(**defaults) + + +def _make_pool(entries): + pool = MagicMock() + pool.entries = entries + pool.current.return_value = entries[0] + return pool + + +def test_rotate_immediately_when_credential_already_exhausted(): + """If current credential has last_status='exhausted', rotate on first 429 + instead of retrying (Option A fix for #26145).""" + entries = [_make_entry(0, last_status=STATUS_EXHAUSTED, last_error_code=429), _make_entry(1)] + pool = _make_pool(entries) + pool.mark_exhausted_and_rotate.return_value = entries[1] + + from run_agent import AIAgent + with patch("run_agent.get_tool_definitions", return_value=[]), patch("run_agent.check_toolset_requirements", return_value={}), patch("run_agent.OpenAI"): + agent = MagicMock(spec=AIAgent) + agent._credential_pool = pool + agent._swap_credential = MagicMock() + recovered, retried = AIAgent._recover_with_credential_pool( + agent, + status_code=429, + has_retried_429=False, # Key: False on first 429 after interrupt + classified_reason=FailoverReason.rate_limit, + ) + + assert recovered is True + assert retried is False + pool.mark_exhausted_and_rotate.assert_called_once() + agent._swap_credential.assert_called_once_with(entries[1]) + + +def test_normal_retry_when_credential_not_exhausted(): + """When credential is active, first 429 should still retry (existing behavior).""" + entries = [_make_entry(0, last_status=None), _make_entry(1)] + pool = _make_pool(entries) + + from run_agent import AIAgent + with patch("run_agent.get_tool_definitions", return_value=[]), patch("run_agent.check_toolset_requirements", return_value={}), patch("run_agent.OpenAI"): + agent = MagicMock(spec=AIAgent) + agent._credential_pool = pool + recovered, retried = AIAgent._recover_with_credential_pool( + agent, + status_code=429, + has_retried_429=False, + classified_reason=FailoverReason.rate_limit, + ) + + assert recovered is False + assert retried is True + pool.mark_exhausted_and_rotate.assert_not_called() + + +def test_rotate_on_second_429_when_not_exhausted(): + """When credential is active and this is the second 429, rotate (existing behavior).""" + entries = [_make_entry(0, last_status=None), _make_entry(1)] + pool = _make_pool(entries) + pool.mark_exhausted_and_rotate.return_value = entries[1] + + from run_agent import AIAgent + with patch("run_agent.get_tool_definitions", return_value=[]), patch("run_agent.check_toolset_requirements", return_value={}), patch("run_agent.OpenAI"): + agent = MagicMock(spec=AIAgent) + agent._credential_pool = pool + agent._swap_credential = MagicMock() + recovered, retried = AIAgent._recover_with_credential_pool( + agent, + status_code=429, + has_retried_429=True, # Second 429 + classified_reason=FailoverReason.rate_limit, + ) + + assert recovered is True + assert retried is False + pool.mark_exhausted_and_rotate.assert_called_once()