fix(credential-pool): rotate immediately when credential already exhausted

Closes #26145.

When the user interrupts the retry loop between two 429s (Ctrl-C in
interactive mode, /new, gateway disconnect), the local has_retried_429
flag dies with the recovery function. On the next user prompt the agent
restarts with has_retried_429=False, hits 429 on the exhausted credential,
sets the flag, returns 'retry once'. Repeat forever — the second 429 that
would trigger rotation is never reached, and healthy entries (priority>0
free/paid accounts) are never tried.

Fix: in recover_with_credential_pool's rate_limit branch, pre-check
pool.current().last_status before running the retry-once dance. If the
current entry is already STATUS_EXHAUSTED, rotate immediately. Uses
getattr() for the attribute read so existing tests with SimpleNamespace
mocks (which only set 'label') keep working.

Co-authored-by: zccyman <16263913+zccyman@users.noreply.github.com>
This commit is contained in:
zccyman 2026-05-18 19:55:12 +08:00 committed by Teknium
parent 0a6a0ba527
commit 973bb124a4
2 changed files with 124 additions and 0 deletions

View file

@ -41,6 +41,7 @@ from agent.message_sanitization import (
)
from agent.tool_dispatch_helpers import _trajectory_normalize_msg, make_tool_result_message
from agent.trajectory import convert_scratchpad_to_think
from agent.credential_pool import STATUS_EXHAUSTED
from agent.error_classifier import classify_api_error, FailoverReason
from utils import base_url_host_matches, base_url_hostname, env_var_enabled, atomic_json_write
@ -582,6 +583,29 @@ def recover_with_credential_pool(
return False, has_retried_429
if effective_reason == FailoverReason.rate_limit:
# If current credential is already marked exhausted, skip retry and
# rotate immediately. This prevents the "cancel-between-429s" trap
# where has_retried_429 (a local var) gets reset on each new prompt,
# causing the pool to retry the same exhausted credential forever.
current_entry = pool.current()
current_last_status = getattr(current_entry, "last_status", None) if current_entry else None
if current_last_status == STATUS_EXHAUSTED:
_ra().logger.info(
"Credential already exhausted (last_status=%s) — rotating immediately instead of retrying",
current_last_status,
)
rotate_status = status_code if status_code is not None else 429
next_entry = pool.mark_exhausted_and_rotate(status_code=rotate_status, error_context=error_context)
if next_entry is not None:
_ra().logger.info(
"Credential %s (rate limit, pre-exhausted) — rotated to pool entry %s",
rotate_status,
getattr(next_entry, "id", "?"),
)
agent._swap_credential(next_entry)
return True, False
return False, True
usage_limit_reached = False
if error_context:
context_reason = str(error_context.get("reason") or "").lower()

View file

@ -0,0 +1,100 @@
"""Regression test for #26145: credential pool rotation after interrupt-resume.
When has_retried_429 is lost (user cancels between 429s), the pool should
still rotate if the current credential is already marked exhausted.
"""
import pytest
from unittest.mock import MagicMock, patch
from agent.credential_pool import PooledCredential, STATUS_EXHAUSTED
from agent.error_classifier import FailoverReason
def _make_entry(idx, **overrides):
defaults = dict(
provider="test-provider",
id=f"cred-{idx}",
label=f"Credential {idx}",
auth_type="api_key",
priority=idx,
source="manual",
access_token=f"key-{idx}",
)
defaults.update(overrides)
return PooledCredential(**defaults)
def _make_pool(entries):
pool = MagicMock()
pool.entries = entries
pool.current.return_value = entries[0]
return pool
def test_rotate_immediately_when_credential_already_exhausted():
"""If current credential has last_status='exhausted', rotate on first 429
instead of retrying (Option A fix for #26145)."""
entries = [_make_entry(0, last_status=STATUS_EXHAUSTED, last_error_code=429), _make_entry(1)]
pool = _make_pool(entries)
pool.mark_exhausted_and_rotate.return_value = entries[1]
from run_agent import AIAgent
with patch("run_agent.get_tool_definitions", return_value=[]), patch("run_agent.check_toolset_requirements", return_value={}), patch("run_agent.OpenAI"):
agent = MagicMock(spec=AIAgent)
agent._credential_pool = pool
agent._swap_credential = MagicMock()
recovered, retried = AIAgent._recover_with_credential_pool(
agent,
status_code=429,
has_retried_429=False, # Key: False on first 429 after interrupt
classified_reason=FailoverReason.rate_limit,
)
assert recovered is True
assert retried is False
pool.mark_exhausted_and_rotate.assert_called_once()
agent._swap_credential.assert_called_once_with(entries[1])
def test_normal_retry_when_credential_not_exhausted():
"""When credential is active, first 429 should still retry (existing behavior)."""
entries = [_make_entry(0, last_status=None), _make_entry(1)]
pool = _make_pool(entries)
from run_agent import AIAgent
with patch("run_agent.get_tool_definitions", return_value=[]), patch("run_agent.check_toolset_requirements", return_value={}), patch("run_agent.OpenAI"):
agent = MagicMock(spec=AIAgent)
agent._credential_pool = pool
recovered, retried = AIAgent._recover_with_credential_pool(
agent,
status_code=429,
has_retried_429=False,
classified_reason=FailoverReason.rate_limit,
)
assert recovered is False
assert retried is True
pool.mark_exhausted_and_rotate.assert_not_called()
def test_rotate_on_second_429_when_not_exhausted():
"""When credential is active and this is the second 429, rotate (existing behavior)."""
entries = [_make_entry(0, last_status=None), _make_entry(1)]
pool = _make_pool(entries)
pool.mark_exhausted_and_rotate.return_value = entries[1]
from run_agent import AIAgent
with patch("run_agent.get_tool_definitions", return_value=[]), patch("run_agent.check_toolset_requirements", return_value={}), patch("run_agent.OpenAI"):
agent = MagicMock(spec=AIAgent)
agent._credential_pool = pool
agent._swap_credential = MagicMock()
recovered, retried = AIAgent._recover_with_credential_pool(
agent,
status_code=429,
has_retried_429=True, # Second 429
classified_reason=FailoverReason.rate_limit,
)
assert recovered is True
assert retried is False
pool.mark_exhausted_and_rotate.assert_called_once()