From f0de4c6a47204091d20b50961a02ddc02f3d81bc Mon Sep 17 00:00:00 2001 From: Jack Maloney Date: Sat, 27 Jun 2026 19:00:31 -0700 Subject: [PATCH] fix(pool): re-select from credential pool on primary runtime restore _restore_primary_runtime restored the construction-time api_key snapshot and never consulted the credential pool. After the pool rotated away from a revoked/exhausted entry mid-session, every new turn restored the dead key, re-failed instantly, burned the remaining entries, and fell through to cross-provider fallback. After restoring the snapshot, re-select the pool's current best entry and swap the live credential in via _swap_credential (which already rebuilds the OpenAI/Anthropic client, reapplies base-url headers, and carries the #33163 base_url / OAuth-detection fixes). Falls back to the snapshot key when the pool is absent, empty, or the entry has no usable key. Salvaged from #25206 onto current main: the original targeted the pre-refactor monolithic method in run_agent.py; the logic now lives in agent/agent_runtime_helpers.py and is collapsed onto _swap_credential instead of re-inlining the client rebuild. Fixes #25205 --- agent/agent_runtime_helpers.py | 28 +++ scripts/release.py | 1 + .../test_restore_primary_pool_reselect.py | 222 ++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 tests/agent/test_restore_primary_pool_reselect.py diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index 33d2620bec3..a63305dd063 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -1078,6 +1078,34 @@ def restore_primary_runtime(agent) -> bool: api_mode=rt.get("compressor_api_mode", ""), ) + # ── Re-select from the credential pool if one is available ── + # The snapshot's api_key was captured at construction time. Across + # turns the pool may have rotated (token revocation, billing/rate-limit + # exhaustion, cooldown), leaving the snapshot key stale. Restoring it + # blindly re-fails on the first request and burns through the remaining + # pool entries before cross-provider fallback even gets a chance. Ask + # the pool for its current best entry and swap the live credential in. + # When the pool is absent, empty, or the entry has no usable key, we + # keep the snapshot key (the existing behavior). Fixes #25205. + pool = getattr(agent, "_credential_pool", None) + if pool is not None and pool.has_available(): + entry = pool.select() + if entry is not None: + entry_key = ( + getattr(entry, "runtime_api_key", None) + or getattr(entry, "access_token", "") + ) + if entry_key: + # ``_swap_credential`` rebuilds the OpenAI/Anthropic client, + # reapplies base-url-scoped headers, and carries the + # accumulated base_url / OAuth-detection fixes (#33163). + agent._swap_credential(entry) + logger.info( + "Restore re-selected pool entry %s (%s)", + getattr(entry, "id", "?"), + getattr(entry, "label", "?"), + ) + # ── Reset fallback chain for the new turn ── agent._fallback_activated = False agent._fallback_index = 0 diff --git a/scripts/release.py b/scripts/release.py index 989fc61c773..fb8796581f2 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -48,6 +48,7 @@ AUTHOR_MAP = { "der@konsi.org": "konsisumer", # PR #19608 salvage (read-modify-write merge in write_credential_pool to preserve concurrently-added credentials; #19566) "linyubin@users.noreply.github.com": "linyubin", # PR #50228 salvage (eager fallback on persistent transport timeout/overloaded; #22277) "5261694+djstunami@users.noreply.github.com": "djstunami", # PR #5316 salvage / co-author (suppress transient check_fn flakes so subagents keep file/terminal tools; #21658 / #5304) + "jmmaloney4@gmail.com": "jmmaloney4", # PR #25206 salvage (re-select credential pool on primary runtime restore; #25205) "dale@dalenguyen.me": "dalenguyen", # PR #53678 salvage (strip VIRTUAL_ENV/CONDA_PREFIX from terminal subprocess env; #23473) "blaryx@gmail.com": "Blaryxoff", # PR #32602 salvage (deep-merge PUT /api/config to preserve unrelated sections; #13396) "diamondeyesfox@gmail.com": "DiamondEyesFox", # PR #53351 salvage (rebaseline in-place compression flushes to prevent duplicate compacted rows; #9096) diff --git a/tests/agent/test_restore_primary_pool_reselect.py b/tests/agent/test_restore_primary_pool_reselect.py new file mode 100644 index 00000000000..1e526c64484 --- /dev/null +++ b/tests/agent/test_restore_primary_pool_reselect.py @@ -0,0 +1,222 @@ +# Copyright 2025 Nous Research (Licensed under the Apache License, Version 2.0) +"""Test that _restore_primary_runtime re-selects from the credential pool +instead of using a stale snapshot key. + +Bug: when a credential pool entry is revoked/marked-exhausted during a turn, +_restore_primary_runtime restores the original (now-stale) api_key from the +construction-time snapshot. The next turn immediately hits the same error, +exhausting remaining entries and falling through to cross-provider fallback. +""" + +import json +import logging +import time +from unittest.mock import MagicMock, patch + +import pytest + +from agent.credential_pool import ( + AUTH_TYPE_OAUTH, + PooledCredential, +) + + +def _make_entry( + label: str, + access_token: str, + *, + source: str = "device_code", + priority: int = 0, + last_status: str | None = None, + last_status_at: float | None = None, +) -> dict: + return { + "id": label, + "label": label, + "provider": "openai-codex", + "auth_type": AUTH_TYPE_OAUTH, + "source": source, + "priority": priority, + "access_token": access_token, + "refresh_token": f"rt-{label}", + "base_url": "https://chatgpt.com/backend-api/codex", + "last_status": last_status, + "last_status_at": last_status_at, + } + + +def _build_mock_pool(entries: list[dict], *, strategy: str = "round_robin"): + """Build a mock CredentialPool with the given entries.""" + from agent.credential_pool import CredentialPool + + pool = CredentialPool( + provider="openai-codex", + entries=[PooledCredential.from_dict("openai-codex", e) for e in entries], + ) + pool._strategy = strategy + return pool + + +class TestRestorePrimaryPoolReselect: + """_restore_primary_runtime should re-select from the credential pool.""" + + def _make_agent(self, pool): + """Create a minimal AIAgent with the given credential pool.""" + from run_agent import AIAgent + + agent = AIAgent.__new__(AIAgent) + agent.model = "gpt-5.5" + agent.provider = "openai-codex" + agent.base_url = "https://chatgpt.com/backend-api/codex" + agent.api_mode = "codex_responses" + agent.api_key = "original-key-entry-1" + agent._client_kwargs = { + "api_key": "original-key-entry-1", + "base_url": "https://chatgpt.com/backend-api/codex", + } + agent._credential_pool = pool + agent._fallback_activated = True + agent._fallback_index = 1 + agent._rate_limited_until = 0 + agent._use_prompt_caching = False + agent._use_native_cache_layout = False + agent.context_compressor = MagicMock() + agent.context_compressor.update_model = MagicMock() + + # Snapshot the original state + agent._primary_runtime = { + "model": "gpt-5.5", + "provider": "openai-codex", + "base_url": "https://chatgpt.com/backend-api/codex", + "api_mode": "codex_responses", + "api_key": "original-key-entry-1", + "client_kwargs": { + "api_key": "original-key-entry-1", + "base_url": "https://chatgpt.com/backend-api/codex", + }, + "use_prompt_caching": False, + "use_native_cache_layout": False, + "compressor_model": "gpt-5.5", + "compressor_base_url": "https://chatgpt.com/backend-api/codex", + "compressor_api_key": "original-key-entry-1", + "compressor_provider": "openai-codex", + "compressor_context_length": 128000, + "compressor_threshold_tokens": 0.8, + } + + # Mock client creation methods + agent._create_openai_client = MagicMock(return_value=MagicMock()) + agent._apply_client_headers_for_base_url = MagicMock() + agent._replace_primary_openai_client = MagicMock(return_value=True) + + return agent + + def test_restore_reselects_from_pool_after_rotation(self): + """After pool rotation, restore should use the new entry, not the stale snapshot key.""" + entries = [ + _make_entry("entry-1", "original-key-entry-1", priority=0), + _make_entry("entry-2", "rotated-key-entry-2", priority=1), + _make_entry("entry-3", "fresh-key-entry-3", priority=2), + ] + pool = _build_mock_pool(entries) + + # Simulate: entry-1 was exhausted, pool rotated to entry-2 + exhausted = pool._entries[0] + pool._mark_exhausted(exhausted, 401) + pool._current_id = "entry-2" + + agent = self._make_agent(pool) + result = agent._restore_primary_runtime() + + assert result is True + # The agent should have the NEW key from entry-2, not the stale snapshot key + assert agent.api_key == "rotated-key-entry-2" + assert agent._client_kwargs["api_key"] == "rotated-key-entry-2" + + def test_restore_uses_freshest_available_entry(self): + """When multiple entries are available, restore should select the pool's best pick.""" + entries = [ + _make_entry("entry-1", "key-1", priority=0, + last_status="exhausted", last_status_at=time.time() + 3600), + _make_entry("entry-2", "key-2", priority=1), + _make_entry("entry-3", "key-3", priority=2), + ] + pool = _build_mock_pool(entries) + + agent = self._make_agent(pool) + result = agent._restore_primary_runtime() + + assert result is True + # entry-1 is exhausted, so pool should select entry-2 + assert agent.api_key == "key-2" + assert agent._client_kwargs["api_key"] == "key-2" + + def test_restore_without_pool_uses_snapshot(self): + """When no pool exists, restore should use the snapshot key (existing behavior).""" + agent = self._make_agent(pool=None) + result = agent._restore_primary_runtime() + + assert result is True + assert agent.api_key == "original-key-entry-1" + + def test_restore_with_empty_pool_uses_snapshot(self): + """When pool exists but has no available entries, use snapshot key.""" + entries = [ + _make_entry("entry-1", "key-1", priority=0, + last_status="exhausted", last_status_at=time.time() + 3600), + ] + pool = _build_mock_pool(entries) + + agent = self._make_agent(pool) + result = agent._restore_primary_runtime() + + assert result is True + # Pool has no available entries, so fall back to snapshot key + assert agent.api_key == "original-key-entry-1" + + def test_restore_rebuilds_client_after_reselect(self): + """After re-selecting from pool, client should be rebuilt with new key.""" + entries = [ + _make_entry("entry-1", "key-1", priority=0), + ] + pool = _build_mock_pool(entries) + + agent = self._make_agent(pool) + result = agent._restore_primary_runtime() + + assert result is True + # _swap_credential rebuilds the live OpenAI client with the fresh key. + agent._replace_primary_openai_client.assert_called_once_with( + reason="credential_rotation", + ) + + def test_restore_skips_reselect_if_entry_has_no_key(self): + """If pool entry has an empty access token, fall back to snapshot key.""" + entries = [ + _make_entry("entry-1", "", priority=0), + ] + pool = _build_mock_pool(entries) + + agent = self._make_agent(pool) + result = agent._restore_primary_runtime() + + assert result is True + # Entry has no key, so use snapshot + assert agent.api_key == "original-key-entry-1" + + def test_restore_updates_base_url_from_pool_entry(self): + """If pool entry has a different base_url, restore should update it.""" + entries = [ + { + **_make_entry("entry-1", "key-1", priority=0), + "base_url": "https://custom-endpoint.example.com/v1", + }, + ] + pool = _build_mock_pool(entries) + + agent = self._make_agent(pool) + result = agent._restore_primary_runtime() + + assert result is True + assert "custom-endpoint.example.com" in agent.base_url + assert "custom-endpoint.example.com" in agent._client_kwargs["base_url"]