hermes-agent/tests/agent/test_restore_primary_pool_reselect.py
Jack Maloney f0de4c6a47 fix(pool): re-select from credential pool on primary runtime restore
_restore_primary_runtime restored the construction-time api_key snapshot and
never consulted the credential pool. After the pool rotated away from a
revoked/exhausted entry mid-session, every new turn restored the dead key,
re-failed instantly, burned the remaining entries, and fell through to
cross-provider fallback.

After restoring the snapshot, re-select the pool's current best entry and
swap the live credential in via _swap_credential (which already rebuilds the
OpenAI/Anthropic client, reapplies base-url headers, and carries the #33163
base_url / OAuth-detection fixes). Falls back to the snapshot key when the
pool is absent, empty, or the entry has no usable key.

Salvaged from #25206 onto current main: the original targeted the pre-refactor
monolithic method in run_agent.py; the logic now lives in
agent/agent_runtime_helpers.py and is collapsed onto _swap_credential instead
of re-inlining the client rebuild.

Fixes #25205
2026-06-27 20:04:45 -07:00

222 lines
8.1 KiB
Python

# Copyright 2025 Nous Research (Licensed under the Apache License, Version 2.0)
"""Test that _restore_primary_runtime re-selects from the credential pool
instead of using a stale snapshot key.
Bug: when a credential pool entry is revoked/marked-exhausted during a turn,
_restore_primary_runtime restores the original (now-stale) api_key from the
construction-time snapshot. The next turn immediately hits the same error,
exhausting remaining entries and falling through to cross-provider fallback.
"""
import json
import logging
import time
from unittest.mock import MagicMock, patch
import pytest
from agent.credential_pool import (
AUTH_TYPE_OAUTH,
PooledCredential,
)
def _make_entry(
label: str,
access_token: str,
*,
source: str = "device_code",
priority: int = 0,
last_status: str | None = None,
last_status_at: float | None = None,
) -> dict:
return {
"id": label,
"label": label,
"provider": "openai-codex",
"auth_type": AUTH_TYPE_OAUTH,
"source": source,
"priority": priority,
"access_token": access_token,
"refresh_token": f"rt-{label}",
"base_url": "https://chatgpt.com/backend-api/codex",
"last_status": last_status,
"last_status_at": last_status_at,
}
def _build_mock_pool(entries: list[dict], *, strategy: str = "round_robin"):
"""Build a mock CredentialPool with the given entries."""
from agent.credential_pool import CredentialPool
pool = CredentialPool(
provider="openai-codex",
entries=[PooledCredential.from_dict("openai-codex", e) for e in entries],
)
pool._strategy = strategy
return pool
class TestRestorePrimaryPoolReselect:
"""_restore_primary_runtime should re-select from the credential pool."""
def _make_agent(self, pool):
"""Create a minimal AIAgent with the given credential pool."""
from run_agent import AIAgent
agent = AIAgent.__new__(AIAgent)
agent.model = "gpt-5.5"
agent.provider = "openai-codex"
agent.base_url = "https://chatgpt.com/backend-api/codex"
agent.api_mode = "codex_responses"
agent.api_key = "original-key-entry-1"
agent._client_kwargs = {
"api_key": "original-key-entry-1",
"base_url": "https://chatgpt.com/backend-api/codex",
}
agent._credential_pool = pool
agent._fallback_activated = True
agent._fallback_index = 1
agent._rate_limited_until = 0
agent._use_prompt_caching = False
agent._use_native_cache_layout = False
agent.context_compressor = MagicMock()
agent.context_compressor.update_model = MagicMock()
# Snapshot the original state
agent._primary_runtime = {
"model": "gpt-5.5",
"provider": "openai-codex",
"base_url": "https://chatgpt.com/backend-api/codex",
"api_mode": "codex_responses",
"api_key": "original-key-entry-1",
"client_kwargs": {
"api_key": "original-key-entry-1",
"base_url": "https://chatgpt.com/backend-api/codex",
},
"use_prompt_caching": False,
"use_native_cache_layout": False,
"compressor_model": "gpt-5.5",
"compressor_base_url": "https://chatgpt.com/backend-api/codex",
"compressor_api_key": "original-key-entry-1",
"compressor_provider": "openai-codex",
"compressor_context_length": 128000,
"compressor_threshold_tokens": 0.8,
}
# Mock client creation methods
agent._create_openai_client = MagicMock(return_value=MagicMock())
agent._apply_client_headers_for_base_url = MagicMock()
agent._replace_primary_openai_client = MagicMock(return_value=True)
return agent
def test_restore_reselects_from_pool_after_rotation(self):
"""After pool rotation, restore should use the new entry, not the stale snapshot key."""
entries = [
_make_entry("entry-1", "original-key-entry-1", priority=0),
_make_entry("entry-2", "rotated-key-entry-2", priority=1),
_make_entry("entry-3", "fresh-key-entry-3", priority=2),
]
pool = _build_mock_pool(entries)
# Simulate: entry-1 was exhausted, pool rotated to entry-2
exhausted = pool._entries[0]
pool._mark_exhausted(exhausted, 401)
pool._current_id = "entry-2"
agent = self._make_agent(pool)
result = agent._restore_primary_runtime()
assert result is True
# The agent should have the NEW key from entry-2, not the stale snapshot key
assert agent.api_key == "rotated-key-entry-2"
assert agent._client_kwargs["api_key"] == "rotated-key-entry-2"
def test_restore_uses_freshest_available_entry(self):
"""When multiple entries are available, restore should select the pool's best pick."""
entries = [
_make_entry("entry-1", "key-1", priority=0,
last_status="exhausted", last_status_at=time.time() + 3600),
_make_entry("entry-2", "key-2", priority=1),
_make_entry("entry-3", "key-3", priority=2),
]
pool = _build_mock_pool(entries)
agent = self._make_agent(pool)
result = agent._restore_primary_runtime()
assert result is True
# entry-1 is exhausted, so pool should select entry-2
assert agent.api_key == "key-2"
assert agent._client_kwargs["api_key"] == "key-2"
def test_restore_without_pool_uses_snapshot(self):
"""When no pool exists, restore should use the snapshot key (existing behavior)."""
agent = self._make_agent(pool=None)
result = agent._restore_primary_runtime()
assert result is True
assert agent.api_key == "original-key-entry-1"
def test_restore_with_empty_pool_uses_snapshot(self):
"""When pool exists but has no available entries, use snapshot key."""
entries = [
_make_entry("entry-1", "key-1", priority=0,
last_status="exhausted", last_status_at=time.time() + 3600),
]
pool = _build_mock_pool(entries)
agent = self._make_agent(pool)
result = agent._restore_primary_runtime()
assert result is True
# Pool has no available entries, so fall back to snapshot key
assert agent.api_key == "original-key-entry-1"
def test_restore_rebuilds_client_after_reselect(self):
"""After re-selecting from pool, client should be rebuilt with new key."""
entries = [
_make_entry("entry-1", "key-1", priority=0),
]
pool = _build_mock_pool(entries)
agent = self._make_agent(pool)
result = agent._restore_primary_runtime()
assert result is True
# _swap_credential rebuilds the live OpenAI client with the fresh key.
agent._replace_primary_openai_client.assert_called_once_with(
reason="credential_rotation",
)
def test_restore_skips_reselect_if_entry_has_no_key(self):
"""If pool entry has an empty access token, fall back to snapshot key."""
entries = [
_make_entry("entry-1", "", priority=0),
]
pool = _build_mock_pool(entries)
agent = self._make_agent(pool)
result = agent._restore_primary_runtime()
assert result is True
# Entry has no key, so use snapshot
assert agent.api_key == "original-key-entry-1"
def test_restore_updates_base_url_from_pool_entry(self):
"""If pool entry has a different base_url, restore should update it."""
entries = [
{
**_make_entry("entry-1", "key-1", priority=0),
"base_url": "https://custom-endpoint.example.com/v1",
},
]
pool = _build_mock_pool(entries)
agent = self._make_agent(pool)
result = agent._restore_primary_runtime()
assert result is True
assert "custom-endpoint.example.com" in agent.base_url
assert "custom-endpoint.example.com" in agent._client_kwargs["base_url"]