mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(pool): re-select from credential pool on primary runtime restore
_restore_primary_runtime restored the construction-time api_key snapshot and never consulted the credential pool. After the pool rotated away from a revoked/exhausted entry mid-session, every new turn restored the dead key, re-failed instantly, burned the remaining entries, and fell through to cross-provider fallback. After restoring the snapshot, re-select the pool's current best entry and swap the live credential in via _swap_credential (which already rebuilds the OpenAI/Anthropic client, reapplies base-url headers, and carries the #33163 base_url / OAuth-detection fixes). Falls back to the snapshot key when the pool is absent, empty, or the entry has no usable key. Salvaged from #25206 onto current main: the original targeted the pre-refactor monolithic method in run_agent.py; the logic now lives in agent/agent_runtime_helpers.py and is collapsed onto _swap_credential instead of re-inlining the client rebuild. Fixes #25205
This commit is contained in:
parent
a590c5efdc
commit
f0de4c6a47
3 changed files with 251 additions and 0 deletions
|
|
@ -1078,6 +1078,34 @@ def restore_primary_runtime(agent) -> bool:
|
|||
api_mode=rt.get("compressor_api_mode", ""),
|
||||
)
|
||||
|
||||
# ── Re-select from the credential pool if one is available ──
|
||||
# The snapshot's api_key was captured at construction time. Across
|
||||
# turns the pool may have rotated (token revocation, billing/rate-limit
|
||||
# exhaustion, cooldown), leaving the snapshot key stale. Restoring it
|
||||
# blindly re-fails on the first request and burns through the remaining
|
||||
# pool entries before cross-provider fallback even gets a chance. Ask
|
||||
# the pool for its current best entry and swap the live credential in.
|
||||
# When the pool is absent, empty, or the entry has no usable key, we
|
||||
# keep the snapshot key (the existing behavior). Fixes #25205.
|
||||
pool = getattr(agent, "_credential_pool", None)
|
||||
if pool is not None and pool.has_available():
|
||||
entry = pool.select()
|
||||
if entry is not None:
|
||||
entry_key = (
|
||||
getattr(entry, "runtime_api_key", None)
|
||||
or getattr(entry, "access_token", "")
|
||||
)
|
||||
if entry_key:
|
||||
# ``_swap_credential`` rebuilds the OpenAI/Anthropic client,
|
||||
# reapplies base-url-scoped headers, and carries the
|
||||
# accumulated base_url / OAuth-detection fixes (#33163).
|
||||
agent._swap_credential(entry)
|
||||
logger.info(
|
||||
"Restore re-selected pool entry %s (%s)",
|
||||
getattr(entry, "id", "?"),
|
||||
getattr(entry, "label", "?"),
|
||||
)
|
||||
|
||||
# ── Reset fallback chain for the new turn ──
|
||||
agent._fallback_activated = False
|
||||
agent._fallback_index = 0
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ AUTHOR_MAP = {
|
|||
"der@konsi.org": "konsisumer", # PR #19608 salvage (read-modify-write merge in write_credential_pool to preserve concurrently-added credentials; #19566)
|
||||
"linyubin@users.noreply.github.com": "linyubin", # PR #50228 salvage (eager fallback on persistent transport timeout/overloaded; #22277)
|
||||
"5261694+djstunami@users.noreply.github.com": "djstunami", # PR #5316 salvage / co-author (suppress transient check_fn flakes so subagents keep file/terminal tools; #21658 / #5304)
|
||||
"jmmaloney4@gmail.com": "jmmaloney4", # PR #25206 salvage (re-select credential pool on primary runtime restore; #25205)
|
||||
"dale@dalenguyen.me": "dalenguyen", # PR #53678 salvage (strip VIRTUAL_ENV/CONDA_PREFIX from terminal subprocess env; #23473)
|
||||
"blaryx@gmail.com": "Blaryxoff", # PR #32602 salvage (deep-merge PUT /api/config to preserve unrelated sections; #13396)
|
||||
"diamondeyesfox@gmail.com": "DiamondEyesFox", # PR #53351 salvage (rebaseline in-place compression flushes to prevent duplicate compacted rows; #9096)
|
||||
|
|
|
|||
222
tests/agent/test_restore_primary_pool_reselect.py
Normal file
222
tests/agent/test_restore_primary_pool_reselect.py
Normal file
|
|
@ -0,0 +1,222 @@
|
|||
# Copyright 2025 Nous Research (Licensed under the Apache License, Version 2.0)
|
||||
"""Test that _restore_primary_runtime re-selects from the credential pool
|
||||
instead of using a stale snapshot key.
|
||||
|
||||
Bug: when a credential pool entry is revoked/marked-exhausted during a turn,
|
||||
_restore_primary_runtime restores the original (now-stale) api_key from the
|
||||
construction-time snapshot. The next turn immediately hits the same error,
|
||||
exhausting remaining entries and falling through to cross-provider fallback.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.credential_pool import (
|
||||
AUTH_TYPE_OAUTH,
|
||||
PooledCredential,
|
||||
)
|
||||
|
||||
|
||||
def _make_entry(
|
||||
label: str,
|
||||
access_token: str,
|
||||
*,
|
||||
source: str = "device_code",
|
||||
priority: int = 0,
|
||||
last_status: str | None = None,
|
||||
last_status_at: float | None = None,
|
||||
) -> dict:
|
||||
return {
|
||||
"id": label,
|
||||
"label": label,
|
||||
"provider": "openai-codex",
|
||||
"auth_type": AUTH_TYPE_OAUTH,
|
||||
"source": source,
|
||||
"priority": priority,
|
||||
"access_token": access_token,
|
||||
"refresh_token": f"rt-{label}",
|
||||
"base_url": "https://chatgpt.com/backend-api/codex",
|
||||
"last_status": last_status,
|
||||
"last_status_at": last_status_at,
|
||||
}
|
||||
|
||||
|
||||
def _build_mock_pool(entries: list[dict], *, strategy: str = "round_robin"):
|
||||
"""Build a mock CredentialPool with the given entries."""
|
||||
from agent.credential_pool import CredentialPool
|
||||
|
||||
pool = CredentialPool(
|
||||
provider="openai-codex",
|
||||
entries=[PooledCredential.from_dict("openai-codex", e) for e in entries],
|
||||
)
|
||||
pool._strategy = strategy
|
||||
return pool
|
||||
|
||||
|
||||
class TestRestorePrimaryPoolReselect:
|
||||
"""_restore_primary_runtime should re-select from the credential pool."""
|
||||
|
||||
def _make_agent(self, pool):
|
||||
"""Create a minimal AIAgent with the given credential pool."""
|
||||
from run_agent import AIAgent
|
||||
|
||||
agent = AIAgent.__new__(AIAgent)
|
||||
agent.model = "gpt-5.5"
|
||||
agent.provider = "openai-codex"
|
||||
agent.base_url = "https://chatgpt.com/backend-api/codex"
|
||||
agent.api_mode = "codex_responses"
|
||||
agent.api_key = "original-key-entry-1"
|
||||
agent._client_kwargs = {
|
||||
"api_key": "original-key-entry-1",
|
||||
"base_url": "https://chatgpt.com/backend-api/codex",
|
||||
}
|
||||
agent._credential_pool = pool
|
||||
agent._fallback_activated = True
|
||||
agent._fallback_index = 1
|
||||
agent._rate_limited_until = 0
|
||||
agent._use_prompt_caching = False
|
||||
agent._use_native_cache_layout = False
|
||||
agent.context_compressor = MagicMock()
|
||||
agent.context_compressor.update_model = MagicMock()
|
||||
|
||||
# Snapshot the original state
|
||||
agent._primary_runtime = {
|
||||
"model": "gpt-5.5",
|
||||
"provider": "openai-codex",
|
||||
"base_url": "https://chatgpt.com/backend-api/codex",
|
||||
"api_mode": "codex_responses",
|
||||
"api_key": "original-key-entry-1",
|
||||
"client_kwargs": {
|
||||
"api_key": "original-key-entry-1",
|
||||
"base_url": "https://chatgpt.com/backend-api/codex",
|
||||
},
|
||||
"use_prompt_caching": False,
|
||||
"use_native_cache_layout": False,
|
||||
"compressor_model": "gpt-5.5",
|
||||
"compressor_base_url": "https://chatgpt.com/backend-api/codex",
|
||||
"compressor_api_key": "original-key-entry-1",
|
||||
"compressor_provider": "openai-codex",
|
||||
"compressor_context_length": 128000,
|
||||
"compressor_threshold_tokens": 0.8,
|
||||
}
|
||||
|
||||
# Mock client creation methods
|
||||
agent._create_openai_client = MagicMock(return_value=MagicMock())
|
||||
agent._apply_client_headers_for_base_url = MagicMock()
|
||||
agent._replace_primary_openai_client = MagicMock(return_value=True)
|
||||
|
||||
return agent
|
||||
|
||||
def test_restore_reselects_from_pool_after_rotation(self):
|
||||
"""After pool rotation, restore should use the new entry, not the stale snapshot key."""
|
||||
entries = [
|
||||
_make_entry("entry-1", "original-key-entry-1", priority=0),
|
||||
_make_entry("entry-2", "rotated-key-entry-2", priority=1),
|
||||
_make_entry("entry-3", "fresh-key-entry-3", priority=2),
|
||||
]
|
||||
pool = _build_mock_pool(entries)
|
||||
|
||||
# Simulate: entry-1 was exhausted, pool rotated to entry-2
|
||||
exhausted = pool._entries[0]
|
||||
pool._mark_exhausted(exhausted, 401)
|
||||
pool._current_id = "entry-2"
|
||||
|
||||
agent = self._make_agent(pool)
|
||||
result = agent._restore_primary_runtime()
|
||||
|
||||
assert result is True
|
||||
# The agent should have the NEW key from entry-2, not the stale snapshot key
|
||||
assert agent.api_key == "rotated-key-entry-2"
|
||||
assert agent._client_kwargs["api_key"] == "rotated-key-entry-2"
|
||||
|
||||
def test_restore_uses_freshest_available_entry(self):
|
||||
"""When multiple entries are available, restore should select the pool's best pick."""
|
||||
entries = [
|
||||
_make_entry("entry-1", "key-1", priority=0,
|
||||
last_status="exhausted", last_status_at=time.time() + 3600),
|
||||
_make_entry("entry-2", "key-2", priority=1),
|
||||
_make_entry("entry-3", "key-3", priority=2),
|
||||
]
|
||||
pool = _build_mock_pool(entries)
|
||||
|
||||
agent = self._make_agent(pool)
|
||||
result = agent._restore_primary_runtime()
|
||||
|
||||
assert result is True
|
||||
# entry-1 is exhausted, so pool should select entry-2
|
||||
assert agent.api_key == "key-2"
|
||||
assert agent._client_kwargs["api_key"] == "key-2"
|
||||
|
||||
def test_restore_without_pool_uses_snapshot(self):
|
||||
"""When no pool exists, restore should use the snapshot key (existing behavior)."""
|
||||
agent = self._make_agent(pool=None)
|
||||
result = agent._restore_primary_runtime()
|
||||
|
||||
assert result is True
|
||||
assert agent.api_key == "original-key-entry-1"
|
||||
|
||||
def test_restore_with_empty_pool_uses_snapshot(self):
|
||||
"""When pool exists but has no available entries, use snapshot key."""
|
||||
entries = [
|
||||
_make_entry("entry-1", "key-1", priority=0,
|
||||
last_status="exhausted", last_status_at=time.time() + 3600),
|
||||
]
|
||||
pool = _build_mock_pool(entries)
|
||||
|
||||
agent = self._make_agent(pool)
|
||||
result = agent._restore_primary_runtime()
|
||||
|
||||
assert result is True
|
||||
# Pool has no available entries, so fall back to snapshot key
|
||||
assert agent.api_key == "original-key-entry-1"
|
||||
|
||||
def test_restore_rebuilds_client_after_reselect(self):
|
||||
"""After re-selecting from pool, client should be rebuilt with new key."""
|
||||
entries = [
|
||||
_make_entry("entry-1", "key-1", priority=0),
|
||||
]
|
||||
pool = _build_mock_pool(entries)
|
||||
|
||||
agent = self._make_agent(pool)
|
||||
result = agent._restore_primary_runtime()
|
||||
|
||||
assert result is True
|
||||
# _swap_credential rebuilds the live OpenAI client with the fresh key.
|
||||
agent._replace_primary_openai_client.assert_called_once_with(
|
||||
reason="credential_rotation",
|
||||
)
|
||||
|
||||
def test_restore_skips_reselect_if_entry_has_no_key(self):
|
||||
"""If pool entry has an empty access token, fall back to snapshot key."""
|
||||
entries = [
|
||||
_make_entry("entry-1", "", priority=0),
|
||||
]
|
||||
pool = _build_mock_pool(entries)
|
||||
|
||||
agent = self._make_agent(pool)
|
||||
result = agent._restore_primary_runtime()
|
||||
|
||||
assert result is True
|
||||
# Entry has no key, so use snapshot
|
||||
assert agent.api_key == "original-key-entry-1"
|
||||
|
||||
def test_restore_updates_base_url_from_pool_entry(self):
|
||||
"""If pool entry has a different base_url, restore should update it."""
|
||||
entries = [
|
||||
{
|
||||
**_make_entry("entry-1", "key-1", priority=0),
|
||||
"base_url": "https://custom-endpoint.example.com/v1",
|
||||
},
|
||||
]
|
||||
pool = _build_mock_pool(entries)
|
||||
|
||||
agent = self._make_agent(pool)
|
||||
result = agent._restore_primary_runtime()
|
||||
|
||||
assert result is True
|
||||
assert "custom-endpoint.example.com" in agent.base_url
|
||||
assert "custom-endpoint.example.com" in agent._client_kwargs["base_url"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue