mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(agent): isolate credential pool on provider fallback
Closes #33163.

When _try_activate_fallback() switches from one provider to another (e.g. openai-codex → openrouter), the credential pool still belongs to the primary provider. This causes two compounding bugs:

1. The pool retains the primary's base_url. Downstream pool recovery (rate_limit / billing / auth) calls _swap_credential() with a primary entry, which overwrites the agent's base_url back to the primary's endpoint. Every fallback request then 404s against the wrong host.

2. Pool recovery acting on errors from the FALLBACK provider mutates the PRIMARY's pool state (#33088 reported a related corruption pattern), exhausting/rotating entries that have nothing to do with the failure.

Two layered fixes:

a) try_activate_fallback (agent/chat_completion_helpers.py): on fallback activation, clear agent._credential_pool when the fallback provider doesn't match the pool's provider. Pool is preserved when the fallback shares the pool's provider (e.g. multiple openrouter entries).

b) recover_with_credential_pool (agent/agent_runtime_helpers.py): defensive guard rejects any pool mutation when agent.provider doesn't match pool.provider. Defense-in-depth — should never fire after (a) is in place, but covers any future path that attaches a stale pool.

Salvaged from @zccyman's PR #33217. The original PR was written against the pre-refactor monolithic run_agent.py; both target functions have since been extracted to module-level helpers. Behavior is identical — the guards live in the canonical extracted locations.

Tests

- New tests/run_agent/test_fallback_credential_isolation.py (7 tests covering: fallback clears mismatched pool, fallback preserves matching pool, recovery rejects mismatched pool, recovery accepts matching pool, 429-from-z.ai-doesn't-exhaust-codex-pool, _client_kwargs base_url survives pool clear, _swap_credential doesn't restore primary URL after fallback).
- Cross-verified: 77/77 passing across fallback isolation tests + agent/test_credential_pool.py — no regression.

Co-authored-by: zccyman <16263913+zccyman@users.noreply.github.com>
This commit is contained in:
parent
414a5bc924
commit
2e181602a1
3 changed files with 260 additions and 0 deletions
|
|
@ -560,6 +560,24 @@ def recover_with_credential_pool(
|
|||
if pool is None:
|
||||
return False, has_retried_429
|
||||
|
||||
# Defensive guard: if a fallback provider is active and its provider name
|
||||
# doesn't match the pool's provider, the pool belongs to the PRIMARY
|
||||
# provider. Mutating it based on fallback errors would corrupt the
|
||||
# primary's credential state (see #33088) and, via _swap_credential,
|
||||
# overwrite the agent's base_url back to the primary's endpoint — every
|
||||
# subsequent request then goes to the wrong host and 404s (see #33163).
|
||||
# The pool should only act when the agent is still on the same provider
|
||||
# that seeded the pool.
|
||||
current_provider = (getattr(agent, "provider", "") or "").strip().lower()
|
||||
pool_provider = (getattr(pool, "provider", "") or "").strip().lower()
|
||||
if current_provider and pool_provider and current_provider != pool_provider:
|
||||
_ra().logger.warning(
|
||||
"Credential pool provider mismatch: pool=%s, agent=%s — "
|
||||
"skipping pool mutation to avoid cross-provider contamination",
|
||||
pool_provider, current_provider,
|
||||
)
|
||||
return False, has_retried_429
|
||||
|
||||
effective_reason = classified_reason
|
||||
if effective_reason is None:
|
||||
if status_code == 402:
|
||||
|
|
|
|||
|
|
@ -1042,6 +1042,25 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
|
|||
agent._transport_cache.clear()
|
||||
agent._fallback_activated = True
|
||||
|
||||
# Clear the credential pool when the fallback provider doesn't match
|
||||
# the pool's provider. The pool was seeded for the primary provider;
|
||||
# leaving it attached means downstream recovery (rate_limit / billing /
|
||||
# auth) calls ``_swap_credential`` with a primary entry which overwrites
|
||||
# the agent's ``base_url`` back to the primary's endpoint — every
|
||||
# fallback request then 404s against the wrong host. See #33163.
|
||||
# When the fallback shares the pool's provider (e.g. both openrouter
|
||||
# entries with different routing) the pool is preserved.
|
||||
_existing_pool = getattr(agent, "_credential_pool", None)
|
||||
if _existing_pool is not None:
|
||||
_pool_provider = (getattr(_existing_pool, "provider", "") or "").strip().lower()
|
||||
if _pool_provider and _pool_provider != fb_provider:
|
||||
logger.info(
|
||||
"Fallback to %s/%s: clearing primary credential pool "
|
||||
"(pool_provider=%s) to prevent cross-provider contamination",
|
||||
fb_provider, fb_model, _pool_provider,
|
||||
)
|
||||
agent._credential_pool = None
|
||||
|
||||
# Honor per-provider / per-model request_timeout_seconds for the
|
||||
# fallback target (same knob the primary client uses). None = use
|
||||
# SDK default.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue