fix(auxiliary): cache 402'd providers as unhealthy with TTL to stop per-call retry storms (#23597)

When an auxiliary provider returns HTTP 402 (credit / payment), every
subsequent compression / title-gen / session-search / vision call still
re-tried it as the FIRST entry in the chain — burning ~1 RTT to hit 402
again, then falling back. On a long Discord/LCM session that meant dozens
of doomed 402s per minute (issue #23570).

Add a per-process unhealthy-provider cache with a 10 min TTL. When any
caller observes a payment error against a provider, the label is marked
unhealthy and skipped by:
  * _resolve_auto Step-1 (main provider use-as-aux path)
  * _resolve_auto Step-2 (aggregator/fallback chain)
  * _try_payment_fallback (used by call_llm/acall_llm on first 402)

Skip-logs are throttled to once per minute per label so a bursty session
doesn't spam agent.log. Entries auto-expire so a topped-up account
recovers without manual intervention. The cache is in-process only by
design — multi-profile users with different keys per profile must each
hit the 402 once.

Refs #23570
This commit is contained in:
Teknium 2026-05-10 22:43:14 -07:00 committed by GitHub
parent ace1c4ea8c
commit 228b7d27bd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 307 additions and 11 deletions

View file

@ -1830,6 +1830,113 @@ def _get_provider_chain() -> List[tuple]:
]
# ── Auxiliary "recently 402'd" unhealthy-provider cache ────────────────────
#
# When an auxiliary provider returns HTTP 402 (Payment Required / credit
# exhaustion), retrying it on every subsequent aux call is wasteful — the
# provider stays depleted for hours or days, but the chain re-tries it as
# the FIRST entry on every compression/title-gen/session-search call,
# burns ~1 RTT, gets 402 again, then falls back. On a long Discord/LCM
# session that adds up to dozens of doomed 402s.
#
# Solution: when ANY caller observes a payment error against a provider,
# mark it unhealthy for ``_AUX_UNHEALTHY_TTL_SECONDS``. ``_resolve_auto``
# Step-2 and ``_try_payment_fallback`` both consult this cache and skip
# unhealthy entries (logging once per skip-reason so the user sees what
# happened). Entries auto-expire so a topped-up account recovers without
# manual intervention.
#
# Failure isolation: the cache is in-process only. A second hermes
# process won't inherit the unhealthy mark — that's intentional, since
# the user might be running two profiles with different OpenRouter keys.
_AUX_UNHEALTHY_TTL_SECONDS = 600 # 10 minutes
_aux_unhealthy_until: Dict[str, float] = {}
_aux_unhealthy_logged_at: Dict[str, float] = {}
# Map provider names that show up in resolved_provider / explicit-config
# back to the chain labels used by _get_provider_chain(). Keep in sync
# with the alias map in _try_payment_fallback below.
_AUX_UNHEALTHY_LABEL_ALIASES = {
"openrouter": "openrouter",
"nous": "nous",
"custom": "local/custom",
"local/custom": "local/custom",
"openai-codex": "openai-codex",
"codex": "openai-codex",
}
def _normalize_chain_label(provider: str) -> str:
"""Normalize a resolved_provider value to a chain label used by
``_get_provider_chain()``. Falls back to the lowercased input for
direct API-key providers (deepseek, alibaba, minimax, etc.) which
each report their own provider name from the api-key chain.
"""
if not provider:
return ""
p = str(provider).strip().lower()
return _AUX_UNHEALTHY_LABEL_ALIASES.get(p, p)
def _mark_provider_unhealthy(provider: str, ttl: Optional[float] = None) -> None:
"""Mark ``provider`` as recently-402'd, hidden from chain iteration
until the TTL expires. Called from the payment-fallback branches in
``call_llm`` and ``acall_llm`` after a confirmed payment error.
"""
label = _normalize_chain_label(provider)
if not label:
return
expires_at = time.time() + (ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS)
_aux_unhealthy_until[label] = expires_at
logger.warning(
"Auxiliary: marking %s unhealthy for %ds (payment / credit error). "
"Subsequent auxiliary calls will skip it until %s.",
label,
int(ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS),
time.strftime("%H:%M:%S", time.localtime(expires_at)),
)
def _is_provider_unhealthy(label: str) -> bool:
"""True iff ``label`` is in the unhealthy cache and the TTL hasn't expired.
Lazily evicts expired entries so the cache stays small.
"""
if not label:
return False
expires_at = _aux_unhealthy_until.get(label)
if expires_at is None:
return False
if time.time() >= expires_at:
_aux_unhealthy_until.pop(label, None)
_aux_unhealthy_logged_at.pop(label, None)
return False
return True
def _log_skip_unhealthy(label: str, task: Optional[str] = None) -> None:
"""Emit a single info-level log per minute when we skip an unhealthy
provider. Avoids spamming the log on bursty sessions while still
giving the user a trail.
"""
now = time.time()
last = _aux_unhealthy_logged_at.get(label, 0.0)
if now - last >= 60:
_aux_unhealthy_logged_at[label] = now
expires_at = _aux_unhealthy_until.get(label, now)
logger.info(
"Auxiliary %s: skipping %s (recently returned payment error, retry in %ds)",
task or "call", label, max(0, int(expires_at - now)),
)
def _reset_aux_unhealthy_cache() -> None:
"""Clear the unhealthy cache. Used by tests and by a future explicit
user trigger (e.g. ``hermes config aux reset``)."""
_aux_unhealthy_until.clear()
_aux_unhealthy_logged_at.clear()
def _is_payment_error(exc: Exception) -> bool:
"""Detect payment/credit/quota exhaustion errors.
@ -2302,6 +2409,10 @@ def _try_payment_fallback(
for label, try_fn in _get_provider_chain():
if label in skip_chain_labels:
continue
if _is_provider_unhealthy(label):
_log_skip_unhealthy(label, task)
tried.append(f"{label} (unhealthy)")
continue
client, model = try_fn()
if client is not None:
logger.info(
@ -2378,21 +2489,34 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
resolved_provider = "custom"
explicit_base_url = runtime_base_url
explicit_api_key = runtime_api_key or None
client, resolved = resolve_provider_client(
resolved_provider,
main_model,
explicit_base_url=explicit_base_url,
explicit_api_key=explicit_api_key,
api_mode=runtime_api_mode or None,
)
if client is not None:
logger.info("Auxiliary auto-detect: using main provider %s (%s)",
main_provider, resolved or main_model)
return client, resolved or main_model
# Skip Step-1 if the main provider was recently 402'd. The unhealthy
# cache TTL bounds how long we bypass it, so a topped-up account
# recovers automatically. If we tried Step-1 anyway, every aux call
# on a depleted main provider would pay one doomed 402 RTT before
# falling to Step-2.
main_chain_label = _normalize_chain_label(resolved_provider)
if main_chain_label and _is_provider_unhealthy(main_chain_label):
_log_skip_unhealthy(main_chain_label)
else:
client, resolved = resolve_provider_client(
resolved_provider,
main_model,
explicit_base_url=explicit_base_url,
explicit_api_key=explicit_api_key,
api_mode=runtime_api_mode or None,
)
if client is not None:
logger.info("Auxiliary auto-detect: using main provider %s (%s)",
main_provider, resolved or main_model)
return client, resolved or main_model
# ── Step 2: aggregator / fallback chain ──────────────────────────────
tried = []
for label, try_fn in _get_provider_chain():
if _is_provider_unhealthy(label):
_log_skip_unhealthy(label)
tried.append(f"{label} (unhealthy)")
continue
client, model = try_fn()
if client is not None:
if tried:
@ -4224,6 +4348,13 @@ def call_llm(
if should_fallback and is_auto:
if _is_payment_error(first_err):
reason = "payment error"
# Resolve the actual provider label (resolved_provider may be
# "auto"; the client's base_url tells us which backend got the
# 402). Mark THAT label unhealthy so subsequent aux calls
# skip it instead of paying another doomed RTT.
_mark_provider_unhealthy(
_recoverable_pool_provider(resolved_provider, client) or resolved_provider
)
elif _is_rate_limit_error(first_err):
reason = "rate limit"
else:
@ -4546,6 +4677,9 @@ async def async_call_llm(
if should_fallback and is_auto:
if _is_payment_error(first_err):
reason = "payment error"
_mark_provider_unhealthy(
_recoverable_pool_provider(resolved_provider, client) or resolved_provider
)
elif _is_rate_limit_error(first_err):
reason = "rate limit"
else:

View file

@ -2496,3 +2496,165 @@ class TestAnthropicExplicitApiKey:
assert mock_build.call_args.args[0] == "explicit-fallback-key", (
"resolve_provider_client must forward explicit_api_key to _try_anthropic()"
)
# ── Auxiliary unhealthy-provider TTL cache (issue #23570) ────────────────
class TestAuxUnhealthyCache:
"""Recently-402'd providers are skipped on subsequent aux calls.
Without this, every compression / title-gen / session-search call on a
long session retries a depleted OpenRouter (~1 RTT to 402) before
falling back to the next provider. The TTL cache hides the unhealthy
provider for ``_AUX_UNHEALTHY_TTL_SECONDS`` so the chain skips it.
"""
def setup_method(self):
from agent.auxiliary_client import _reset_aux_unhealthy_cache
_reset_aux_unhealthy_cache()
def teardown_method(self):
from agent.auxiliary_client import _reset_aux_unhealthy_cache
_reset_aux_unhealthy_cache()
def test_mark_then_skip(self):
from agent.auxiliary_client import (
_mark_provider_unhealthy,
_is_provider_unhealthy,
)
assert _is_provider_unhealthy("openrouter") is False
_mark_provider_unhealthy("openrouter")
assert _is_provider_unhealthy("openrouter") is True
def test_ttl_expiry_evicts(self):
from agent.auxiliary_client import (
_mark_provider_unhealthy,
_is_provider_unhealthy,
_aux_unhealthy_until,
)
_mark_provider_unhealthy("openrouter", ttl=0.01)
assert _is_provider_unhealthy("openrouter") is True
import time
time.sleep(0.02)
# Lazy eviction: first lookup after expiry returns False AND removes the entry.
assert _is_provider_unhealthy("openrouter") is False
assert "openrouter" not in _aux_unhealthy_until
def test_alias_normalization(self):
"""'codex' should normalize to 'openai-codex' so the cache lookup
matches the chain label."""
from agent.auxiliary_client import (
_mark_provider_unhealthy,
_is_provider_unhealthy,
)
_mark_provider_unhealthy("codex")
assert _is_provider_unhealthy("openai-codex") is True
def test_resolve_auto_skips_unhealthy_step2(self):
"""_resolve_auto Step-2 chain skips unhealthy providers."""
from agent.auxiliary_client import (
_resolve_auto,
_mark_provider_unhealthy,
)
nous_client = MagicMock()
# Mark OpenRouter unhealthy → chain should skip it and pick nous.
_mark_provider_unhealthy("openrouter")
with patch("agent.auxiliary_client._read_main_provider", return_value=""), \
patch("agent.auxiliary_client._read_main_model", return_value=""), \
patch("agent.auxiliary_client._try_openrouter") as or_try, \
patch("agent.auxiliary_client._try_nous", return_value=(nous_client, "nous-model")), \
patch("agent.auxiliary_client._try_custom_endpoint", return_value=(None, None)), \
patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)):
client, model = _resolve_auto()
assert client is nous_client
assert model == "nous-model"
# The skipped provider's _try_* should NOT have been called at all.
or_try.assert_not_called()
def test_resolve_auto_skips_unhealthy_main_in_step1(self):
"""Step-1 also consults the unhealthy cache so a depleted main
provider doesn't burn a 402 RTT every aux call. Falls through to
Step-2 chain (which also respects the cache)."""
from agent.auxiliary_client import (
_resolve_auto,
_mark_provider_unhealthy,
)
nous_client = MagicMock()
_mark_provider_unhealthy("openrouter")
with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
patch("agent.auxiliary_client._read_main_model", return_value="anthropic/claude-sonnet-4.6"), \
patch("agent.auxiliary_client.resolve_provider_client") as step1, \
patch("agent.auxiliary_client._try_openrouter") as or_try, \
patch("agent.auxiliary_client._try_nous", return_value=(nous_client, "n-model")), \
patch("agent.auxiliary_client._try_custom_endpoint", return_value=(None, None)), \
patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)):
client, model = _resolve_auto()
# Step-1 was bypassed — resolve_provider_client never invoked
step1.assert_not_called()
# Step-2 also skipped openrouter and landed on nous
or_try.assert_not_called()
assert client is nous_client
def test_payment_fallback_skips_unhealthy(self):
"""_try_payment_fallback also consults the unhealthy cache so a 402
on OpenRouter doesn't cause a second OR call within the same chain
iteration if it gets re-entered."""
from agent.auxiliary_client import (
_try_payment_fallback,
_mark_provider_unhealthy,
)
nous_client = MagicMock()
# Mark BOTH the failed provider (openrouter) and a sibling (custom)
# unhealthy. The chain should still find nous.
_mark_provider_unhealthy("local/custom")
with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \
patch("agent.auxiliary_client._try_openrouter") as or_try, \
patch("agent.auxiliary_client._try_nous", return_value=(nous_client, "n-model")), \
patch("agent.auxiliary_client._try_custom_endpoint") as custom_try, \
patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)):
client, model, label = _try_payment_fallback("openrouter", task="compression")
assert client is nous_client
assert label == "nous"
# OR is skipped via skip_chain_labels (failed provider), custom via unhealthy cache.
or_try.assert_not_called()
custom_try.assert_not_called()
def test_call_llm_marks_provider_unhealthy_on_402(self, monkeypatch):
"""A 402 from call_llm causes the provider to be marked unhealthy
so the next call skips it instead of re-trying the same depleted
endpoint."""
from agent.auxiliary_client import (
call_llm,
_is_provider_unhealthy,
)
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
primary_client = MagicMock()
# base_url tells _recoverable_pool_provider() that this is OpenRouter
# (resolved_provider="auto" doesn't carry that information by itself).
primary_client.base_url = "https://openrouter.ai/api/v1/"
err = Exception("Payment Required: insufficient credits")
err.status_code = 402
primary_client.chat.completions.create.side_effect = err
nous_client = MagicMock()
nous_resp = MagicMock()
nous_resp.choices = [MagicMock(message=MagicMock(content="ok"))]
nous_client.chat.completions.create.return_value = nous_resp
with patch("agent.auxiliary_client._get_cached_client",
return_value=(primary_client, "google/gemini-3-flash-preview")), \
patch("agent.auxiliary_client._resolve_task_provider_model",
return_value=("auto", "google/gemini-3-flash-preview", None, None, None)), \
patch("agent.auxiliary_client._try_payment_fallback",
return_value=(nous_client, "n-model", "nous")), \
patch("agent.auxiliary_client._build_call_kwargs",
return_value={"model": "n-model", "messages": [{"role": "user", "content": "hi"}]}):
assert _is_provider_unhealthy("openrouter") is False
call_llm(
task="compression",
messages=[{"role": "user", "content": "hi"}],
)
# After the 402, OpenRouter is in the unhealthy cache.
assert _is_provider_unhealthy("openrouter") is True