fix(auxiliary): cache 402'd providers as unhealthy with TTL to stop per-call retry storms (#23597)

When an auxiliary provider returns HTTP 402 (credit / payment), every
subsequent compression / title-gen / session-search / vision call still
re-tried it as the FIRST entry in the chain — burning ~1 RTT to hit 402
again, then falling back. On a long Discord/LCM session that meant dozens
of doomed 402s per minute (issue #23570).

Add a per-process unhealthy-provider cache with a 10 min TTL. When any
caller observes a payment error against a provider, the label is marked
unhealthy and skipped by:
  * _resolve_auto Step-1 (main provider use-as-aux path)
  * _resolve_auto Step-2 (aggregator/fallback chain)
  * _try_payment_fallback (used by call_llm/acall_llm on first 402)

Skip-logs are throttled to once per minute per label so a bursty session
doesn't spam agent.log. Entries auto-expire so a topped-up account
recovers without manual intervention. The cache is in-process only by
design — multi-profile users with different keys per profile must each
hit the 402 once.

Refs #23570
This commit is contained in:
Teknium 2026-05-10 22:43:14 -07:00 committed by GitHub
parent ace1c4ea8c
commit 228b7d27bd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 307 additions and 11 deletions

View file

@ -1830,6 +1830,113 @@ def _get_provider_chain() -> List[tuple]:
]
# ── Auxiliary "recently 402'd" unhealthy-provider cache ────────────────────
#
# When an auxiliary provider returns HTTP 402 (Payment Required / credit
# exhaustion), retrying it on every subsequent aux call is wasteful — the
# provider stays depleted for hours or days, but the chain re-tries it as
# the FIRST entry on every compression/title-gen/session-search call,
# burns ~1 RTT, gets 402 again, then falls back. On a long Discord/LCM
# session that adds up to dozens of doomed 402s.
#
# Solution: when ANY caller observes a payment error against a provider,
# mark it unhealthy for ``_AUX_UNHEALTHY_TTL_SECONDS``. ``_resolve_auto``
# Step-2 and ``_try_payment_fallback`` both consult this cache and skip
# unhealthy entries (logging once per skip-reason so the user sees what
# happened). Entries auto-expire so a topped-up account recovers without
# manual intervention.
#
# Failure isolation: the cache is in-process only. A second hermes
# process won't inherit the unhealthy mark — that's intentional, since
# the user might be running two profiles with different OpenRouter keys.
_AUX_UNHEALTHY_TTL_SECONDS = 600 # 10 minutes
_aux_unhealthy_until: Dict[str, float] = {}
_aux_unhealthy_logged_at: Dict[str, float] = {}
# Map provider names that show up in resolved_provider / explicit-config
# back to the chain labels used by _get_provider_chain(). Keep in sync
# with the alias map in _try_payment_fallback below.
_AUX_UNHEALTHY_LABEL_ALIASES = {
"openrouter": "openrouter",
"nous": "nous",
"custom": "local/custom",
"local/custom": "local/custom",
"openai-codex": "openai-codex",
"codex": "openai-codex",
}
def _normalize_chain_label(provider: str) -> str:
"""Normalize a resolved_provider value to a chain label used by
``_get_provider_chain()``. Falls back to the lowercased input for
direct API-key providers (deepseek, alibaba, minimax, etc.) which
each report their own provider name from the api-key chain.
"""
if not provider:
return ""
p = str(provider).strip().lower()
return _AUX_UNHEALTHY_LABEL_ALIASES.get(p, p)
def _mark_provider_unhealthy(provider: str, ttl: Optional[float] = None) -> None:
"""Mark ``provider`` as recently-402'd, hidden from chain iteration
until the TTL expires. Called from the payment-fallback branches in
``call_llm`` and ``acall_llm`` after a confirmed payment error.
"""
label = _normalize_chain_label(provider)
if not label:
return
expires_at = time.time() + (ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS)
_aux_unhealthy_until[label] = expires_at
logger.warning(
"Auxiliary: marking %s unhealthy for %ds (payment / credit error). "
"Subsequent auxiliary calls will skip it until %s.",
label,
int(ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS),
time.strftime("%H:%M:%S", time.localtime(expires_at)),
)
def _is_provider_unhealthy(label: str) -> bool:
"""True iff ``label`` is in the unhealthy cache and the TTL hasn't expired.
Lazily evicts expired entries so the cache stays small.
"""
if not label:
return False
expires_at = _aux_unhealthy_until.get(label)
if expires_at is None:
return False
if time.time() >= expires_at:
_aux_unhealthy_until.pop(label, None)
_aux_unhealthy_logged_at.pop(label, None)
return False
return True
def _log_skip_unhealthy(label: str, task: Optional[str] = None) -> None:
"""Emit a single info-level log per minute when we skip an unhealthy
provider. Avoids spamming the log on bursty sessions while still
giving the user a trail.
"""
now = time.time()
last = _aux_unhealthy_logged_at.get(label, 0.0)
if now - last >= 60:
_aux_unhealthy_logged_at[label] = now
expires_at = _aux_unhealthy_until.get(label, now)
logger.info(
"Auxiliary %s: skipping %s (recently returned payment error, retry in %ds)",
task or "call", label, max(0, int(expires_at - now)),
)
def _reset_aux_unhealthy_cache() -> None:
"""Clear the unhealthy cache. Used by tests and by a future explicit
user trigger (e.g. ``hermes config aux reset``)."""
_aux_unhealthy_until.clear()
_aux_unhealthy_logged_at.clear()
def _is_payment_error(exc: Exception) -> bool:
"""Detect payment/credit/quota exhaustion errors.
@ -2302,6 +2409,10 @@ def _try_payment_fallback(
for label, try_fn in _get_provider_chain():
if label in skip_chain_labels:
continue
if _is_provider_unhealthy(label):
_log_skip_unhealthy(label, task)
tried.append(f"{label} (unhealthy)")
continue
client, model = try_fn()
if client is not None:
logger.info(
@ -2378,21 +2489,34 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
resolved_provider = "custom"
explicit_base_url = runtime_base_url
explicit_api_key = runtime_api_key or None
client, resolved = resolve_provider_client(
resolved_provider,
main_model,
explicit_base_url=explicit_base_url,
explicit_api_key=explicit_api_key,
api_mode=runtime_api_mode or None,
)
if client is not None:
logger.info("Auxiliary auto-detect: using main provider %s (%s)",
main_provider, resolved or main_model)
return client, resolved or main_model
# Skip Step-1 if the main provider was recently 402'd. The unhealthy
# cache TTL bounds how long we bypass it, so a topped-up account
# recovers automatically. If we tried Step-1 anyway, every aux call
# on a depleted main provider would pay one doomed 402 RTT before
# falling to Step-2.
main_chain_label = _normalize_chain_label(resolved_provider)
if main_chain_label and _is_provider_unhealthy(main_chain_label):
_log_skip_unhealthy(main_chain_label)
else:
client, resolved = resolve_provider_client(
resolved_provider,
main_model,
explicit_base_url=explicit_base_url,
explicit_api_key=explicit_api_key,
api_mode=runtime_api_mode or None,
)
if client is not None:
logger.info("Auxiliary auto-detect: using main provider %s (%s)",
main_provider, resolved or main_model)
return client, resolved or main_model
# ── Step 2: aggregator / fallback chain ──────────────────────────────
tried = []
for label, try_fn in _get_provider_chain():
if _is_provider_unhealthy(label):
_log_skip_unhealthy(label)
tried.append(f"{label} (unhealthy)")
continue
client, model = try_fn()
if client is not None:
if tried:
@ -4224,6 +4348,13 @@ def call_llm(
if should_fallback and is_auto:
if _is_payment_error(first_err):
reason = "payment error"
# Resolve the actual provider label (resolved_provider may be
# "auto"; the client's base_url tells us which backend got the
# 402). Mark THAT label unhealthy so subsequent aux calls
# skip it instead of paying another doomed RTT.
_mark_provider_unhealthy(
_recoverable_pool_provider(resolved_provider, client) or resolved_provider
)
elif _is_rate_limit_error(first_err):
reason = "rate limit"
else:
@ -4546,6 +4677,9 @@ async def async_call_llm(
if should_fallback and is_auto:
if _is_payment_error(first_err):
reason = "payment error"
_mark_provider_unhealthy(
_recoverable_pool_provider(resolved_provider, client) or resolved_provider
)
elif _is_rate_limit_error(first_err):
reason = "rate limit"
else: