fix(auxiliary): cache 402'd providers as unhealthy with TTL to stop per-call retry storms (#23597)

When an auxiliary provider returns HTTP 402 (credit / payment), every subsequent compression / title-gen / session-search / vision call still re-tried it as the FIRST entry in the chain — burning ~1 RTT to hit 402 again, then falling back. On a long Discord/LCM session that meant dozens of doomed 402s per minute (issue #23570). Add a per-process unhealthy-provider cache with a 10 min TTL. When any caller observes a payment error against a provider, the label is marked unhealthy and skipped by: * _resolve_auto Step-1 (main provider use-as-aux path) * _resolve_auto Step-2 (aggregator/fallback chain) * _try_payment_fallback (used by call_llm/acall_llm on first 402) Skip-logs are throttled to once per minute per label so a bursty session doesn't spam agent.log. Entries auto-expire so a topped-up account recovers without manual intervention. The cache is in-process only by design — multi-profile users with different keys per profile must each hit the 402 once. Refs #23570
2026-05-18 04:41:56 +00:00 · 2026-05-10 22:43:14 -07:00 · 2026-05-10 22:43:14 -07:00 · 228b7d27bd
commit 228b7d27bd
parent ace1c4ea8c
2 changed files with 307 additions and 11 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -1830,6 +1830,113 @@ def _get_provider_chain() -> List[tuple]:
    ]


+# ── Auxiliary "recently 402'd" unhealthy-provider cache ────────────────────
+#
+# When an auxiliary provider returns HTTP 402 (Payment Required / credit
+# exhaustion), retrying it on every subsequent aux call is wasteful — the
+# provider stays depleted for hours or days, but the chain re-tries it as
+# the FIRST entry on every compression/title-gen/session-search call,
+# burns ~1 RTT, gets 402 again, then falls back. On a long Discord/LCM
+# session that adds up to dozens of doomed 402s.
+#
+# Solution: when ANY caller observes a payment error against a provider,
+# mark it unhealthy for ``_AUX_UNHEALTHY_TTL_SECONDS``. ``_resolve_auto``
+# Step-2 and ``_try_payment_fallback`` both consult this cache and skip
+# unhealthy entries (logging once per skip-reason so the user sees what
+# happened). Entries auto-expire so a topped-up account recovers without
+# manual intervention.
+#
+# Failure isolation: the cache is in-process only. A second hermes
+# process won't inherit the unhealthy mark — that's intentional, since
+# the user might be running two profiles with different OpenRouter keys.
+
+_AUX_UNHEALTHY_TTL_SECONDS = 600  # 10 minutes
+_aux_unhealthy_until: Dict[str, float] = {}
+_aux_unhealthy_logged_at: Dict[str, float] = {}
+
+# Map provider names that show up in resolved_provider / explicit-config
+# back to the chain labels used by _get_provider_chain(). Keep in sync
+# with the alias map in _try_payment_fallback below.
+_AUX_UNHEALTHY_LABEL_ALIASES = {
+    "openrouter": "openrouter",
+    "nous": "nous",
+    "custom": "local/custom",
+    "local/custom": "local/custom",
+    "openai-codex": "openai-codex",
+    "codex": "openai-codex",
+}
+
+
+def _normalize_chain_label(provider: str) -> str:
+    """Normalize a resolved_provider value to a chain label used by
+    ``_get_provider_chain()``. Falls back to the lowercased input for
+    direct API-key providers (deepseek, alibaba, minimax, etc.) which
+    each report their own provider name from the api-key chain.
+    """
+    if not provider:
+        return ""
+    p = str(provider).strip().lower()
+    return _AUX_UNHEALTHY_LABEL_ALIASES.get(p, p)
+
+
+def _mark_provider_unhealthy(provider: str, ttl: Optional[float] = None) -> None:
+    """Mark ``provider`` as recently-402'd, hidden from chain iteration
+    until the TTL expires. Called from the payment-fallback branches in
+    ``call_llm`` and ``acall_llm`` after a confirmed payment error.
+    """
+    label = _normalize_chain_label(provider)
+    if not label:
+        return
+    expires_at = time.time() + (ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS)
+    _aux_unhealthy_until[label] = expires_at
+    logger.warning(
+        "Auxiliary: marking %s unhealthy for %ds (payment / credit error). "
+        "Subsequent auxiliary calls will skip it until %s.",
+        label,
+        int(ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS),
+        time.strftime("%H:%M:%S", time.localtime(expires_at)),
+    )
+
+
+def _is_provider_unhealthy(label: str) -> bool:
+    """True iff ``label`` is in the unhealthy cache and the TTL hasn't expired.
+    Lazily evicts expired entries so the cache stays small.
+    """
+    if not label:
+        return False
+    expires_at = _aux_unhealthy_until.get(label)
+    if expires_at is None:
+        return False
+    if time.time() >= expires_at:
+        _aux_unhealthy_until.pop(label, None)
+        _aux_unhealthy_logged_at.pop(label, None)
+        return False
+    return True
+
+
+def _log_skip_unhealthy(label: str, task: Optional[str] = None) -> None:
+    """Emit a single info-level log per minute when we skip an unhealthy
+    provider. Avoids spamming the log on bursty sessions while still
+    giving the user a trail.
+    """
+    now = time.time()
+    last = _aux_unhealthy_logged_at.get(label, 0.0)
+    if now - last >= 60:
+        _aux_unhealthy_logged_at[label] = now
+        expires_at = _aux_unhealthy_until.get(label, now)
+        logger.info(
+            "Auxiliary %s: skipping %s (recently returned payment error, retry in %ds)",
+            task or "call", label, max(0, int(expires_at - now)),
+        )
+
+
+def _reset_aux_unhealthy_cache() -> None:
+    """Clear the unhealthy cache. Used by tests and by a future explicit
+    user trigger (e.g. ``hermes config aux reset``)."""
+    _aux_unhealthy_until.clear()
+    _aux_unhealthy_logged_at.clear()
+
+
 def _is_payment_error(exc: Exception) -> bool:
    """Detect payment/credit/quota exhaustion errors.

@ -2302,6 +2409,10 @@ def _try_payment_fallback(
    for label, try_fn in _get_provider_chain():
        if label in skip_chain_labels:
            continue
+        if _is_provider_unhealthy(label):
+            _log_skip_unhealthy(label, task)
+            tried.append(f"{label} (unhealthy)")
+            continue
        client, model = try_fn()
        if client is not None:
            logger.info(
@ -2378,21 +2489,34 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
            resolved_provider = "custom"
            explicit_base_url = runtime_base_url
            explicit_api_key = runtime_api_key or None
-        client, resolved = resolve_provider_client(
-            resolved_provider,
-            main_model,
-            explicit_base_url=explicit_base_url,
-            explicit_api_key=explicit_api_key,
-            api_mode=runtime_api_mode or None,
-        )
-        if client is not None:
-            logger.info("Auxiliary auto-detect: using main provider %s (%s)",
-                        main_provider, resolved or main_model)
-            return client, resolved or main_model
+        # Skip Step-1 if the main provider was recently 402'd. The unhealthy
+        # cache TTL bounds how long we bypass it, so a topped-up account
+        # recovers automatically. If we tried Step-1 anyway, every aux call
+        # on a depleted main provider would pay one doomed 402 RTT before
+        # falling to Step-2.
+        main_chain_label = _normalize_chain_label(resolved_provider)
+        if main_chain_label and _is_provider_unhealthy(main_chain_label):
+            _log_skip_unhealthy(main_chain_label)
+        else:
+            client, resolved = resolve_provider_client(
+                resolved_provider,
+                main_model,
+                explicit_base_url=explicit_base_url,
+                explicit_api_key=explicit_api_key,
+                api_mode=runtime_api_mode or None,
+            )
+            if client is not None:
+                logger.info("Auxiliary auto-detect: using main provider %s (%s)",
+                            main_provider, resolved or main_model)
+                return client, resolved or main_model

    # ── Step 2: aggregator / fallback chain ──────────────────────────────
    tried = []
    for label, try_fn in _get_provider_chain():
+        if _is_provider_unhealthy(label):
+            _log_skip_unhealthy(label)
+            tried.append(f"{label} (unhealthy)")
+            continue
        client, model = try_fn()
        if client is not None:
            if tried:
@ -4224,6 +4348,13 @@ def call_llm(
        if should_fallback and is_auto:
            if _is_payment_error(first_err):
                reason = "payment error"
+                # Resolve the actual provider label (resolved_provider may be
+                # "auto"; the client's base_url tells us which backend got the
+                # 402). Mark THAT label unhealthy so subsequent aux calls
+                # skip it instead of paying another doomed RTT.
+                _mark_provider_unhealthy(
+                    _recoverable_pool_provider(resolved_provider, client) or resolved_provider
+                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
            else:
@ -4546,6 +4677,9 @@ async def async_call_llm(
        if should_fallback and is_auto:
            if _is_payment_error(first_err):
                reason = "payment error"
+                _mark_provider_unhealthy(
+                    _recoverable_pool_provider(resolved_provider, client) or resolved_provider
+                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
            else: