From 228b7d27bdb9b8461b9650137dd3aa2b739879eb Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 10 May 2026 22:43:14 -0700 Subject: [PATCH] fix(auxiliary): cache 402'd providers as unhealthy with TTL to stop per-call retry storms (#23597) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an auxiliary provider returns HTTP 402 (credit / payment), every subsequent compression / title-gen / session-search / vision call still re-tried it as the FIRST entry in the chain — burning ~1 RTT to hit 402 again, then falling back. On a long Discord/LCM session that meant dozens of doomed 402s per minute (issue #23570). Add a per-process unhealthy-provider cache with a 10 min TTL. When any caller observes a payment error against a provider, the label is marked unhealthy and skipped by: * _resolve_auto Step-1 (main provider use-as-aux path) * _resolve_auto Step-2 (aggregator/fallback chain) * _try_payment_fallback (used by call_llm/acall_llm on first 402) Skip-logs are throttled to once per minute per label so a bursty session doesn't spam agent.log. Entries auto-expire so a topped-up account recovers without manual intervention. The cache is in-process only by design — multi-profile users with different keys per profile must each hit the 402 once. Refs #23570 --- agent/auxiliary_client.py | 156 ++++++++++++++++++++++++-- tests/agent/test_auxiliary_client.py | 162 +++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 11 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index dd3718acfb9..693826920cb 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -1830,6 +1830,113 @@ def _get_provider_chain() -> List[tuple]: ] +# ── Auxiliary "recently 402'd" unhealthy-provider cache ──────────────────── +# +# When an auxiliary provider returns HTTP 402 (Payment Required / credit +# exhaustion), retrying it on every subsequent aux call is wasteful — the +# provider stays depleted for hours or days, but the chain re-tries it as +# the FIRST entry on every compression/title-gen/session-search call, +# burns ~1 RTT, gets 402 again, then falls back. On a long Discord/LCM +# session that adds up to dozens of doomed 402s. +# +# Solution: when ANY caller observes a payment error against a provider, +# mark it unhealthy for ``_AUX_UNHEALTHY_TTL_SECONDS``. ``_resolve_auto`` +# Step-2 and ``_try_payment_fallback`` both consult this cache and skip +# unhealthy entries (logging once per skip-reason so the user sees what +# happened). Entries auto-expire so a topped-up account recovers without +# manual intervention. +# +# Failure isolation: the cache is in-process only. A second hermes +# process won't inherit the unhealthy mark — that's intentional, since +# the user might be running two profiles with different OpenRouter keys. + +_AUX_UNHEALTHY_TTL_SECONDS = 600 # 10 minutes +_aux_unhealthy_until: Dict[str, float] = {} +_aux_unhealthy_logged_at: Dict[str, float] = {} + +# Map provider names that show up in resolved_provider / explicit-config +# back to the chain labels used by _get_provider_chain(). Keep in sync +# with the alias map in _try_payment_fallback below. +_AUX_UNHEALTHY_LABEL_ALIASES = { + "openrouter": "openrouter", + "nous": "nous", + "custom": "local/custom", + "local/custom": "local/custom", + "openai-codex": "openai-codex", + "codex": "openai-codex", +} + + +def _normalize_chain_label(provider: str) -> str: + """Normalize a resolved_provider value to a chain label used by + ``_get_provider_chain()``. Falls back to the lowercased input for + direct API-key providers (deepseek, alibaba, minimax, etc.) which + each report their own provider name from the api-key chain. + """ + if not provider: + return "" + p = str(provider).strip().lower() + return _AUX_UNHEALTHY_LABEL_ALIASES.get(p, p) + + +def _mark_provider_unhealthy(provider: str, ttl: Optional[float] = None) -> None: + """Mark ``provider`` as recently-402'd, hidden from chain iteration + until the TTL expires. Called from the payment-fallback branches in + ``call_llm`` and ``acall_llm`` after a confirmed payment error. + """ + label = _normalize_chain_label(provider) + if not label: + return + expires_at = time.time() + (ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS) + _aux_unhealthy_until[label] = expires_at + logger.warning( + "Auxiliary: marking %s unhealthy for %ds (payment / credit error). " + "Subsequent auxiliary calls will skip it until %s.", + label, + int(ttl if ttl is not None else _AUX_UNHEALTHY_TTL_SECONDS), + time.strftime("%H:%M:%S", time.localtime(expires_at)), + ) + + +def _is_provider_unhealthy(label: str) -> bool: + """True iff ``label`` is in the unhealthy cache and the TTL hasn't expired. + Lazily evicts expired entries so the cache stays small. + """ + if not label: + return False + expires_at = _aux_unhealthy_until.get(label) + if expires_at is None: + return False + if time.time() >= expires_at: + _aux_unhealthy_until.pop(label, None) + _aux_unhealthy_logged_at.pop(label, None) + return False + return True + + +def _log_skip_unhealthy(label: str, task: Optional[str] = None) -> None: + """Emit a single info-level log per minute when we skip an unhealthy + provider. Avoids spamming the log on bursty sessions while still + giving the user a trail. + """ + now = time.time() + last = _aux_unhealthy_logged_at.get(label, 0.0) + if now - last >= 60: + _aux_unhealthy_logged_at[label] = now + expires_at = _aux_unhealthy_until.get(label, now) + logger.info( + "Auxiliary %s: skipping %s (recently returned payment error, retry in %ds)", + task or "call", label, max(0, int(expires_at - now)), + ) + + +def _reset_aux_unhealthy_cache() -> None: + """Clear the unhealthy cache. Used by tests and by a future explicit + user trigger (e.g. ``hermes config aux reset``).""" + _aux_unhealthy_until.clear() + _aux_unhealthy_logged_at.clear() + + def _is_payment_error(exc: Exception) -> bool: """Detect payment/credit/quota exhaustion errors. @@ -2302,6 +2409,10 @@ def _try_payment_fallback( for label, try_fn in _get_provider_chain(): if label in skip_chain_labels: continue + if _is_provider_unhealthy(label): + _log_skip_unhealthy(label, task) + tried.append(f"{label} (unhealthy)") + continue client, model = try_fn() if client is not None: logger.info( @@ -2378,21 +2489,34 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option resolved_provider = "custom" explicit_base_url = runtime_base_url explicit_api_key = runtime_api_key or None - client, resolved = resolve_provider_client( - resolved_provider, - main_model, - explicit_base_url=explicit_base_url, - explicit_api_key=explicit_api_key, - api_mode=runtime_api_mode or None, - ) - if client is not None: - logger.info("Auxiliary auto-detect: using main provider %s (%s)", - main_provider, resolved or main_model) - return client, resolved or main_model + # Skip Step-1 if the main provider was recently 402'd. The unhealthy + # cache TTL bounds how long we bypass it, so a topped-up account + # recovers automatically. If we tried Step-1 anyway, every aux call + # on a depleted main provider would pay one doomed 402 RTT before + # falling to Step-2. + main_chain_label = _normalize_chain_label(resolved_provider) + if main_chain_label and _is_provider_unhealthy(main_chain_label): + _log_skip_unhealthy(main_chain_label) + else: + client, resolved = resolve_provider_client( + resolved_provider, + main_model, + explicit_base_url=explicit_base_url, + explicit_api_key=explicit_api_key, + api_mode=runtime_api_mode or None, + ) + if client is not None: + logger.info("Auxiliary auto-detect: using main provider %s (%s)", + main_provider, resolved or main_model) + return client, resolved or main_model # ── Step 2: aggregator / fallback chain ────────────────────────────── tried = [] for label, try_fn in _get_provider_chain(): + if _is_provider_unhealthy(label): + _log_skip_unhealthy(label) + tried.append(f"{label} (unhealthy)") + continue client, model = try_fn() if client is not None: if tried: @@ -4224,6 +4348,13 @@ def call_llm( if should_fallback and is_auto: if _is_payment_error(first_err): reason = "payment error" + # Resolve the actual provider label (resolved_provider may be + # "auto"; the client's base_url tells us which backend got the + # 402). Mark THAT label unhealthy so subsequent aux calls + # skip it instead of paying another doomed RTT. + _mark_provider_unhealthy( + _recoverable_pool_provider(resolved_provider, client) or resolved_provider + ) elif _is_rate_limit_error(first_err): reason = "rate limit" else: @@ -4546,6 +4677,9 @@ async def async_call_llm( if should_fallback and is_auto: if _is_payment_error(first_err): reason = "payment error" + _mark_provider_unhealthy( + _recoverable_pool_provider(resolved_provider, client) or resolved_provider + ) elif _is_rate_limit_error(first_err): reason = "rate limit" else: diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 702c4c17552..a38f60b7edc 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -2496,3 +2496,165 @@ class TestAnthropicExplicitApiKey: assert mock_build.call_args.args[0] == "explicit-fallback-key", ( "resolve_provider_client must forward explicit_api_key to _try_anthropic()" ) + + +# ── Auxiliary unhealthy-provider TTL cache (issue #23570) ──────────────── + + +class TestAuxUnhealthyCache: + """Recently-402'd providers are skipped on subsequent aux calls. + + Without this, every compression / title-gen / session-search call on a + long session retries a depleted OpenRouter (~1 RTT to 402) before + falling back to the next provider. The TTL cache hides the unhealthy + provider for ``_AUX_UNHEALTHY_TTL_SECONDS`` so the chain skips it. + """ + + def setup_method(self): + from agent.auxiliary_client import _reset_aux_unhealthy_cache + _reset_aux_unhealthy_cache() + + def teardown_method(self): + from agent.auxiliary_client import _reset_aux_unhealthy_cache + _reset_aux_unhealthy_cache() + + def test_mark_then_skip(self): + from agent.auxiliary_client import ( + _mark_provider_unhealthy, + _is_provider_unhealthy, + ) + assert _is_provider_unhealthy("openrouter") is False + _mark_provider_unhealthy("openrouter") + assert _is_provider_unhealthy("openrouter") is True + + def test_ttl_expiry_evicts(self): + from agent.auxiliary_client import ( + _mark_provider_unhealthy, + _is_provider_unhealthy, + _aux_unhealthy_until, + ) + _mark_provider_unhealthy("openrouter", ttl=0.01) + assert _is_provider_unhealthy("openrouter") is True + import time + time.sleep(0.02) + # Lazy eviction: first lookup after expiry returns False AND removes the entry. + assert _is_provider_unhealthy("openrouter") is False + assert "openrouter" not in _aux_unhealthy_until + + def test_alias_normalization(self): + """'codex' should normalize to 'openai-codex' so the cache lookup + matches the chain label.""" + from agent.auxiliary_client import ( + _mark_provider_unhealthy, + _is_provider_unhealthy, + ) + _mark_provider_unhealthy("codex") + assert _is_provider_unhealthy("openai-codex") is True + + def test_resolve_auto_skips_unhealthy_step2(self): + """_resolve_auto Step-2 chain skips unhealthy providers.""" + from agent.auxiliary_client import ( + _resolve_auto, + _mark_provider_unhealthy, + ) + nous_client = MagicMock() + # Mark OpenRouter unhealthy → chain should skip it and pick nous. + _mark_provider_unhealthy("openrouter") + with patch("agent.auxiliary_client._read_main_provider", return_value=""), \ + patch("agent.auxiliary_client._read_main_model", return_value=""), \ + patch("agent.auxiliary_client._try_openrouter") as or_try, \ + patch("agent.auxiliary_client._try_nous", return_value=(nous_client, "nous-model")), \ + patch("agent.auxiliary_client._try_custom_endpoint", return_value=(None, None)), \ + patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)): + client, model = _resolve_auto() + assert client is nous_client + assert model == "nous-model" + # The skipped provider's _try_* should NOT have been called at all. + or_try.assert_not_called() + + def test_resolve_auto_skips_unhealthy_main_in_step1(self): + """Step-1 also consults the unhealthy cache so a depleted main + provider doesn't burn a 402 RTT every aux call. Falls through to + Step-2 chain (which also respects the cache).""" + from agent.auxiliary_client import ( + _resolve_auto, + _mark_provider_unhealthy, + ) + nous_client = MagicMock() + _mark_provider_unhealthy("openrouter") + with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \ + patch("agent.auxiliary_client._read_main_model", return_value="anthropic/claude-sonnet-4.6"), \ + patch("agent.auxiliary_client.resolve_provider_client") as step1, \ + patch("agent.auxiliary_client._try_openrouter") as or_try, \ + patch("agent.auxiliary_client._try_nous", return_value=(nous_client, "n-model")), \ + patch("agent.auxiliary_client._try_custom_endpoint", return_value=(None, None)), \ + patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)): + client, model = _resolve_auto() + # Step-1 was bypassed — resolve_provider_client never invoked + step1.assert_not_called() + # Step-2 also skipped openrouter and landed on nous + or_try.assert_not_called() + assert client is nous_client + + def test_payment_fallback_skips_unhealthy(self): + """_try_payment_fallback also consults the unhealthy cache so a 402 + on OpenRouter doesn't cause a second OR call within the same chain + iteration if it gets re-entered.""" + from agent.auxiliary_client import ( + _try_payment_fallback, + _mark_provider_unhealthy, + ) + nous_client = MagicMock() + # Mark BOTH the failed provider (openrouter) and a sibling (custom) + # unhealthy. The chain should still find nous. + _mark_provider_unhealthy("local/custom") + with patch("agent.auxiliary_client._read_main_provider", return_value="openrouter"), \ + patch("agent.auxiliary_client._try_openrouter") as or_try, \ + patch("agent.auxiliary_client._try_nous", return_value=(nous_client, "n-model")), \ + patch("agent.auxiliary_client._try_custom_endpoint") as custom_try, \ + patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)): + client, model, label = _try_payment_fallback("openrouter", task="compression") + assert client is nous_client + assert label == "nous" + # OR is skipped via skip_chain_labels (failed provider), custom via unhealthy cache. + or_try.assert_not_called() + custom_try.assert_not_called() + + def test_call_llm_marks_provider_unhealthy_on_402(self, monkeypatch): + """A 402 from call_llm causes the provider to be marked unhealthy + so the next call skips it instead of re-trying the same depleted + endpoint.""" + from agent.auxiliary_client import ( + call_llm, + _is_provider_unhealthy, + ) + monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") + + primary_client = MagicMock() + # base_url tells _recoverable_pool_provider() that this is OpenRouter + # (resolved_provider="auto" doesn't carry that information by itself). + primary_client.base_url = "https://openrouter.ai/api/v1/" + err = Exception("Payment Required: insufficient credits") + err.status_code = 402 + primary_client.chat.completions.create.side_effect = err + + nous_client = MagicMock() + nous_resp = MagicMock() + nous_resp.choices = [MagicMock(message=MagicMock(content="ok"))] + nous_client.chat.completions.create.return_value = nous_resp + + with patch("agent.auxiliary_client._get_cached_client", + return_value=(primary_client, "google/gemini-3-flash-preview")), \ + patch("agent.auxiliary_client._resolve_task_provider_model", + return_value=("auto", "google/gemini-3-flash-preview", None, None, None)), \ + patch("agent.auxiliary_client._try_payment_fallback", + return_value=(nous_client, "n-model", "nous")), \ + patch("agent.auxiliary_client._build_call_kwargs", + return_value={"model": "n-model", "messages": [{"role": "user", "content": "hi"}]}): + assert _is_provider_unhealthy("openrouter") is False + call_llm( + task="compression", + messages=[{"role": "user", "content": "hi"}], + ) + # After the 402, OpenRouter is in the unhealthy cache. + assert _is_provider_unhealthy("openrouter") is True