fix(auxiliary): honor fallback chain when compression provider auth is unavailable

When an explicit aux provider cannot build a client before any request is
sent (missing raw env key, exhausted/unavailable OAuth or credential-pool
auth, resolver returning (None, None)), call_llm raised a misleading
"no API key was found" error and bypassed the configured fallback_chain
entirely. A provider authenticated through Hermes auth / the credential
pool (e.g. ollama-cloud) whose pool entry is exhausted hit this path, so
compression failed instead of routing to the configured fallback.

Adds _try_configured_fallback_for_unavailable_client() and wires it into
both sync and async call_llm before the raise, and into the startup
compression feasibility check.

Salvaged from #51835 by @herbalizer404.
This commit is contained in:
herbalizer404 2026-06-25 12:50:50 -07:00 committed by Teknium
parent 751adfa6b9
commit b82c83d320
4 changed files with 174 additions and 16 deletions

View file

@ -3205,6 +3205,28 @@ def _try_configured_fallback_chain(
return None, None, ""
def _try_configured_fallback_for_unavailable_client(
task: Optional[str],
failed_provider: str,
) -> Tuple[Optional[Any], Optional[str], str]:
"""Try task fallback_chain when an explicit aux provider cannot build.
This covers the "no client" case before any request is sent: missing
raw env key, unavailable OAuth/pool credentials, or provider resolver
returning ``(None, None)``. It deliberately stops at the configured
per-task fallback chain; the main-agent model remains the last-resort
runtime fallback for request-time capacity errors.
"""
explicit = (failed_provider or "").strip().lower()
if not task or not explicit or explicit in {"auto"}:
return None, None, ""
return _try_configured_fallback_chain(
task,
explicit,
reason="provider unavailable",
)
def _fallback_entry_api_key(entry: Dict[str, Any]) -> Optional[str]:
"""Resolve inline or env-backed API key from a fallback-chain entry."""
explicit = str(entry.get("api_key") or "").strip()
@ -5346,21 +5368,30 @@ def call_llm(
)
if client is None:
# When the user explicitly chose a non-OpenRouter provider but no
# credentials were found, fail fast instead of silently routing
# through OpenRouter (which causes confusing 404s).
# credentials were found, honor the task fallback_chain before
# raising. Missing raw env keys are recoverable for auxiliary
# tasks because fallback entries may use OAuth / credential-pool
# auth (for example openai-codex).
_explicit = (resolved_provider or "").strip().lower()
if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
raise RuntimeError(
f"Provider '{_explicit}' is set in config.yaml but no API key "
f"was found. Set the {_explicit.upper()}_API_KEY environment "
f"variable, or switch to a different provider with `hermes model`."
fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
task, _explicit,
)
if fb_client is not None:
client, final_model = fb_client, fb_model
resolved_provider = fb_label or resolved_provider
else:
raise RuntimeError(
f"Provider '{_explicit}' is set in config.yaml but no API key "
f"was found. Set the {_explicit.upper()}_API_KEY environment "
f"variable, or switch to a different provider with `hermes model`."
)
# For auto/custom with no credentials, try the full auto chain
# rather than hardcoding OpenRouter (which may be depleted).
# Pass model=None so each provider uses its own default —
# resolved_model may be an OpenRouter-format slug that doesn't
# work on other providers.
if not resolved_base_url:
if client is None and not resolved_base_url:
logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
task or "call", resolved_provider)
client, final_model = _get_cached_client("auto", main_runtime=main_runtime, task=task)
@ -5858,12 +5889,21 @@ async def async_call_llm(
if client is None:
_explicit = (resolved_provider or "").strip().lower()
if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
raise RuntimeError(
f"Provider '{_explicit}' is set in config.yaml but no API key "
f"was found. Set the {_explicit.upper()}_API_KEY environment "
f"variable, or switch to a different provider with `hermes model`."
fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
task, _explicit,
)
if not resolved_base_url:
if fb_client is not None:
client, final_model = _to_async_client(
fb_client, fb_model or "", is_vision=(task == "vision")
)
resolved_provider = fb_label or resolved_provider
else:
raise RuntimeError(
f"Provider '{_explicit}' is set in config.yaml but no API key "
f"was found. Set the {_explicit.upper()}_API_KEY environment "
f"variable, or switch to a different provider with `hermes model`."
)
if client is None and not resolved_base_url:
logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
task or "call", resolved_provider)
client, final_model = _get_cached_client("auto", async_mode=True, main_runtime=main_runtime, task=task)

View file

@ -90,6 +90,7 @@ def check_compression_model_feasibility(agent: Any) -> None:
try:
from agent.auxiliary_client import (
_resolve_task_provider_model,
_try_configured_fallback_for_unavailable_client,
get_text_auxiliary_client,
)
from agent.model_metadata import (
@ -97,10 +98,6 @@ def check_compression_model_feasibility(agent: Any) -> None:
get_model_context_length,
)
client, aux_model = get_text_auxiliary_client(
"compression",
main_runtime=agent._current_main_runtime(),
)
# Best-effort aux provider label for the warning message. The
# configured provider may be "auto", in which case we fall back
# to the client's base_url hostname so the user can still tell
@ -109,6 +106,19 @@ def check_compression_model_feasibility(agent: Any) -> None:
_aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression")
except Exception:
_aux_cfg_provider = ""
client, aux_model = get_text_auxiliary_client(
"compression",
main_runtime=agent._current_main_runtime(),
)
if client is None or not aux_model:
fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
"compression",
_aux_cfg_provider,
)
if fb_client is not None and fb_model:
client, aux_model = fb_client, fb_model
if "(" in fb_label and fb_label.endswith(")"):
_aux_cfg_provider = fb_label.rsplit("(", 1)[1][:-1]
if client is None or not aux_model:
if _aux_cfg_provider and _aux_cfg_provider != "auto":
msg = (

View file

@ -1949,6 +1949,81 @@ class TestAuxiliaryFallbackLayering:
"all fallbacks exhausted" in r.message for r in caplog.records
), f"Expected exhaustion warning, got: {[r.message for r in caplog.records]}"
def test_explicit_provider_no_client_uses_configured_chain_before_error(self, monkeypatch):
    """A primary provider with no client must still route through fallback_chain."""
    # Client handed back by the (patched) configured fallback chain.
    fallback_client = MagicMock()
    fallback_client.chat.completions.create.return_value = MagicMock(choices=[
        MagicMock(message=MagicMock(content="from configured chain"))
    ])

    primary_resolution = ("ollama-cloud", "deepseek-v4-flash:cloud", None, None, None)
    chain_outcome = (fallback_client, "gpt-5.4-mini", "fallback_chain[0](openai-codex)")

    # The primary provider yields no client, so call_llm has to consult the chain.
    with patch("agent.auxiliary_client._get_cached_client",
               return_value=(None, None)), \
         patch("agent.auxiliary_client._resolve_task_provider_model",
               return_value=primary_resolution), \
         patch("agent.auxiliary_client._try_configured_fallback_chain",
               return_value=chain_outcome) as chain_spy:
        response = call_llm(
            task="compression",
            messages=[{"role": "user", "content": "hello"}],
        )

    # The request was served by the fallback client, not the primary.
    assert fallback_client.chat.completions.create.called
    assert response.choices[0].message.content == "from configured chain"
    chain_spy.assert_called_once_with(
        "compression",
        "ollama-cloud",
        reason="provider unavailable",
    )
def test_explicit_provider_no_client_without_chain_keeps_clear_error(self, monkeypatch):
    """When the fallback chain yields nothing, the actionable missing-key error remains."""
    primary_resolution = ("ollama-cloud", "deepseek-v4-flash:cloud", None, None, None)
    empty_chain_outcome = (None, None, "")

    with patch("agent.auxiliary_client._get_cached_client",
               return_value=(None, None)), \
         patch("agent.auxiliary_client._resolve_task_provider_model",
               return_value=primary_resolution), \
         patch("agent.auxiliary_client._try_configured_fallback_chain",
               return_value=empty_chain_outcome) as chain_spy:
        with pytest.raises(RuntimeError, match="Provider 'ollama-cloud'.*no API key"):
            call_llm(
                task="compression",
                messages=[{"role": "user", "content": "hello"}],
            )

    # The chain must have been consulted exactly once before the error was raised.
    chain_spy.assert_called_once_with(
        "compression",
        "ollama-cloud",
        reason="provider unavailable",
    )
def test_fallback_entry_openai_codex_uses_oauth_pool_without_inline_key(self):
    """A Codex fallback entry without an inline key resolves via the credential pool."""
    from agent.auxiliary_client import _resolve_fallback_entry

    codex_token = "codex-oauth-token"
    codex_base_url = "https://chatgpt.com/backend-api/codex"

    # Pool entry that the patched selector returns.
    pooled_credential = MagicMock()
    pooled_credential.id = "codex-pool-1"
    pooled_credential.runtime_api_key = codex_token
    pooled_credential.access_token = codex_token
    pooled_credential.runtime_base_url = codex_base_url

    # Stand-in for the client object the OpenAI constructor would produce.
    constructed_client = MagicMock()
    constructed_client.api_key = codex_token
    constructed_client.base_url = codex_base_url

    fallback_entry = {
        "provider": "openai-codex",
        "model": "gpt-5.4-mini",
    }

    # Reading the Codex token directly raises, so the test fails unless the
    # pool credential is what gets used.
    with patch("agent.auxiliary_client._select_pool_entry",
               return_value=(True, pooled_credential)), \
         patch("agent.auxiliary_client._read_codex_access_token",
               side_effect=AssertionError("should use pool token")), \
         patch("agent.auxiliary_client.OpenAI",
               return_value=constructed_client) as openai_ctor:
        client, model = _resolve_fallback_entry(fallback_entry)

    assert client is not None
    assert model == "gpt-5.4-mini"
    openai_ctor.assert_called_once()
    assert openai_ctor.call_args.kwargs["api_key"] == "codex-oauth-token"
class TestTryMainAgentModelFallback:
"""_try_main_agent_model_fallback resolves the user's main provider+model as a safety net."""

View file

@ -306,6 +306,39 @@ def test_warns_when_no_auxiliary_provider(mock_get_client):
assert agent._compression_warning is not None
def test_no_unavailable_warning_when_configured_fallback_chain_resolves():
    """No warning is emitted when the primary is down but the configured fallback resolves."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)

    # Client returned by the patched fallback helper.
    codex_client = MagicMock()
    codex_client.base_url = "https://chatgpt.com/backend-api/codex"
    codex_client.api_key = "codex-oauth-token"

    # Capture every status message the agent emits during the check.
    emitted = []

    def _record_status(msg):
        emitted.append(msg)

    agent._emit_status = _record_status

    primary_resolution = ("ollama-cloud", "deepseek-v4-flash:cloud", None, None, None)
    fallback_outcome = (codex_client, "gpt-5.4-mini", "fallback_chain[0](openai-codex)")

    with patch(
        "agent.auxiliary_client._resolve_task_provider_model",
        return_value=primary_resolution,
    ), patch(
        "agent.auxiliary_client.get_text_auxiliary_client",
        return_value=(None, None),
    ), patch(
        "agent.auxiliary_client._try_configured_fallback_for_unavailable_client",
        return_value=fallback_outcome,
    ) as fallback_spy, patch(
        "agent.model_metadata.get_model_context_length",
        return_value=200_000,
    ) as ctx_len_spy:
        agent._check_compression_model_feasibility()

    # Nothing was surfaced to the user and no warning was recorded.
    assert emitted == []
    assert agent._compression_warning is None

    # The fallback was asked about the configured primary provider, and the
    # context-length lookup used the fallback's model and provider.
    fallback_spy.assert_called_once_with("compression", "ollama-cloud")
    ctx_len_spy.assert_called_once()
    assert ctx_len_spy.call_args.args == ("gpt-5.4-mini",)
    assert ctx_len_spy.call_args.kwargs["provider"] == "openai-codex"
def test_skips_check_when_compression_disabled():
"""No check performed when compression is disabled."""
agent = _make_agent(compression_enabled=False)