fix(auxiliary): honor fallback chain when compression provider auth is unavailable

When an explicit aux provider cannot build a client before any request is sent (missing raw env key, exhausted/unavailable OAuth or credential-pool auth, resolver returning (None, None)), call_llm raised a misleading "no API key was found" error and bypassed the configured fallback_chain entirely. A provider authenticated through Hermes auth / the credential pool (e.g. ollama-cloud) whose pool entry is exhausted hit this path, so compression failed instead of routing to the configured fallback. Adds _try_configured_fallback_for_unavailable_client() and wires it into both sync and async call_llm before the raise, and into the startup compression feasibility check. Salvaged from #51835 by @herbalizer404.
2026-06-27 11:22:03 +00:00 · 2026-06-25 12:50:50 -07:00 · 2026-06-25 12:50:50 -07:00 · b82c83d320
commit b82c83d320
parent 751adfa6b9
4 changed files with 174 additions and 16 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -3205,6 +3205,28 @@ def _try_configured_fallback_chain(
    return None, None, ""


+def _try_configured_fallback_for_unavailable_client(
+    task: Optional[str],
+    failed_provider: str,
+) -> Tuple[Optional[Any], Optional[str], str]:
+    """Try task fallback_chain when an explicit aux provider cannot build.
+
+    This covers the "no client" case before any request is sent: missing
+    raw env key, unavailable OAuth/pool credentials, or provider resolver
+    returning ``(None, None)``.  It deliberately stops at the configured
+    per-task fallback chain; the main-agent model remains the last-resort
+    runtime fallback for request-time capacity errors.
+    """
+    explicit = (failed_provider or "").strip().lower()
+    if not task or not explicit or explicit in {"auto"}:
+        return None, None, ""
+    return _try_configured_fallback_chain(
+        task,
+        explicit,
+        reason="provider unavailable",
+    )
+
+
 def _fallback_entry_api_key(entry: Dict[str, Any]) -> Optional[str]:
    """Resolve inline or env-backed API key from a fallback-chain entry."""
    explicit = str(entry.get("api_key") or "").strip()
@ -5346,21 +5368,30 @@ def call_llm(
        )
        if client is None:
            # When the user explicitly chose a non-OpenRouter provider but no
-            # credentials were found, fail fast instead of silently routing
-            # through OpenRouter (which causes confusing 404s).
+            # credentials were found, honor the task fallback_chain before
+            # raising.  Missing raw env keys are recoverable for auxiliary
+            # tasks because fallback entries may use OAuth / credential-pool
+            # auth (for example openai-codex).
            _explicit = (resolved_provider or "").strip().lower()
            if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
-                raise RuntimeError(
-                    f"Provider '{_explicit}' is set in config.yaml but no API key "
-                    f"was found. Set the {_explicit.upper()}_API_KEY environment "
-                    f"variable, or switch to a different provider with `hermes model`."
+                fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
+                    task, _explicit,
                )
+                if fb_client is not None:
+                    client, final_model = fb_client, fb_model
+                    resolved_provider = fb_label or resolved_provider
+                else:
+                    raise RuntimeError(
+                        f"Provider '{_explicit}' is set in config.yaml but no API key "
+                        f"was found. Set the {_explicit.upper()}_API_KEY environment "
+                        f"variable, or switch to a different provider with `hermes model`."
+                    )
            # For auto/custom with no credentials, try the full auto chain
            # rather than hardcoding OpenRouter (which may be depleted).
            # Pass model=None so each provider uses its own default —
            # resolved_model may be an OpenRouter-format slug that doesn't
            # work on other providers.
-            if not resolved_base_url:
+            if client is None and not resolved_base_url:
                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
                client, final_model = _get_cached_client("auto", main_runtime=main_runtime, task=task)
@ -5858,12 +5889,21 @@ async def async_call_llm(
        if client is None:
            _explicit = (resolved_provider or "").strip().lower()
            if _explicit and _explicit not in {"auto", "openrouter", "custom"}:
-                raise RuntimeError(
-                    f"Provider '{_explicit}' is set in config.yaml but no API key "
-                    f"was found. Set the {_explicit.upper()}_API_KEY environment "
-                    f"variable, or switch to a different provider with `hermes model`."
+                fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
+                    task, _explicit,
                )
-            if not resolved_base_url:
+                if fb_client is not None:
+                    client, final_model = _to_async_client(
+                        fb_client, fb_model or "", is_vision=(task == "vision")
+                    )
+                    resolved_provider = fb_label or resolved_provider
+                else:
+                    raise RuntimeError(
+                        f"Provider '{_explicit}' is set in config.yaml but no API key "
+                        f"was found. Set the {_explicit.upper()}_API_KEY environment "
+                        f"variable, or switch to a different provider with `hermes model`."
+                    )
+            if client is None and not resolved_base_url:
                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
                client, final_model = _get_cached_client("auto", async_mode=True, main_runtime=main_runtime, task=task)
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@ -90,6 +90,7 @@ def check_compression_model_feasibility(agent: Any) -> None:
    try:
        from agent.auxiliary_client import (
            _resolve_task_provider_model,
+            _try_configured_fallback_for_unavailable_client,
            get_text_auxiliary_client,
        )
        from agent.model_metadata import (
@ -97,10 +98,6 @@ def check_compression_model_feasibility(agent: Any) -> None:
            get_model_context_length,
        )

-        client, aux_model = get_text_auxiliary_client(
-            "compression",
-            main_runtime=agent._current_main_runtime(),
-        )
        # Best-effort aux provider label for the warning message. The
        # configured provider may be "auto", in which case we fall back
        # to the client's base_url hostname so the user can still tell
@ -109,6 +106,19 @@ def check_compression_model_feasibility(agent: Any) -> None:
            _aux_cfg_provider, _, _, _, _ = _resolve_task_provider_model("compression")
        except Exception:
            _aux_cfg_provider = ""
+        client, aux_model = get_text_auxiliary_client(
+            "compression",
+            main_runtime=agent._current_main_runtime(),
+        )
+        if client is None or not aux_model:
+            fb_client, fb_model, fb_label = _try_configured_fallback_for_unavailable_client(
+                "compression",
+                _aux_cfg_provider,
+            )
+            if fb_client is not None and fb_model:
+                client, aux_model = fb_client, fb_model
+                if "(" in fb_label and fb_label.endswith(")"):
+                    _aux_cfg_provider = fb_label.rsplit("(", 1)[1][:-1]
        if client is None or not aux_model:
            if _aux_cfg_provider and _aux_cfg_provider != "auto":
                msg = (
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@ -1949,6 +1949,81 @@ class TestAuxiliaryFallbackLayering:
            "all fallbacks exhausted" in r.message for r in caplog.records
        ), f"Expected exhaustion warning, got: {[r.message for r in caplog.records]}"

+    def test_explicit_provider_no_client_uses_configured_chain_before_error(self, monkeypatch):
+        """Missing primary credentials should still honor auxiliary fallback_chain."""
+        chain_client = MagicMock()
+        chain_client.chat.completions.create.return_value = MagicMock(choices=[
+            MagicMock(message=MagicMock(content="from configured chain"))
+        ])
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                   return_value=(None, None)), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                   return_value=("ollama-cloud", "deepseek-v4-flash:cloud", None, None, None)), \
+             patch("agent.auxiliary_client._try_configured_fallback_chain",
+                   return_value=(chain_client, "gpt-5.4-mini", "fallback_chain[0](openai-codex)")) as mock_chain:
+            result = call_llm(
+                task="compression",
+                messages=[{"role": "user", "content": "hello"}],
+            )
+
+        assert chain_client.chat.completions.create.called
+        assert result.choices[0].message.content == "from configured chain"
+        mock_chain.assert_called_once_with(
+            "compression",
+            "ollama-cloud",
+            reason="provider unavailable",
+        )
+
+    def test_explicit_provider_no_client_without_chain_keeps_clear_error(self, monkeypatch):
+        """No fallback configured: keep the existing actionable missing-key error."""
+        with patch("agent.auxiliary_client._get_cached_client",
+                   return_value=(None, None)), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                   return_value=("ollama-cloud", "deepseek-v4-flash:cloud", None, None, None)), \
+             patch("agent.auxiliary_client._try_configured_fallback_chain",
+                   return_value=(None, None, "")) as mock_chain:
+            with pytest.raises(RuntimeError, match="Provider 'ollama-cloud'.*no API key"):
+                call_llm(
+                    task="compression",
+                    messages=[{"role": "user", "content": "hello"}],
+                )
+
+        mock_chain.assert_called_once_with(
+            "compression",
+            "ollama-cloud",
+            reason="provider unavailable",
+        )
+
+    def test_fallback_entry_openai_codex_uses_oauth_pool_without_inline_key(self):
+        """Configured Codex fallback resolves through Hermes auth / credential pool."""
+        from agent.auxiliary_client import _resolve_fallback_entry
+
+        pool_entry = MagicMock()
+        pool_entry.id = "codex-pool-1"
+        pool_entry.runtime_api_key = "codex-oauth-token"
+        pool_entry.access_token = "codex-oauth-token"
+        pool_entry.runtime_base_url = "https://chatgpt.com/backend-api/codex"
+
+        real_client = MagicMock()
+        real_client.api_key = "codex-oauth-token"
+        real_client.base_url = "https://chatgpt.com/backend-api/codex"
+
+        with patch("agent.auxiliary_client._select_pool_entry",
+                   return_value=(True, pool_entry)), \
+             patch("agent.auxiliary_client._read_codex_access_token",
+                   side_effect=AssertionError("should use pool token")), \
+             patch("agent.auxiliary_client.OpenAI", return_value=real_client) as mock_openai:
+            client, model = _resolve_fallback_entry({
+                "provider": "openai-codex",
+                "model": "gpt-5.4-mini",
+            })
+
+        assert client is not None
+        assert model == "gpt-5.4-mini"
+        mock_openai.assert_called_once()
+        assert mock_openai.call_args.kwargs["api_key"] == "codex-oauth-token"
+

 class TestTryMainAgentModelFallback:
    """_try_main_agent_model_fallback resolves the user's main provider+model as a safety net."""
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@ -306,6 +306,39 @@ def test_warns_when_no_auxiliary_provider(mock_get_client):
    assert agent._compression_warning is not None


+def test_no_unavailable_warning_when_configured_fallback_chain_resolves():
+    """Primary compression provider can be down if configured fallback works."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+    fallback_client = MagicMock()
+    fallback_client.base_url = "https://chatgpt.com/backend-api/codex"
+    fallback_client.api_key = "codex-oauth-token"
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+
+    with patch(
+        "agent.auxiliary_client._resolve_task_provider_model",
+        return_value=("ollama-cloud", "deepseek-v4-flash:cloud", None, None, None),
+    ), patch(
+        "agent.auxiliary_client.get_text_auxiliary_client",
+        return_value=(None, None),
+    ), patch(
+        "agent.auxiliary_client._try_configured_fallback_for_unavailable_client",
+        return_value=(fallback_client, "gpt-5.4-mini", "fallback_chain[0](openai-codex)"),
+    ) as mock_fallback, patch(
+        "agent.model_metadata.get_model_context_length",
+        return_value=200_000,
+    ) as mock_ctx_len:
+        agent._check_compression_model_feasibility()
+
+    assert messages == []
+    assert agent._compression_warning is None
+    mock_fallback.assert_called_once_with("compression", "ollama-cloud")
+    mock_ctx_len.assert_called_once()
+    assert mock_ctx_len.call_args.args == ("gpt-5.4-mini",)
+    assert mock_ctx_len.call_args.kwargs["provider"] == "openai-codex"
+
+
 def test_skips_check_when_compression_disabled():
    """No check performed when compression is disabled."""
    agent = _make_agent(compression_enabled=False)