fix(auxiliary): honor main fallback chain for auto tasks (#47235)

This commit is contained in:
Teknium 2026-06-16 06:23:24 -07:00 committed by GitHub
parent 4d470b3dbb
commit 4858942c55
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 290 additions and 38 deletions

View file

@ -3079,23 +3079,20 @@ def _try_configured_fallback_chain(
if not fb_provider or fb_provider.lower() == skip:
continue
fb_model = str(entry.get("model", "")).strip() or None
fb_base_url = str(entry.get("base_url", "")).strip() or None
fb_api_key = str(entry.get("api_key", "")).strip() or None
label = f"fallback_chain[{i}]({fb_provider})"
try:
fb_client = _resolve_single_provider(
fb_provider, fb_model, fb_base_url, fb_api_key)
fb_client, resolved_model = _resolve_fallback_entry(entry)
except Exception:
fb_client = None
fb_client, resolved_model = None, None
if fb_client is not None:
logger.info(
"Auxiliary %s: %s on %s — configured fallback to %s (%s)",
task, reason, failed_provider, label, fb_model or "default",
task, reason, failed_provider, label, resolved_model or fb_model or "default",
)
return fb_client, fb_model, label
return fb_client, resolved_model or fb_model, label
tried.append(label)
if tried:
@ -3106,6 +3103,103 @@ def _try_configured_fallback_chain(
return None, None, ""
def _fallback_entry_api_key(entry: Dict[str, Any]) -> Optional[str]:
    """Resolve the API key for one fallback-chain entry.

    Lookup order:
      1. the inline ``api_key`` value,
      2. the environment variable named by ``key_env``,
      3. the environment variable named by ``api_key_env``.

    Values are whitespace-stripped, and a blank value at any step counts as
    missing so the next source is consulted.

    Args:
        entry: A fallback-chain entry mapping.

    Returns:
        The first non-empty key found, or ``None`` when no source yields one.
    """
    inline_key = str(entry.get("api_key") or "").strip()
    if inline_key:
        return inline_key

    # Try each env-name field in turn instead of committing to the first one
    # that is merely present: a ``key_env`` that is blank or points at an
    # unset variable must not hide a usable ``api_key_env``.
    for field in ("key_env", "api_key_env"):
        env_name = str(entry.get(field) or "").strip()
        if not env_name:
            continue
        env_value = os.getenv(env_name, "").strip()
        if env_value:
            return env_value
    return None
def _resolve_fallback_entry(entry: Dict[str, Any]) -> Tuple[Optional[Any], Optional[str]]:
    """Turn a single fallback-chain entry into a client via ``resolve_provider_client``.

    The entry's ``provider`` and ``model`` are both required; when either is
    missing or blank the router is not called and ``(None, None)`` is
    returned. ``base_url``, the API key (inline or env-backed) and
    ``api_mode`` (with ``transport`` accepted as an alternative key) are
    forwarded when present.

    Returns:
        Whatever ``resolve_provider_client`` returns for the entry, or
        ``(None, None)`` for an incomplete entry.
    """

    def _clean(raw: Any) -> Optional[str]:
        # Normalise to a stripped string; blank or missing values become None.
        return str(raw or "").strip() or None

    provider_name = _clean(entry.get("provider"))
    model_name = _clean(entry.get("model"))
    if provider_name is None or model_name is None:
        return None, None

    endpoint = _clean(entry.get("base_url"))
    credential = _fallback_entry_api_key(entry)
    mode = _clean(entry.get("api_mode") or entry.get("transport"))

    return resolve_provider_client(
        provider_name,
        model=model_name,
        explicit_base_url=endpoint,
        explicit_api_key=credential,
        api_mode=mode,
    )
def _try_main_fallback_chain(
    task: Optional[str],
    failed_provider: str = "",
    reason: str = "error",
) -> Tuple[Optional[Any], Optional[str], str]:
    """Walk the main agent's top-level fallback chain for an auxiliary call.

    Auxiliary tasks on ``provider: auto`` should follow the fallback policy
    the user declared for the main agent before Hermes' built-in discovery
    chain is used. The chain is obtained from ``get_fallback_chain`` so that
    modern ``fallback_providers`` and legacy ``fallback_model`` entries are
    visited in the same order the main agent uses.

    Entries are passed over when they are not dicts, lack a provider or
    model, name the failed provider, the main provider or ``auto``, or name a
    provider currently reported unhealthy.

    Args:
        task: Auxiliary task name; logged as ``"call"`` when empty.
        failed_provider: Provider that could not serve the call.
        reason: Short description of the failure, used in log output.

    Returns:
        ``(client, model, provider)`` for the first entry that resolves to a
        client, otherwise ``(None, None, "")``.
    """
    task_name = task or "call"
    nothing: Tuple[Optional[Any], Optional[str], str] = (None, None, "")

    try:
        # Imported lazily; any failure to import or load config is treated
        # as "no main fallback chain available".
        from hermes_cli.config import load_config
        from hermes_cli.fallback_config import get_fallback_chain

        chain = get_fallback_chain(load_config())
    except Exception as exc:
        logger.debug("Auxiliary %s: could not load main fallback chain: %s", task_name, exc)
        return nothing
    if not chain:
        return nothing

    # Providers that must not be retried: the one that just failed, the main
    # provider, and the "auto" pseudo-provider.
    excluded = {
        name
        for name in (
            (failed_provider or "").strip().lower(),
            (_read_main_provider() or "").strip().lower(),
            "auto",
        )
        if name
    }

    attempts: List[str] = []
    for index, entry in enumerate(chain):
        if not isinstance(entry, dict):
            continue
        provider_name = str(entry.get("provider") or "").strip()
        model_name = str(entry.get("model") or "").strip()
        if not (provider_name and model_name):
            continue

        provider_key = provider_name.lower()
        label = f"fallback_providers[{index}]({provider_name})"

        if provider_key in excluded:
            attempts.append(f"{label} (skipped)")
            continue
        if _is_provider_unhealthy(provider_key):
            _log_skip_unhealthy(provider_key, task)
            attempts.append(f"{label} (unhealthy)")
            continue

        try:
            client, resolved_model = _resolve_fallback_entry(entry)
        except Exception as exc:
            logger.debug("Auxiliary %s: main fallback %s failed to resolve: %s", task_name, label, exc)
            client, resolved_model = None, None

        if client is None:
            attempts.append(label)
            continue

        chosen_model = resolved_model or model_name
        logger.info(
            "Auxiliary %s: %s on %s — main fallback chain to %s (%s)",
            task_name, reason, failed_provider or "auto", label,
            chosen_model,
        )
        return client, chosen_model, provider_name

    if attempts:
        logger.debug(
            "Auxiliary %s: main fallback chain exhausted (tried: %s)",
            task_name, ", ".join(attempts),
        )
    return nothing
def _resolve_single_provider(
provider: str,
model: Optional[str] = None,
@ -3116,16 +3210,19 @@ def _resolve_single_provider(
Uses the existing provider resolution infrastructure where possible.
"""
# Reuse resolve_provider_client which handles provider→client mapping
# Reuse resolve_provider_client which handles provider→client mapping.
client, resolved_model = resolve_provider_client(
provider=provider,
model=model,
base_url=base_url,
api_key=api_key,
explicit_base_url=base_url,
explicit_api_key=api_key,
)
return client
def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Optional[OpenAI], Optional[str]]:
def _resolve_auto(
main_runtime: Optional[Dict[str, Any]] = None,
task: Optional[str] = None,
) -> Tuple[Optional[OpenAI], Optional[str]]:
"""Full auto-detection chain.
Priority:
@ -3223,7 +3320,22 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
main_provider, resolved or main_model)
return client, resolved or main_model
# ── Step 2: aggregator / fallback chain ──────────────────────────────
# ── Step 2: user-configured fallback policy ─────────────────────────
# In auto mode, respect the task-specific fallback chain first, then the
# main agent's top-level fallback_providers/fallback_model chain. The
# hardcoded provider discovery chain below is only the convenience default
# for users who have not declared a fallback policy.
if task:
fb_client, fb_model, _fb_label = _try_configured_fallback_chain(
task, main_provider or "auto", reason="main provider unavailable")
if fb_client is not None:
return fb_client, fb_model
fb_client, fb_model, _fb_label = _try_main_fallback_chain(
task, main_provider or "auto", reason="main provider unavailable")
if fb_client is not None:
return fb_client, fb_model
# ── Step 3: aggregator / fallback chain ──────────────────────────────
tried = []
for label, try_fn in _get_provider_chain():
if _is_provider_unhealthy(label):
@ -3344,6 +3456,7 @@ def resolve_provider_client(
api_mode: str = None,
main_runtime: Optional[Dict[str, Any]] = None,
is_vision: bool = False,
task: Optional[str] = None,
) -> Tuple[Optional[Any], Optional[str]]:
"""Central router: given a provider name and optional model, return a
configured client with the correct auth, base URL, and API format.
@ -3464,7 +3577,7 @@ def resolve_provider_client(
# ── Auto: try all providers in priority order ────────────────────
if provider == "auto":
client, resolved = _resolve_auto(main_runtime=main_runtime)
client, resolved = _resolve_auto(main_runtime=main_runtime, task=task)
if client is None:
return None, None
# When auto-detection lands on a non-OpenRouter provider (e.g. a
@ -4357,11 +4470,16 @@ def _client_cache_key(
api_mode: Optional[str] = None,
main_runtime: Optional[Dict[str, Any]] = None,
is_vision: bool = False,
task: Optional[str] = None,
) -> tuple:
runtime = _normalize_main_runtime(main_runtime)
runtime_key = tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) if provider == "auto" else ()
# `auto` can now resolve through task-specific or main fallback policy,
# so the task participates in the cache key. Non-auto providers keep the
# old cache shape because the explicit provider/model tuple is sufficient.
task_key = (task or "") if provider == "auto" else ""
pool_hint = _pool_cache_hint(provider, main_runtime=main_runtime)
return (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key, is_vision, pool_hint)
return (provider, async_mode, base_url or "", api_key or "", api_mode or "", runtime_key, is_vision, task_key, pool_hint)
def _store_cached_client(cache_key: tuple, client: Any, default_model: Optional[str], *, bound_loop: Any = None) -> None:
@ -4554,6 +4672,7 @@ def _get_cached_client(
api_mode: str = None,
main_runtime: Optional[Dict[str, Any]] = None,
is_vision: bool = False,
task: Optional[str] = None,
) -> Tuple[Optional[Any], Optional[str]]:
"""Get or create a cached client for the given provider.
@ -4591,6 +4710,7 @@ def _get_cached_client(
api_mode=api_mode,
main_runtime=main_runtime,
is_vision=is_vision,
task=task,
)
with _client_cache_lock:
if cache_key in _client_cache:
@ -4635,6 +4755,7 @@ def _get_cached_client(
api_mode=api_mode,
main_runtime=runtime,
is_vision=is_vision,
task=task,
)
if client is not None:
# For async clients, remember which loop they were created on so we
@ -5140,7 +5261,7 @@ def call_llm(
if not resolved_base_url:
logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
task or "call", resolved_provider)
client, final_model = _get_cached_client("auto", main_runtime=main_runtime)
client, final_model = _get_cached_client("auto", main_runtime=main_runtime, task=task)
if client is None:
raise RuntimeError(
f"No LLM provider configured for task={task} provider={resolved_provider}. "
@ -5466,14 +5587,19 @@ def call_llm(
# Fallback order (#26882, #26803):
# 1. User-configured fallback_chain (per-task) if set
# 2. Main agent model (last-resort safety net)
# For auto users (no explicit aux provider), use the full
# auto-detection chain instead — its Step 1 IS the main agent
# model, so users on `auto` already get main-model fallback.
# 2. For auto: top-level main fallback_providers/fallback_model
# 3. For auto: built-in auxiliary discovery chain
# 4. For explicit aux providers: main agent model safety net
fb_client, fb_model, fb_label = (None, None, "")
if is_auto:
fb_client, fb_model, fb_label = _try_payment_fallback(
resolved_provider, task, reason=reason)
fb_client, fb_model, fb_label = _try_configured_fallback_chain(
task, resolved_provider or "auto", reason=reason)
if fb_client is None:
fb_client, fb_model, fb_label = _try_main_fallback_chain(
task, resolved_provider or "auto", reason=reason)
if fb_client is None:
fb_client, fb_model, fb_label = _try_payment_fallback(
resolved_provider, task, reason=reason)
else:
fb_client, fb_model, fb_label = _try_configured_fallback_chain(
task, resolved_provider or "auto", reason=reason)
@ -5636,7 +5762,7 @@ async def async_call_llm(
if not resolved_base_url:
logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
task or "call", resolved_provider)
client, final_model = _get_cached_client("auto", async_mode=True)
client, final_model = _get_cached_client("auto", async_mode=True, main_runtime=main_runtime, task=task)
if client is None:
raise RuntimeError(
f"No LLM provider configured for task={task} provider={resolved_provider}. "
@ -5904,13 +6030,19 @@ async def async_call_llm(
# Fallback order (#26882, #26803):
# 1. User-configured fallback_chain (per-task) if set
# 2. Main agent model (last-resort safety net)
# Auto users get the full auto-detection chain instead — its
# Step 1 IS the main agent model.
# 2. For auto: top-level main fallback_providers/fallback_model
# 3. For auto: built-in auxiliary discovery chain
# 4. For explicit aux providers: main agent model safety net
fb_client, fb_model, fb_label = (None, None, "")
if is_auto:
fb_client, fb_model, fb_label = _try_payment_fallback(
resolved_provider, task, reason=reason)
fb_client, fb_model, fb_label = _try_configured_fallback_chain(
task, resolved_provider or "auto", reason=reason)
if fb_client is None:
fb_client, fb_model, fb_label = _try_main_fallback_chain(
task, resolved_provider or "auto", reason=reason)
if fb_client is None:
fb_client, fb_model, fb_label = _try_payment_fallback(
resolved_provider, task, reason=reason)
else:
fb_client, fb_model, fb_label = _try_configured_fallback_chain(
task, resolved_provider or "auto", reason=reason)

View file

@ -1653,6 +1653,37 @@ class TestAuxiliaryFallbackLayering:
exc.status_code = 402
return exc
def test_auto_provider_uses_task_then_main_chain_before_builtin_chain(self, monkeypatch):
    """A failing auto aux call consults the per-task chain, then the main chain.

    The primary client raises a payment error, the per-task chain is patched
    to yield nothing and the main chain is patched to yield a working client,
    so the built-in payment fallback must never be reached.
    """
    failing_primary = MagicMock()
    failing_primary.chat.completions.create.side_effect = self._make_payment_err()

    fallback_reply = MagicMock(choices=[
        MagicMock(message=MagicMock(content="from main fallback chain"))
    ])
    main_chain_client = MagicMock()
    main_chain_client.chat.completions.create.return_value = fallback_reply

    with patch(
        "agent.auxiliary_client._get_cached_client",
        return_value=(failing_primary, "qwen/qwen3.5-122b-a10b"),
    ), patch(
        "agent.auxiliary_client._resolve_task_provider_model",
        return_value=("auto", None, None, None, None),
    ), patch(
        "agent.auxiliary_client._try_configured_fallback_chain",
        return_value=(None, None, ""),
    ) as task_chain_mock, patch(
        "agent.auxiliary_client._try_main_fallback_chain",
        return_value=(main_chain_client, "inclusionai/ring-2.6-1t:free", "openrouter"),
    ) as main_chain_mock, patch(
        "agent.auxiliary_client._try_payment_fallback",
    ) as builtin_chain_mock:
        call_llm(
            task="title_generation",
            messages=[{"role": "user", "content": "hello"}],
        )

    # The request was ultimately served by the main-chain client.
    assert main_chain_client.chat.completions.create.called
    # Both configured chains were asked exactly once, with the same arguments.
    task_chain_mock.assert_called_once_with(
        "title_generation", "auto", reason="payment error")
    main_chain_mock.assert_called_once_with(
        "title_generation", "auto", reason="payment error")
    # The built-in chain is never touched once a configured chain succeeds.
    builtin_chain_mock.assert_not_called()
def test_explicit_provider_uses_configured_chain_first(self, monkeypatch, caplog):
"""When a user has fallback_chain configured, it's tried BEFORE the main agent model."""
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")

View file

@ -118,6 +118,64 @@ class TestResolveAutoMainFirst:
assert client is chain_client
assert model == "google/gemini-3-flash-preview"
def test_main_unavailable_uses_task_fallback_chain_before_builtin_chain(self):
    """With no main client, ``auxiliary.<task>.fallback_chain`` is used before built-ins.

    ``resolve_provider_client`` is patched to return ``(None, None)`` so the
    main provider produces no client; the per-task chain then supplies one and
    neither the main chain nor OpenRouter discovery may be consulted.
    """
    per_task_client = MagicMock()

    with patch("agent.auxiliary_client._read_main_provider", return_value="nvidia"), \
            patch("agent.auxiliary_client._read_main_model",
                  return_value="qwen/qwen3.5-122b-a10b"), \
            patch("agent.auxiliary_client.resolve_provider_client",
                  return_value=(None, None)), \
            patch("agent.auxiliary_client._try_configured_fallback_chain",
                  return_value=(per_task_client, "task-free-model",
                                "fallback_chain[0](openrouter)")) as task_chain_mock, \
            patch("agent.auxiliary_client._try_main_fallback_chain") as main_chain_mock, \
            patch("agent.auxiliary_client._try_openrouter") as openrouter_mock:
        from agent.auxiliary_client import _resolve_auto

        resolved_client, resolved_model = _resolve_auto(task="title_generation")

    assert resolved_client is per_task_client
    assert resolved_model == "task-free-model"
    task_chain_mock.assert_called_once_with(
        "title_generation", "nvidia", reason="main provider unavailable")
    main_chain_mock.assert_not_called()
    openrouter_mock.assert_not_called()
def test_main_unavailable_uses_main_fallback_chain_before_builtin_chain(self):
    """With no main client and no per-task hit, top-level ``fallback_providers`` is used.

    ``resolve_provider_client`` is patched to return ``(None, None)`` and the
    per-task chain to return nothing, so the main fallback chain must supply
    the client and OpenRouter discovery must not be consulted.
    """
    top_level_client = MagicMock()

    with patch("agent.auxiliary_client._read_main_provider", return_value="nvidia"), \
            patch("agent.auxiliary_client._read_main_model",
                  return_value="qwen/qwen3.5-122b-a10b"), \
            patch("agent.auxiliary_client.resolve_provider_client",
                  return_value=(None, None)), \
            patch("agent.auxiliary_client._try_configured_fallback_chain",
                  return_value=(None, None, "")), \
            patch("agent.auxiliary_client._try_main_fallback_chain",
                  return_value=(top_level_client, "inclusionai/ring-2.6-1t:free",
                                "openrouter")) as main_chain_mock, \
            patch("agent.auxiliary_client._try_openrouter") as openrouter_mock:
        from agent.auxiliary_client import _resolve_auto

        resolved_client, resolved_model = _resolve_auto(task="title_generation")

    assert resolved_client is top_level_client
    assert resolved_model == "inclusionai/ring-2.6-1t:free"
    main_chain_mock.assert_called_once_with(
        "title_generation", "nvidia", reason="main provider unavailable")
    openrouter_mock.assert_not_called()
def test_no_main_config_uses_chain_directly(self):
"""No main provider configured → skip step 1, use chain (no regression)."""
chain_client = MagicMock()

View file

@ -176,11 +176,16 @@ class TestClientCacheBoundedGrowth:
"""When the loop changes, the old entry should be replaced, not duplicated."""
from agent.auxiliary_client import (
_client_cache,
_client_cache_key,
_client_cache_lock,
_get_cached_client,
)
key = ("test_replace", True, "", "", "", (), False, "")
key = _client_cache_key(
"test_replace",
async_mode=True,
task="",
)
# Simulate a stale entry from a closed loop
old_loop = asyncio.new_event_loop()

View file

@ -687,7 +687,7 @@ For task-specific direct endpoints, Hermes uses the task's configured API key or
## Fallback Providers (config.yaml only)
The primary model fallback chain is configured exclusively through `config.yaml` — there are no environment variables for it. Add a top-level `fallback_providers` list with `provider` and `model` keys to enable automatic failover when your main model encounters errors.
The primary model fallback chain is configured exclusively through `config.yaml` — there are no environment variables for it. Add a top-level `fallback_providers` list with `provider` and `model` keys to enable automatic failover when your main model encounters errors. Auxiliary tasks whose provider is `auto` also consult this chain before Hermes' built-in auxiliary discovery chain.
```yaml
fallback_providers:
@ -695,7 +695,7 @@ fallback_providers:
model: anthropic/claude-sonnet-4
```
The older top-level `fallback_model` single-provider shape is still read for backward compatibility, but new configuration should use `fallback_providers`.
The older top-level `fallback_model` single-provider shape is still read for backward compatibility, but new configuration should use `fallback_providers`. For task-specific auxiliary policy, use `auxiliary.<task>.fallback_chain` in `config.yaml`; there is no environment variable equivalent.
See [Fallback Providers](/user-guide/features/fallback-providers) for full details.

View file

@ -53,7 +53,7 @@ Click **Show auxiliary** to reveal the 11 task slots:
![Auxiliary panel expanded](/img/docs/dashboard-models/auxiliary-expanded.png)
Every auxiliary task defaults to `auto` — meaning Hermes uses your main model for that job too. Override a specific task when you want a cheaper or faster model for a side-job.
Every auxiliary task defaults to `auto` — meaning Hermes tries your main model for that job too. If that route is unavailable or hits a capacity-style failure, `auto` follows any task-specific `auxiliary.<task>.fallback_chain`, then the main `fallback_providers` / `fallback_model` chain, then Hermes' built-in auxiliary discovery chain. Override a specific task when you want a cheaper or faster model for a side-job.
### Common override patterns
@ -129,7 +129,21 @@ auxiliary:
# ... other fields unchanged
```
`provider: auto` with `model: ''` tells Hermes to use the main model for that task.
`provider: auto` with `model: ''` tells Hermes to use the main model for that task, while still honoring fallback policy if the main route cannot serve the auxiliary call.
Optional task-specific fallback chains live under the same auxiliary task:
```yaml
auxiliary:
title_generation:
provider: auto
model: ''
fallback_chain:
- provider: openrouter
model: inclusionai/ring-2.6-1t:free
```
When `fallback_chain` is absent, or none of its entries resolves to a usable client, `auto` tries the top-level `fallback_providers` chain (including the legacy `fallback_model` entry) before the built-in auxiliary discovery chain.
## When does it take effect?

View file

@ -168,7 +168,7 @@ fallback_providers:
| Messaging gateway (Telegram, Discord, etc.) | ✔ |
| Subagent delegation | ✔ (subagents inherit the parent fallback chain) |
| Cron jobs | ✔ (cron agents inherit configured fallback providers) |
| Auxiliary tasks (vision, compression) | ✘ (use their own provider chain — see below) |
| Auxiliary tasks on `provider: auto` | ✔ (try per-task fallback, then the main fallback chain before built-in aux discovery) |
:::tip
There are no environment variables for the primary fallback chain — configure it exclusively through `config.yaml` or `hermes fallback`. This is intentional: fallback configuration is a deliberate choice, not something a stale shell export should override.
@ -195,23 +195,30 @@ Hermes uses separate lightweight models for side tasks. Each task has its own pr
### Auto-Detection Chain
When a task's provider is set to `"auto"` (the default), Hermes tries providers in order until one works:
When a task's provider is set to `"auto"` (the default), Hermes first tries the main provider and main model for that auxiliary task. If that route is unavailable, or fails at call time with a capacity-style error (such as a payment error), Hermes honors the user-configured fallback policy before using the built-in discovery chain:
**For text tasks (compression, web extract, etc.):**
```text
Main provider + main model → auxiliary.<task>.fallback_chain →
fallback_providers / fallback_model → built-in auxiliary discovery chain
```
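The task-specific chain is the most precise policy and is tried first when present. The top-level `fallback_providers` chain is the same policy the main agent uses, so free-only or same-provider fallback rules apply to auxiliary tasks on `auto` as well. When the top-level chain is consulted for an auxiliary task, entries whose provider matches the main provider or the provider that just failed are skipped, as are providers currently marked unhealthy.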
**Built-in text discovery chain (compression, web extract, title generation, etc.):**
```text
OpenRouter → Nous Portal → Custom endpoint → Codex OAuth →
API-key providers (z.ai, Kimi, MiniMax, Xiaomi MiMo, Hugging Face, Anthropic) → give up
```
**For vision tasks:**
**Built-in vision discovery chain:**
```text
Main provider (if vision-capable) → OpenRouter → Nous Portal →
Codex OAuth → Anthropic → Custom endpoint → give up
```
If the resolved provider fails at call time, Hermes also has an internal retry: if the provider is not OpenRouter and no explicit `base_url` is set, it tries OpenRouter as a last-resort fallback.
Those built-in chains are a convenience fallback for users who have not declared a task-specific or main fallback policy. They are also the last resort when every entry in the configured chains fails to produce a client.
### Configuring Auxiliary Providers
@ -232,6 +239,9 @@ auxiliary:
compression:
provider: "auto"
model: ""
fallback_chain: # optional, task-specific fallback policy
- provider: openrouter
model: inclusionai/ring-2.6-1t:free
skills_hub:
provider: "auto"
@ -242,7 +252,9 @@ auxiliary:
model: ""
```
Every task above follows the same **provider / model / base_url** pattern. Context compression is configured under `auxiliary.compression`:
Every task above follows the same **provider / model / base_url** pattern. Each task can also declare its own `fallback_chain`; if omitted, `provider: auto` uses the top-level `fallback_providers` chain before Hermes' built-in auxiliary discovery chain.
Context compression is configured under `auxiliary.compression`:
```yaml
auxiliary: