fix(agent): reset _fallback_index at turn start even when no fallback activated

In long-lived interactive sessions, _try_activate_fallback() advances
_fallback_index before attempting client resolution.  When resolution
fails (provider not configured, etc.) the function returns False without
ever setting _fallback_activated=True.  _restore_primary_runtime() then
skips its reset block entirely (guarded by `if not _fallback_activated`),
leaving _fallback_index >= len(_fallback_chain) for all subsequent turns.
The eager-fallback guard at the top of the retry loop checks
`_fallback_index < len(_fallback_chain)`, so the condition fails silently
and no fallback is ever attempted again for that session.

Cron jobs spawn a fresh AIAgent per run and never hit this path, which is
why the same fallback chain works reliably for cron jobs but not for
interactive sessions.

Fix: reset _fallback_index=0 in the `not _fallback_activated` early-return
branch so every new turn starts with the full chain available.

Fixes #20465
This commit is contained in:
konsisumer 2026-05-06 17:21:47 +02:00 committed by Teknium
parent 2b193907d6
commit 33528b428d
2 changed files with 28 additions and 0 deletions

View file

@ -9223,6 +9223,14 @@ class AIAgent:
``gateway/run.py``), so this restoration IS needed there too.
"""
if not self._fallback_activated:
# Reset the chain index even when no fallback was activated this
# turn. Without this, a turn where _try_activate_fallback() was
# called but returned False (chain exhausted or provider not
# configured) leaves _fallback_index >= len(_fallback_chain) while
# _fallback_activated stays False. Every later turn would then take this
# early return without reaching the reset logic below, stranding the
# index and silently blocking all future fallback attempts for the
# session. Fixes #20465.
self._fallback_index = 0
return False
if getattr(self, "_rate_limited_until", 0) > time.monotonic():

View file

@ -123,6 +123,26 @@ class TestRestorePrimaryRuntime:
assert agent._fallback_activated is False
assert agent._restore_primary_runtime() is False
def test_resets_index_when_fallback_not_activated(self):
"""Regression test for #20465: a failed activation must not strand the index.

A failed _try_activate_fallback() call leaves _fallback_index advanced while
_fallback_activated stays False.  The following _restore_primary_runtime()
call must still reset the index to 0, even though it returns False.
"""
fbs = [{"provider": "custom", "model": "gpt-oss:20b",
"base_url": "http://host.docker.internal:11434/v1", "api_key": "ollama"}]
agent = _make_agent(fallback_model=fbs)
# resolve_provider_client is patched to return (None, None); the assertions
# below confirm that _try_activate_fallback() then returns False while
# _fallback_index has already been incremented to 1.
with patch("agent.auxiliary_client.resolve_provider_client", return_value=(None, None)):
assert agent._try_activate_fallback() is False
assert agent._fallback_activated is False
assert agent._fallback_index == 1 # advanced past the only entry
# _restore_primary_runtime must reset the index so the next turn can retry
result = agent._restore_primary_runtime()
assert result is False # still reports no restoration (fallback never activated)
assert agent._fallback_index == 0 # chain available again
def test_restores_model_and_provider(self):
agent = _make_agent(
fallback_model={"provider": "openrouter", "model": "anthropic/claude-sonnet-4"},