From e7c0d6ee5371dab9eb8b54af60ea88f8455353b8 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 9 May 2026 12:48:19 -0700 Subject: [PATCH] fix(fallback): skip chain entries matching current provider/model/base_url (#22780) _try_activate_fallback() walked the chain by index without comparing the candidate entry against the currently-failing backend. So a misconfigured chain that listed the same provider+model as the primary, or two custom_providers entries pointing at the same shim URL, would loop the same failure 3x for the same backend. After the fix, advance() skips: - entries where (provider, model) match the current agent's - entries with a base_url + model matching the current backend (catches two custom_providers names pointing at the same shim) Recursing through self._try_activate_fallback() continues to the next chain entry; if everything matches, returns False and the caller moves on without retrying the same broken path. 3 regression tests covering same-provider-same-model skip, same-base_url- same-model skip, and the all-self-matching-returns-False exhaustion path. Closes #22548 (the Hermes-side portion). The 120s timeout itself in the downstream claude-cli shim is a deployment concern documented in that issue's wherewolf87 comment. --- run_agent.py | 26 +++++++ tests/run_agent/test_provider_fallback.py | 85 +++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/run_agent.py b/run_agent.py index 6fe17d8a7fa..5bc644e45c8 100644 --- a/run_agent.py +++ b/run_agent.py @@ -8042,6 +8042,32 @@ class AIAgent: if not fb_provider or not fb_model: return self._try_activate_fallback() # skip invalid, try next + # Skip entries that resolve to the current (provider, model) — falling + # back to the same backend that just failed loops the failure. Compare + # base_url too so two distinct custom_providers entries pointing at the + # same shim/proxy URL also dedup. See issue #22548. + current_provider = (getattr(self, "provider", "") or "").strip().lower() + current_model = (getattr(self, "model", "") or "").strip() + current_base_url = str(getattr(self, "base_url", "") or "").rstrip("/").lower() + fb_base_url_for_dedup = (fb.get("base_url") or "").strip().rstrip("/").lower() + if fb_provider == current_provider and fb_model == current_model: + logging.warning( + "Fallback skip: chain entry %s/%s matches current provider/model", + fb_provider, fb_model, + ) + return self._try_activate_fallback() + if ( + fb_base_url_for_dedup + and current_base_url + and fb_base_url_for_dedup == current_base_url + and fb_model == current_model + ): + logging.warning( + "Fallback skip: chain entry base_url %s matches current backend", + fb_base_url_for_dedup, + ) + return self._try_activate_fallback() + # Use centralized router for client construction. # raw_codex=True because the main agent needs direct responses.stream() # access for Codex providers. diff --git a/tests/run_agent/test_provider_fallback.py b/tests/run_agent/test_provider_fallback.py index 44de0846f4d..b179cc341cc 100644 --- a/tests/run_agent/test_provider_fallback.py +++ b/tests/run_agent/test_provider_fallback.py @@ -220,3 +220,88 @@ class TestPoolRotationRoom: def test_many_credentials_available_returns_true(self): assert _pool_may_recover_from_rate_limit(_pool(10)) is True + + +# ── Skip-self dedup (#22548) ─────────────────────────────────────────────── + + +class TestFallbackChainDedup: + """A fallback chain entry that resolves to the current provider/model + (or the same custom-provider base_url) must be skipped, not retried. + Otherwise a misconfigured chain or two custom_providers entries pointing + at the same shim loop the same failure. See issue #22548.""" + + def test_skips_entry_matching_current_provider_and_model(self): + """Chain has [same-as-current, real-fallback]; activate must skip + the first and use the second.""" + fbs = [ + # First entry == current state. Should be skipped. + {"provider": "openrouter", "model": "z-ai/glm-4.7"}, + # Second entry: real fallback. + {"provider": "zai", "model": "glm-4.7"}, + ] + agent = _make_agent(fallback_model=fbs) + agent.provider = "openrouter" + agent.model = "z-ai/glm-4.7" + agent.base_url = "https://openrouter.ai/api/v1" + + # Stub out resolve_provider_client so we can assert which entry was + # actually used — return a MagicMock client tagged with the provider. + called = [] + def _resolve(provider, model=None, raw_codex=False, **kwargs): + called.append((provider, model)) + return _mock_client(), model + with patch("agent.auxiliary_client.resolve_provider_client", side_effect=_resolve): + with patch("hermes_cli.model_normalize.normalize_model_for_provider", side_effect=lambda m, p: m): + ok = agent._try_activate_fallback() + + assert ok is True + # The first entry was skipped — only the second reached resolve. + assert called == [("zai", "glm-4.7")], ( + f"expected fallback to skip same-state entry, got call order: {called}" + ) + + def test_skips_entry_matching_current_base_url_and_model(self): + """Two custom_providers entries pointing at the same shim URL + with the same model should dedup even if their provider names differ.""" + fbs = [ + # Different provider name but same shim URL + model — same backend. + {"provider": "claude-cli-alt", "model": "claude-opus-4.7", + "base_url": "http://127.0.0.1:7891/v1"}, + # Real different fallback. + {"provider": "openrouter", "model": "anthropic/claude-opus-4.7"}, + ] + agent = _make_agent(fallback_model=fbs) + agent.provider = "claude-cli" + agent.model = "claude-opus-4.7" + agent.base_url = "http://127.0.0.1:7891/v1" + + called = [] + def _resolve(provider, model=None, raw_codex=False, **kwargs): + called.append((provider, model)) + return _mock_client(), model + with patch("agent.auxiliary_client.resolve_provider_client", side_effect=_resolve): + with patch("hermes_cli.model_normalize.normalize_model_for_provider", side_effect=lambda m, p: m): + ok = agent._try_activate_fallback() + + assert ok is True + # Same shim/base_url+model entry skipped, second one used. + assert called == [("openrouter", "anthropic/claude-opus-4.7")], ( + f"expected base_url-aware dedup, got call order: {called}" + ) + + def test_returns_false_when_only_self_matching_entries(self): + """A chain with only self-matching entries exhausts to False.""" + fbs = [ + {"provider": "openrouter", "model": "z-ai/glm-4.7"}, + ] + agent = _make_agent(fallback_model=fbs) + agent.provider = "openrouter" + agent.model = "z-ai/glm-4.7" + agent.base_url = "https://openrouter.ai/api/v1" + + with patch("agent.auxiliary_client.resolve_provider_client") as mock_resolve: + ok = agent._try_activate_fallback() + + assert ok is False + mock_resolve.assert_not_called()