fix(agent): throttle cross-turn fallback-switch replay storm (#24996) (#53909)

When every provider in the fallback chain fails non-retryably back-to-back (e.g. HTTP 400/402/429 across distinct providers), the within-turn walk is already bounded — _fallback_index advances monotonically and the loop aborts when the chain exhausts. The damaging mode is cross-turn: restore_primary_ runtime resets _fallback_index=0 every turn, so a client that re-submits immediately replays the entire chain, re-marshaling the full (potentially 80k-token) context once per provider every turn with no throttle on the non-rate-limit path. On constrained hosts this exhausts memory/swap. Rate-limit/billing failures already arm a 60s cooldown via _rate_limited_until; the gap was the non-rate-limit case. Now, when the chain exhausts on a non- rate-limit failure with a non-empty chain, arm a short (5s) cooldown on the same _rate_limited_until gate (max(), never shrinking an existing window). The next turn's restore stays gated and does NOT reset the index, so the chain isn't replayed until the cooldown clears. No new state, no thread sleep, no false-trip on legitimately long chains (those walk normally within a turn). Tests: tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
2026-07-01 12:02:05 +00:00 · 2026-06-27 19:15:40 -07:00 · 2026-06-27 19:15:40 -07:00 · fae920642a
commit fae920642a
parent 1dde7e2f2a
2 changed files with 142 additions and 1 deletions
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -39,6 +39,17 @@ from utils import base_url_host_matches, base_url_hostname, env_float, env_int
 logger = logging.getLogger(__name__)
 _OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"}

+# When the fallback chain is fully exhausted on a non-rate-limit failure
+# (e.g. every provider returns a non-retryable client error like HTTP 400),
+# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated
+# and does not reset _fallback_index=0 to replay the entire chain again.
+# Without this, a client/gateway that re-submits immediately would re-marshal
+# the full (potentially 80k-token) context once per provider every turn and
+# can drive a constrained host into memory/swap exhaustion.  Rate-limit /
+# billing reasons keep their own 60s cooldown (set above); this is the
+# narrower non-rate-limit case.  See issue #24996.
+_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0
+

 def _ra():
    """Lazy ``run_agent`` reference.
@ -1117,8 +1128,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
        if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
            agent._rate_limited_until = time.monotonic() + 60
    if agent._fallback_index >= len(agent._fallback_chain):
+        # Chain exhausted.  If we actually walked a non-empty chain and the
+        # failure was NOT a rate-limit/billing event (those already armed
+        # their own 60s cooldown above), arm a short cooldown so the next
+        # turn's restore_primary_runtime stays gated instead of resetting
+        # _fallback_index=0 and re-marshaling the whole context across every
+        # provider again.  Guards the cross-turn replay storm in #24996.
+        if (
+            len(agent._fallback_chain) > 0
+            and reason not in {FailoverReason.rate_limit, FailoverReason.billing}
+        ):
+            _existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0
+            agent._rate_limited_until = max(
+                _existing_cooldown,
+                time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S,
+            )
        return False
-
    fb = agent._fallback_chain[agent._fallback_index]
    agent._fallback_index += 1
    fb_provider = (fb.get("provider") or "").strip().lower()
--- a/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
+++ b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
@ -0,0 +1,116 @@
+"""Regression tests for #24996 — fallback-switch storm on host memory.
+
+When every provider in the fallback chain fails non-retryably back-to-back
+(e.g. HTTP 400/402/429 across distinct providers), the within-turn walk is
+bounded (``_fallback_index`` advances monotonically and the loop aborts when
+the chain exhausts).  The damaging mode is *cross-turn*: ``restore_primary_
+runtime`` resets ``_fallback_index = 0`` every turn, so a client that
+re-submits immediately replays the entire chain — re-marshaling the full
+(potentially 80k-token) context once per provider every turn — with no
+throttle on the non-rate-limit path.
+
+The fix arms a short cooldown via the existing ``_rate_limited_until`` gate
+when the chain exhausts on a non-rate-limit failure, so the next turn's
+restore stays gated (and does NOT reset the index) until the cooldown clears.
+Rate-limit / billing failures keep their own 60s cooldown and are unaffected.
+"""
+
+import time
+from unittest.mock import MagicMock, patch
+
+from run_agent import AIAgent
+from agent.error_classifier import FailoverReason
+from agent.chat_completion_helpers import _FALLBACK_EXHAUSTED_COOLDOWN_S
+
+
+def _make_agent(fallback_model=None):
+    with (
+        patch("run_agent.get_tool_definitions", return_value=[]),
+        patch("run_agent.check_toolset_requirements", return_value={}),
+        patch("run_agent.OpenAI"),
+    ):
+        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
+            quiet_mode=True,
+            skip_context_files=True,
+            skip_memory=True,
+            fallback_model=fallback_model,
+        )
+        agent.client = MagicMock()
+        return agent
+
+
+def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
+    mock = MagicMock()
+    mock.base_url = base_url
+    mock.api_key = api_key
+    return mock
+
+
+class TestExhaustionArmsCooldown:
+    def test_non_retryable_exhaustion_arms_cooldown(self):
+        """Walking a non-empty chain to exhaustion on a non-rate-limit
+        failure arms a short ``_rate_limited_until`` cooldown."""
+        fbs = [
+            {"provider": "openai", "model": "gpt-4o"},
+            {"provider": "zai", "model": "glm-4.7"},
+        ]
+        agent = _make_agent(fallback_model=fbs)
+        agent._rate_limited_until = 0
+        before = time.monotonic()
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(_mock_client(), "resolved"),
+        ):
+            assert agent._try_activate_fallback() is True   # -> entry 0
+            assert agent._try_activate_fallback() is True   # -> entry 1
+            # Chain now exhausted; a non-rate-limit failure must arm cooldown.
+            assert agent._try_activate_fallback() is False
+        cooldown = getattr(agent, "_rate_limited_until", 0)
+        assert cooldown > before
+        # Cooldown is the short exhaustion window, not the 60s rate-limit one.
+        assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 1.0
+
+    def test_no_chain_does_not_arm_cooldown(self):
+        """An empty chain (no fallback configured) must not arm a cooldown —
+        there is no chain to storm, and gating primary restoration would be
+        pointless punishment."""
+        agent = _make_agent(fallback_model=None)
+        agent._rate_limited_until = 0
+        assert agent._try_activate_fallback() is False
+        assert getattr(agent, "_rate_limited_until", 0) == 0
+
+    def test_rate_limit_exhaustion_keeps_60s_cooldown(self):
+        """A rate-limit failure already arms its own 60s cooldown; the short
+        exhaustion window must not shrink it."""
+        fbs = [{"provider": "openai", "model": "gpt-4o"}]
+        agent = _make_agent(fallback_model=fbs)
+        agent._rate_limited_until = 0
+        before = time.monotonic()
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(_mock_client(), "resolved"),
+        ):
+            # First activation with rate_limit reason arms the 60s cooldown.
+            assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is True
+            # Chain exhausted on the next call (also rate_limit) -> still False,
+            # and the 60s cooldown must survive (max(), not overwritten down).
+            assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is False
+        cooldown = getattr(agent, "_rate_limited_until", 0)
+        assert cooldown > before + 50  # ~60s, far past the short window
+
+    def test_cooldown_never_shrinks_existing_window(self):
+        """If a longer cooldown is already armed, exhaustion must not reduce
+        it (we take the max)."""
+        fbs = [{"provider": "openai", "model": "gpt-4o"}]
+        agent = _make_agent(fallback_model=fbs)
+        far_future = time.monotonic() + 999
+        agent._rate_limited_until = far_future
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(_mock_client(), "resolved"),
+        ):
+            assert agent._try_activate_fallback() is True
+            assert agent._try_activate_fallback() is False
+        assert getattr(agent, "_rate_limited_until", 0) >= far_future