diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py
index 680125c7611..9728f9cfff0 100644
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -39,6 +39,17 @@ from utils import base_url_host_matches, base_url_hostname, env_float, env_int
 logger = logging.getLogger(__name__)
 _OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"}
 
+# When the fallback chain is fully exhausted on a non-rate-limit failure
+# (e.g. every provider returns a non-retryable client error like HTTP 400),
+# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated
+# and does not reset _fallback_index=0 to replay the entire chain again.
+# Without this, a client/gateway that re-submits immediately would re-marshal
+# the full (potentially 80k-token) context once per provider every turn and
+# can drive a constrained host into memory/swap exhaustion.  Rate-limit /
+# billing reasons keep their own 60s cooldown (set above); this is the
+# narrower non-rate-limit case.  See issue #24996.
+_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0
+
 
 def _ra():
     """Lazy ``run_agent`` reference.
@@ -1117,8 +1128,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
         if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
             agent._rate_limited_until = time.monotonic() + 60
     if agent._fallback_index >= len(agent._fallback_chain):
+        # Chain exhausted.  If we actually walked a non-empty chain and the
+        # failure was NOT a rate-limit/billing event (those already armed
+        # their own 60s cooldown above), arm a short cooldown so the next
+        # turn's restore_primary_runtime stays gated instead of resetting
+        # _fallback_index=0 and re-marshaling the whole context across every
+        # provider again.  Guards the cross-turn replay storm in #24996.
+        if (
+            len(agent._fallback_chain) > 0
+            and reason not in {FailoverReason.rate_limit, FailoverReason.billing}
+        ):
+            _existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0
+            agent._rate_limited_until = max(
+                _existing_cooldown,
+                time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S,
+            )
         return False
-
     fb = agent._fallback_chain[agent._fallback_index]
     agent._fallback_index += 1
     fb_provider = (fb.get("provider") or "").strip().lower()
diff --git a/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
new file mode 100644
index 00000000000..8d82a2af679
--- /dev/null
+++ b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
@@ -0,0 +1,116 @@
+"""Regression tests for #24996 — fallback-switch storm on host memory.
+
+When every provider in the fallback chain fails non-retryably back-to-back
+(e.g. HTTP 400/402/429 across distinct providers), the within-turn walk is
+bounded (``_fallback_index`` advances monotonically and the loop aborts when
+the chain exhausts).  The damaging mode is *cross-turn*: ``restore_primary_
+runtime`` resets ``_fallback_index = 0`` every turn, so a client that
+re-submits immediately replays the entire chain — re-marshaling the full
+(potentially 80k-token) context once per provider every turn — with no
+throttle on the non-rate-limit path.
+
+The fix arms a short cooldown via the existing ``_rate_limited_until`` gate
+when the chain exhausts on a non-rate-limit failure, so the next turn's
+restore stays gated (and does NOT reset the index) until the cooldown clears.
+Rate-limit / billing failures keep their own 60s cooldown and are unaffected.
+"""
+
+import time
+from unittest.mock import MagicMock, patch
+
+from run_agent import AIAgent
+from agent.error_classifier import FailoverReason
+from agent.chat_completion_helpers import _FALLBACK_EXHAUSTED_COOLDOWN_S
+
+
+def _make_agent(fallback_model=None):
+    with (
+        patch("run_agent.get_tool_definitions", return_value=[]),
+        patch("run_agent.check_toolset_requirements", return_value={}),
+        patch("run_agent.OpenAI"),
+    ):
+        agent = AIAgent(
+            api_key="test-key",
+            base_url="https://openrouter.ai/api/v1",
+            quiet_mode=True,
+            skip_context_files=True,
+            skip_memory=True,
+            fallback_model=fallback_model,
+        )
+        agent.client = MagicMock()
+        return agent
+
+
+def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
+    mock = MagicMock()
+    mock.base_url = base_url
+    mock.api_key = api_key
+    return mock
+
+
+class TestExhaustionArmsCooldown:
+    def test_non_retryable_exhaustion_arms_cooldown(self):
+        """Walking a non-empty chain to exhaustion on a non-rate-limit
+        failure arms a short ``_rate_limited_until`` cooldown."""
+        fbs = [
+            {"provider": "openai", "model": "gpt-4o"},
+            {"provider": "zai", "model": "glm-4.7"},
+        ]
+        agent = _make_agent(fallback_model=fbs)
+        agent._rate_limited_until = 0
+        before = time.monotonic()
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(_mock_client(), "resolved"),
+        ):
+            assert agent._try_activate_fallback() is True   # -> entry 0
+            assert agent._try_activate_fallback() is True   # -> entry 1
+            # Chain now exhausted; a non-rate-limit failure must arm cooldown.
+            assert agent._try_activate_fallback() is False
+        cooldown = getattr(agent, "_rate_limited_until", 0)
+        assert cooldown > before
+        # Cooldown is the short exhaustion window, not the 60s rate-limit one.
+        assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 1.0
+
+    def test_no_chain_does_not_arm_cooldown(self):
+        """An empty chain (no fallback configured) must not arm a cooldown —
+        there is no chain to storm, and gating primary restoration would be
+        pointless punishment."""
+        agent = _make_agent(fallback_model=None)
+        agent._rate_limited_until = 0
+        assert agent._try_activate_fallback() is False
+        assert getattr(agent, "_rate_limited_until", 0) == 0
+
+    def test_rate_limit_exhaustion_keeps_60s_cooldown(self):
+        """A rate-limit failure already arms its own 60s cooldown; the short
+        exhaustion window must not shrink it."""
+        fbs = [{"provider": "openai", "model": "gpt-4o"}]
+        agent = _make_agent(fallback_model=fbs)
+        agent._rate_limited_until = 0
+        before = time.monotonic()
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(_mock_client(), "resolved"),
+        ):
+            # First activation with rate_limit reason arms the 60s cooldown.
+            assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is True
+            # Chain exhausted on the next call (also rate_limit) -> still False,
+            # and the 60s cooldown must survive (max(), not overwritten down).
+            assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is False
+        cooldown = getattr(agent, "_rate_limited_until", 0)
+        assert cooldown > before + 50  # ~60s, far past the short window
+
+    def test_cooldown_never_shrinks_existing_window(self):
+        """If a longer cooldown is already armed, exhaustion must not reduce
+        it (we take the max)."""
+        fbs = [{"provider": "openai", "model": "gpt-4o"}]
+        agent = _make_agent(fallback_model=fbs)
+        far_future = time.monotonic() + 999
+        agent._rate_limited_until = far_future
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(_mock_client(), "resolved"),
+        ):
+            assert agent._try_activate_fallback() is True
+            assert agent._try_activate_fallback() is False
+        assert getattr(agent, "_rate_limited_until", 0) >= far_future