fix(agent): throttle cross-turn fallback-switch replay storm (#24996) (#53909)

When every provider in the fallback chain fails non-retryably back-to-back
(e.g. HTTP 400/402/429 across distinct providers), the within-turn walk is
already bounded — _fallback_index advances monotonically and the loop aborts
when the chain exhausts. The damaging mode is cross-turn: restore_primary_
runtime resets _fallback_index=0 every turn, so a client that re-submits
immediately replays the entire chain, re-marshaling the full (potentially
80k-token) context once per provider every turn with no throttle on the
non-rate-limit path. On constrained hosts this exhausts memory/swap.

Rate-limit/billing failures already arm a 60s cooldown via _rate_limited_until;
the gap was the non-rate-limit case. Now, when the chain exhausts on a non-
rate-limit failure with a non-empty chain, arm a short (5s) cooldown on the
same _rate_limited_until gate (max(), never shrinking an existing window).
The next turn's restore stays gated and does NOT reset the index, so the
chain isn't replayed until the cooldown clears. No new state, no thread sleep,
no false-trip on legitimately long chains (those walk normally within a turn).

Tests: tests/run_agent/test_24996_fallback_exhaustion_cooldown.py
This commit is contained in:
Teknium 2026-06-27 19:15:40 -07:00 committed by GitHub
parent 1dde7e2f2a
commit fae920642a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 142 additions and 1 deletions

View file

@ -39,6 +39,17 @@ from utils import base_url_host_matches, base_url_hostname, env_float, env_int
logger = logging.getLogger(__name__)
_OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"}
# When the fallback chain is fully exhausted on a non-rate-limit failure
# (e.g. every provider returns a non-retryable client error like HTTP 400),
# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated
# and does not reset _fallback_index=0 to replay the entire chain again.
# Without this, a client/gateway that re-submits immediately would re-marshal
# the full (potentially 80k-token) context once per provider every turn and
# can drive a constrained host into memory/swap exhaustion. Rate-limit /
# billing reasons keep their own 60s cooldown (set above); this is the
# narrower non-rate-limit case. See issue #24996.
_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0
def _ra():
"""Lazy ``run_agent`` reference.
@ -1117,8 +1128,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
if (not fallback_already_active) or (primary_provider and current_provider == primary_provider):
agent._rate_limited_until = time.monotonic() + 60
if agent._fallback_index >= len(agent._fallback_chain):
# Chain exhausted. If we actually walked a non-empty chain and the
# failure was NOT a rate-limit/billing event (those already armed
# their own 60s cooldown above), arm a short cooldown so the next
# turn's restore_primary_runtime stays gated instead of resetting
# _fallback_index=0 and re-marshaling the whole context across every
# provider again. Guards the cross-turn replay storm in #24996.
if (
len(agent._fallback_chain) > 0
and reason not in {FailoverReason.rate_limit, FailoverReason.billing}
):
_existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0
agent._rate_limited_until = max(
_existing_cooldown,
time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S,
)
return False
fb = agent._fallback_chain[agent._fallback_index]
agent._fallback_index += 1
fb_provider = (fb.get("provider") or "").strip().lower()

View file

@ -0,0 +1,116 @@
"""Regression tests for #24996 — fallback-switch storm on host memory.
When every provider in the fallback chain fails non-retryably back-to-back
(e.g. HTTP 400/402/429 across distinct providers), the within-turn walk is
bounded (``_fallback_index`` advances monotonically and the loop aborts when
the chain exhausts). The damaging mode is *cross-turn*: ``restore_primary_
runtime`` resets ``_fallback_index = 0`` every turn, so a client that
re-submits immediately replays the entire chain re-marshaling the full
(potentially 80k-token) context once per provider every turn with no
throttle on the non-rate-limit path.
The fix arms a short cooldown via the existing ``_rate_limited_until`` gate
when the chain exhausts on a non-rate-limit failure, so the next turn's
restore stays gated (and does NOT reset the index) until the cooldown clears.
Rate-limit / billing failures keep their own 60s cooldown and are unaffected.
"""
import time
from unittest.mock import MagicMock, patch
from run_agent import AIAgent
from agent.error_classifier import FailoverReason
from agent.chat_completion_helpers import _FALLBACK_EXHAUSTED_COOLDOWN_S
def _make_agent(fallback_model=None):
with (
patch("run_agent.get_tool_definitions", return_value=[]),
patch("run_agent.check_toolset_requirements", return_value={}),
patch("run_agent.OpenAI"),
):
agent = AIAgent(
api_key="test-key",
base_url="https://openrouter.ai/api/v1",
quiet_mode=True,
skip_context_files=True,
skip_memory=True,
fallback_model=fallback_model,
)
agent.client = MagicMock()
return agent
def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"):
mock = MagicMock()
mock.base_url = base_url
mock.api_key = api_key
return mock
class TestExhaustionArmsCooldown:
def test_non_retryable_exhaustion_arms_cooldown(self):
"""Walking a non-empty chain to exhaustion on a non-rate-limit
failure arms a short ``_rate_limited_until`` cooldown."""
fbs = [
{"provider": "openai", "model": "gpt-4o"},
{"provider": "zai", "model": "glm-4.7"},
]
agent = _make_agent(fallback_model=fbs)
agent._rate_limited_until = 0
before = time.monotonic()
with patch(
"agent.auxiliary_client.resolve_provider_client",
return_value=(_mock_client(), "resolved"),
):
assert agent._try_activate_fallback() is True # -> entry 0
assert agent._try_activate_fallback() is True # -> entry 1
# Chain now exhausted; a non-rate-limit failure must arm cooldown.
assert agent._try_activate_fallback() is False
cooldown = getattr(agent, "_rate_limited_until", 0)
assert cooldown > before
# Cooldown is the short exhaustion window, not the 60s rate-limit one.
assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 1.0
def test_no_chain_does_not_arm_cooldown(self):
"""An empty chain (no fallback configured) must not arm a cooldown —
there is no chain to storm, and gating primary restoration would be
pointless punishment."""
agent = _make_agent(fallback_model=None)
agent._rate_limited_until = 0
assert agent._try_activate_fallback() is False
assert getattr(agent, "_rate_limited_until", 0) == 0
def test_rate_limit_exhaustion_keeps_60s_cooldown(self):
"""A rate-limit failure already arms its own 60s cooldown; the short
exhaustion window must not shrink it."""
fbs = [{"provider": "openai", "model": "gpt-4o"}]
agent = _make_agent(fallback_model=fbs)
agent._rate_limited_until = 0
before = time.monotonic()
with patch(
"agent.auxiliary_client.resolve_provider_client",
return_value=(_mock_client(), "resolved"),
):
# First activation with rate_limit reason arms the 60s cooldown.
assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is True
# Chain exhausted on the next call (also rate_limit) -> still False,
# and the 60s cooldown must survive (max(), not overwritten down).
assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is False
cooldown = getattr(agent, "_rate_limited_until", 0)
assert cooldown > before + 50 # ~60s, far past the short window
def test_cooldown_never_shrinks_existing_window(self):
"""If a longer cooldown is already armed, exhaustion must not reduce
it (we take the max)."""
fbs = [{"provider": "openai", "model": "gpt-4o"}]
agent = _make_agent(fallback_model=fbs)
far_future = time.monotonic() + 999
agent._rate_limited_until = far_future
with patch(
"agent.auxiliary_client.resolve_provider_client",
return_value=(_mock_client(), "resolved"),
):
assert agent._try_activate_fallback() is True
assert agent._try_activate_fallback() is False
assert getattr(agent, "_rate_limited_until", 0) >= far_future