diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py index 680125c7611..9728f9cfff0 100644 --- a/agent/chat_completion_helpers.py +++ b/agent/chat_completion_helpers.py @@ -39,6 +39,17 @@ from utils import base_url_host_matches, base_url_hostname, env_float, env_int logger = logging.getLogger(__name__) _OPENROUTER_PROVIDER_SORT_VALUES = {"throughput", "latency", "price"} +# When the fallback chain is fully exhausted on a non-rate-limit failure +# (e.g. every provider returns a non-retryable client error like HTTP 400), +# arm a short cooldown so the NEXT turn's restore_primary_runtime stays gated +# and does not reset _fallback_index=0 to replay the entire chain again. +# Without this, a client/gateway that re-submits immediately would re-marshal +# the full (potentially 80k-token) context once per provider every turn and +# can drive a constrained host into memory/swap exhaustion. Rate-limit / +# billing reasons keep their own 60s cooldown (set above); this is the +# narrower non-rate-limit case. See issue #24996. +_FALLBACK_EXHAUSTED_COOLDOWN_S = 5.0 + def _ra(): """Lazy ``run_agent`` reference. @@ -1117,8 +1128,22 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool if (not fallback_already_active) or (primary_provider and current_provider == primary_provider): agent._rate_limited_until = time.monotonic() + 60 if agent._fallback_index >= len(agent._fallback_chain): + # Chain exhausted. If we actually walked a non-empty chain and the + # failure was NOT a rate-limit/billing event (those already armed + # their own 60s cooldown above), arm a short cooldown so the next + # turn's restore_primary_runtime stays gated instead of resetting + # _fallback_index=0 and re-marshaling the whole context across every + # provider again. Guards the cross-turn replay storm in #24996. + if ( + len(agent._fallback_chain) > 0 + and reason not in {FailoverReason.rate_limit, FailoverReason.billing} + ): + _existing_cooldown = getattr(agent, "_rate_limited_until", 0) or 0 + agent._rate_limited_until = max( + _existing_cooldown, + time.monotonic() + _FALLBACK_EXHAUSTED_COOLDOWN_S, + ) return False - fb = agent._fallback_chain[agent._fallback_index] agent._fallback_index += 1 fb_provider = (fb.get("provider") or "").strip().lower() diff --git a/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py new file mode 100644 index 00000000000..8d82a2af679 --- /dev/null +++ b/tests/run_agent/test_24996_fallback_exhaustion_cooldown.py @@ -0,0 +1,116 @@ +"""Regression tests for #24996 — fallback-switch storm on host memory. + +When every provider in the fallback chain fails non-retryably back-to-back +(e.g. HTTP 400/402/429 across distinct providers), the within-turn walk is +bounded (``_fallback_index`` advances monotonically and the loop aborts when +the chain exhausts). The damaging mode is *cross-turn*: ``restore_primary_ +runtime`` resets ``_fallback_index = 0`` every turn, so a client that +re-submits immediately replays the entire chain — re-marshaling the full +(potentially 80k-token) context once per provider every turn — with no +throttle on the non-rate-limit path. + +The fix arms a short cooldown via the existing ``_rate_limited_until`` gate +when the chain exhausts on a non-rate-limit failure, so the next turn's +restore stays gated (and does NOT reset the index) until the cooldown clears. +Rate-limit / billing failures keep their own 60s cooldown and are unaffected. +""" + +import time +from unittest.mock import MagicMock, patch + +from run_agent import AIAgent +from agent.error_classifier import FailoverReason +from agent.chat_completion_helpers import _FALLBACK_EXHAUSTED_COOLDOWN_S + + +def _make_agent(fallback_model=None): + with ( + patch("run_agent.get_tool_definitions", return_value=[]), + patch("run_agent.check_toolset_requirements", return_value={}), + patch("run_agent.OpenAI"), + ): + agent = AIAgent( + api_key="test-key", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + fallback_model=fallback_model, + ) + agent.client = MagicMock() + return agent + + +def _mock_client(base_url="https://openrouter.ai/api/v1", api_key="fb-key"): + mock = MagicMock() + mock.base_url = base_url + mock.api_key = api_key + return mock + + +class TestExhaustionArmsCooldown: + def test_non_retryable_exhaustion_arms_cooldown(self): + """Walking a non-empty chain to exhaustion on a non-rate-limit + failure arms a short ``_rate_limited_until`` cooldown.""" + fbs = [ + {"provider": "openai", "model": "gpt-4o"}, + {"provider": "zai", "model": "glm-4.7"}, + ] + agent = _make_agent(fallback_model=fbs) + agent._rate_limited_until = 0 + before = time.monotonic() + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(_mock_client(), "resolved"), + ): + assert agent._try_activate_fallback() is True # -> entry 0 + assert agent._try_activate_fallback() is True # -> entry 1 + # Chain now exhausted; a non-rate-limit failure must arm cooldown. + assert agent._try_activate_fallback() is False + cooldown = getattr(agent, "_rate_limited_until", 0) + assert cooldown > before + # Cooldown is the short exhaustion window, not the 60s rate-limit one. + assert cooldown <= before + _FALLBACK_EXHAUSTED_COOLDOWN_S + 1.0 + + def test_no_chain_does_not_arm_cooldown(self): + """An empty chain (no fallback configured) must not arm a cooldown — + there is no chain to storm, and gating primary restoration would be + pointless punishment.""" + agent = _make_agent(fallback_model=None) + agent._rate_limited_until = 0 + assert agent._try_activate_fallback() is False + assert getattr(agent, "_rate_limited_until", 0) == 0 + + def test_rate_limit_exhaustion_keeps_60s_cooldown(self): + """A rate-limit failure already arms its own 60s cooldown; the short + exhaustion window must not shrink it.""" + fbs = [{"provider": "openai", "model": "gpt-4o"}] + agent = _make_agent(fallback_model=fbs) + agent._rate_limited_until = 0 + before = time.monotonic() + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(_mock_client(), "resolved"), + ): + # First activation with rate_limit reason arms the 60s cooldown. + assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is True + # Chain exhausted on the next call (also rate_limit) -> still False, + # and the 60s cooldown must survive (max(), not overwritten down). + assert agent._try_activate_fallback(reason=FailoverReason.rate_limit) is False + cooldown = getattr(agent, "_rate_limited_until", 0) + assert cooldown > before + 50 # ~60s, far past the short window + + def test_cooldown_never_shrinks_existing_window(self): + """If a longer cooldown is already armed, exhaustion must not reduce + it (we take the max).""" + fbs = [{"provider": "openai", "model": "gpt-4o"}] + agent = _make_agent(fallback_model=fbs) + far_future = time.monotonic() + 999 + agent._rate_limited_until = far_future + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(_mock_client(), "resolved"), + ): + assert agent._try_activate_fallback() is True + assert agent._try_activate_fallback() is False + assert getattr(agent, "_rate_limited_until", 0) >= far_future