fix: include rate-limit in auxiliary capacity-error fallback gate

Rate-limit (429) errors on auxiliary tasks with an explicitly configured provider failed silently instead of triggering the fallback chain. The is_capacity_error gate checked only payment and connection errors, not rate limits, so when a configured provider such as openai-codex hit its rate limit, auxiliary tasks (kanban_decomposer, vision, web_extract, approval, etc.) had no fallback at all. Add _is_rate_limit_error() to the is_capacity_error check at both call sites (the sync call_llm path and the async async_call_llm path) so that a rate limit triggers fallback regardless of whether the provider was auto-detected or explicitly configured. Fixes #52228.
2026-06-27 11:22:03 +00:00 · 2026-06-25 01:17:40 +00:00 · 2026-06-25 01:17:40 +00:00 · 751adfa6b9
commit 751adfa6b9
parent ff8920299c
2 changed files with 46 additions and 4 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -5669,7 +5669,9 @@ def call_llm(
        is_auto = resolved_provider in {"auto", "", None}
        # Capacity errors bypass the explicit-provider gate: the provider
        # literally cannot serve this request regardless of user intent.
-        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
+        # Rate limits are included: after retries are exhausted, a 429 means
+        # the provider cannot serve this request — fall back. See #52228.
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err) or _is_rate_limit_error(first_err)
        if should_fallback and (is_auto or is_capacity_error):
            if _is_payment_error(first_err):
                reason = "payment error"
@ -6112,11 +6114,13 @@ async def async_call_llm(
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
        )
-        # Capacity errors (payment/quota/connection) bypass the explicit-provider
-        # gate — the provider cannot serve the request regardless of user intent.
+        # Capacity errors (payment/quota/connection/rate-limit) bypass the
+        # explicit-provider gate — the provider cannot serve the request
+        # regardless of user intent. Rate limits are included: after retries
+        # are exhausted, a 429 means the provider is at capacity. See #52228.
        # See #26803: daily token quota must fall back like a 402 credit error.
        is_auto = resolved_provider in {"auto", "", None}
-        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err) or _is_rate_limit_error(first_err)
        if should_fallback and (is_auto or is_capacity_error):
            if _is_payment_error(first_err):
                reason = "payment error"
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@ -1885,6 +1885,44 @@ class TestAuxiliaryFallbackLayering:

        assert main_client.chat.completions.create.called

+    def test_explicit_provider_rate_limit_triggers_fallback(self, monkeypatch):
+        """429 rate-limit on an explicit provider must trigger fallback (not be ignored).
+
+        Regression test for #52228: rate limits were excluded from
+        ``is_capacity_error``, so explicit-provider auxiliary calls never
+        fell back on 429 — only auto-provider calls did.
+        """
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+
+        primary_client = MagicMock()
+        rate_err = Exception("Rate limit exceeded, try again in 60 seconds")
+        rate_err.status_code = 429
+        primary_client.chat.completions.create.side_effect = rate_err
+
+        fallback_client = MagicMock()
+        fallback_client.chat.completions.create.return_value = MagicMock(choices=[
+            MagicMock(message=MagicMock(content="from fallback chain"))
+        ])
+
+        with patch("agent.auxiliary_client._get_cached_client",
+                   return_value=(primary_client, "gpt-5.5")), \
+             patch("agent.auxiliary_client._resolve_task_provider_model",
+                   return_value=("openai-codex", "gpt-5.5", None, None, None)), \
+             patch("agent.auxiliary_client._try_configured_fallback_chain",
+                   return_value=(fallback_client, "deepseek-v4-pro", "fallback_chain[0](opencode-go)")) as mock_chain, \
+             patch("agent.auxiliary_client._try_main_agent_model_fallback") as mock_main:
+            result = call_llm(
+                task="kanban_decomposer",
+                messages=[{"role": "user", "content": "decompose this"}],
+            )
+
+        # Fallback chain MUST be tried for rate-limit on explicit provider
+        mock_chain.assert_called()
+        assert fallback_client.chat.completions.create.called
+        # Main agent fallback should NOT be needed when chain succeeds
+        mock_main.assert_not_called()
+
+
    def test_warning_emitted_when_all_fallbacks_exhausted(self, monkeypatch, caplog):
        """When chain AND main model both fail, a user-visible warning fires before re-raise."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")