fix: include rate-limit in auxiliary capacity-error fallback gate

Rate-limit (429) errors on explicit-provider auxiliary tasks were
silently failing instead of triggering the fallback chain. The
is_capacity_error gate only checked payment and connection errors,
excluding rate limits — so when a configured provider like
openai-codex hit its rate limit, auxiliary tasks (kanban_decomposer,
vision, web_extract, approval, etc.) had zero resilience.

Add _is_rate_limit_error() to is_capacity_error at both call sites
(sync and async paths) so rate limits trigger fallback regardless
of whether the provider was auto-detected or explicitly configured.

Fixes #52228
This commit is contained in:
pyxl-dev 2026-06-25 01:17:40 +00:00 committed by Teknium
parent ff8920299c
commit 751adfa6b9
2 changed files with 46 additions and 4 deletions

View file

@ -5669,7 +5669,9 @@ def call_llm(
is_auto = resolved_provider in {"auto", "", None}
# Capacity errors bypass the explicit-provider gate: the provider
# literally cannot serve this request regardless of user intent.
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
# Rate limits are included: after retries are exhausted, a 429 means
# the provider cannot serve this request — fall back. See #52228.
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err) or _is_rate_limit_error(first_err)
if should_fallback and (is_auto or is_capacity_error):
if _is_payment_error(first_err):
reason = "payment error"
@ -6112,11 +6114,13 @@ async def async_call_llm(
or _is_connection_error(first_err)
or _is_rate_limit_error(first_err)
)
# Capacity errors (payment/quota/connection) bypass the explicit-provider
# gate — the provider cannot serve the request regardless of user intent.
# Capacity errors (payment/quota/connection/rate-limit) bypass the
# explicit-provider gate — the provider cannot serve the request
# regardless of user intent. Rate limits are included: after retries
# are exhausted, a 429 means the provider is at capacity. See #52228.
# See #26803: daily token quota must fall back like a 402 credit error.
is_auto = resolved_provider in {"auto", "", None}
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err) or _is_rate_limit_error(first_err)
if should_fallback and (is_auto or is_capacity_error):
if _is_payment_error(first_err):
reason = "payment error"

View file

@ -1885,6 +1885,44 @@ class TestAuxiliaryFallbackLayering:
assert main_client.chat.completions.create.called
def test_explicit_provider_rate_limit_triggers_fallback(self, monkeypatch):
"""429 rate-limit on an explicit provider must trigger fallback (not be ignored).
Regression test for #52228: rate limits were excluded from
``is_capacity_error``, so explicit-provider auxiliary calls never
fell back on 429 — only auto-provider calls did.
"""
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
primary_client = MagicMock()
rate_err = Exception("Rate limit exceeded, try again in 60 seconds")
rate_err.status_code = 429
primary_client.chat.completions.create.side_effect = rate_err
fallback_client = MagicMock()
fallback_client.chat.completions.create.return_value = MagicMock(choices=[
MagicMock(message=MagicMock(content="from fallback chain"))
])
with patch("agent.auxiliary_client._get_cached_client",
return_value=(primary_client, "gpt-5.5")), \
patch("agent.auxiliary_client._resolve_task_provider_model",
return_value=("openai-codex", "gpt-5.5", None, None, None)), \
patch("agent.auxiliary_client._try_configured_fallback_chain",
return_value=(fallback_client, "deepseek-v4-pro", "fallback_chain[0](opencode-go)")) as mock_chain, \
patch("agent.auxiliary_client._try_main_agent_model_fallback") as mock_main:
result = call_llm(
task="kanban_decomposer",
messages=[{"role": "user", "content": "decompose this"}],
)
# Fallback chain MUST be tried for rate-limit on explicit provider
mock_chain.assert_called()
assert fallback_client.chat.completions.create.called
# Main agent fallback should NOT be needed when chain succeeds
mock_main.assert_not_called()
def test_warning_emitted_when_all_fallbacks_exhausted(self, monkeypatch, caplog):
"""When chain AND main model both fail, a user-visible warning fires before re-raise."""
monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")