fix(auxiliary): retry transient transport error once before fallback (#16587)
Some checks failed
Deploy Site / deploy-vercel (push) Waiting to run
Deploy Site / deploy-docs (push) Waiting to run
Docker Build and Publish / build-amd64 (push) Waiting to run
Docker Build and Publish / build-arm64 (push) Waiting to run
Docker Build and Publish / merge (push) Blocked by required conditions
Lint (ruff + ty) / ruff + ty diff (push) Waiting to run
Lint (ruff + ty) / ruff enforcement (blocking) (push) Waiting to run
Lint (ruff + ty) / Windows footguns (blocking) (push) Waiting to run
Nix / nix (macos-latest) (push) Waiting to run
Nix / nix (ubuntu-latest) (push) Waiting to run
Tests / test (1) (push) Waiting to run
Tests / test (2) (push) Waiting to run
Tests / test (3) (push) Waiting to run
Tests / test (4) (push) Waiting to run
Tests / test (5) (push) Waiting to run
Tests / test (6) (push) Waiting to run
Tests / save-durations (push) Blocked by required conditions
Tests / e2e (push) Waiting to run
Nix Lockfile Fix / auto-fix-main (push) Has been cancelled
Nix Lockfile Fix / fix (push) Has been cancelled

A one-off transient transport failure (streaming-close / incomplete
chunked read / 5xx / 408) on an auxiliary LLM call escalated straight to
provider/model fallback (or, for context compression, dropped the summary
and entered cooldown), even when an immediate retry on the same provider
would have succeeded.

Add a single same-target retry at the top of call_llm() and
async_call_llm() — before the existing except-chain — gated on a new
_is_transient_transport_error() that reuses the canonical
_is_connection_error() detector plus a 5xx/408 status check. A second
failure (or any non-transient error: auth, other 4xx, malformed payload)
falls through to first_err and the existing fallback handling unchanged.

This lives in call_llm so every auxiliary task (compression, memory flush,
title generation, session search, vision) shares one transient-retry
surface, rather than each caller re-implementing it. The context
compressor needs no change — it calls call_llm and inherits the retry; its
existing fallback-to-main path (#18458) now composes naturally (retry the
aux model once, then fall back to main only if the retry also fails).

Co-authored-by: ARegalado1 <alberto.regalado@ymail.com>
This commit is contained in:
teknium1 2026-06-08 00:03:34 -07:00 committed by Teknium
parent 4107076128
commit 02a4d66951
3 changed files with 160 additions and 4 deletions

View file

@ -2476,6 +2476,25 @@ def _is_connection_error(exc: Exception) -> bool:
return False
def _is_transient_transport_error(exc: Exception) -> bool:
"""Return True for a one-off transport blip worth retrying ONCE on the
same provider before any provider/model fallback.
Covers connection/streaming-close errors (via the canonical
``_is_connection_error`` detector, shared so the two cannot drift) plus a
pure 5xx/408 HTTP status. Deliberately narrow: this is the "retry the
same target once" gate, distinct from ``_is_payment_error`` /
``_is_auth_error`` / ``_is_rate_limit_error`` which the except-chain
handles by switching provider, refreshing creds, or rotating the pool.
"""
if _is_connection_error(exc):
return True
status = getattr(exc, "status_code", None) or getattr(
getattr(exc, "response", None), "status_code", None
)
return isinstance(status, int) and (status == 408 or 500 <= status < 600)
def _is_auth_error(exc: Exception) -> bool:
"""Detect auth failures that should trigger provider-specific refresh."""
status = getattr(exc, "status_code", None)
@ -5147,8 +5166,28 @@ def call_llm(
# Handle unsupported temperature, max_tokens vs max_completion_tokens retry,
# then payment fallback.
try:
return _validate_llm_response(
client.chat.completions.create(**kwargs), task)
# Retry ONCE on the same provider for a one-off transient transport
# blip (streaming-close / incomplete chunked read / 5xx / 408) before
# the except-chain below escalates to provider/model fallback. A
# single dropped connection shouldn't abandon an otherwise-healthy
# provider. A second failure (or any non-transient error) falls
# through to ``first_err`` and the existing fallback handling
# unchanged. This is the unified home for the transient retry that
# every auxiliary task (compression, memory flush, title-gen,
# session-search, vision) shares. (PR #16587)
try:
return _validate_llm_response(
client.chat.completions.create(**kwargs), task)
except Exception as transient_err:
if not _is_transient_transport_error(transient_err):
raise
logger.info(
"Auxiliary %s: transient transport error; retrying once on "
"the same provider before fallback: %s",
task or "call", transient_err,
)
return _validate_llm_response(
client.chat.completions.create(**kwargs), task)
except Exception as first_err:
if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
retry_kwargs = dict(kwargs)
@ -5614,8 +5653,22 @@ async def async_call_llm(
kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
try:
return _validate_llm_response(
await client.chat.completions.create(**kwargs), task)
# Retry ONCE on the same provider for a transient transport blip
# before the except-chain escalates to fallback — see call_llm()
# for the rationale. (PR #16587)
try:
return _validate_llm_response(
await client.chat.completions.create(**kwargs), task)
except Exception as transient_err:
if not _is_transient_transport_error(transient_err):
raise
logger.info(
"Auxiliary %s (async): transient transport error; retrying "
"once on the same provider before fallback: %s",
task or "call", transient_err,
)
return _validate_llm_response(
await client.chat.completions.create(**kwargs), task)
except Exception as first_err:
if "temperature" in kwargs and _is_unsupported_temperature_error(first_err):
retry_kwargs = dict(kwargs)

View file

@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
# Auto-extracted from noreply emails + manual overrides
AUTHOR_MAP = {
"alberto.regalado@ymail.com": "ARegalado1",
"alchemistchaos@protonmail.com": "AlchemistChaos", # co-author only
"gilad@smiti.ai": "giladbau",
"yusufalweshdemir@gmail.com": "Dusk1e",

View file

@ -1794,6 +1794,108 @@ def test_resolve_api_key_provider_skips_unconfigured_anthropic(monkeypatch):
# ---------------------------------------------------------------------------
class TestTransientTransportRetry:
"""call_llm retries ONCE on the same provider for a transient transport
blip before escalating to the fallback chain.
Salvaged from PR #16587 (@ARegalado1). The original fixed only the
context-compression caller; this lives in call_llm so every auxiliary
task (compression, memory flush, title-gen, session-search, vision)
gets the same same-target retry, and the gate reuses the canonical
_is_connection_error detector.
"""
def _patches(self, client):
return (
patch(
"agent.auxiliary_client._resolve_task_provider_model",
return_value=("openrouter", "some-model", None, None, None),
),
patch(
"agent.auxiliary_client._get_cached_client",
return_value=(client, "some-model"),
),
patch(
"agent.auxiliary_client._validate_llm_response",
side_effect=lambda resp, _task: resp,
),
)
def test_retries_streaming_close_once_same_provider(self):
client = MagicMock()
client.base_url = "https://openrouter.ai/api/v1"
client.chat.completions.create.side_effect = [
Exception(
"peer closed connection without sending complete message body "
"(incomplete chunked read)"
),
{"ok": True},
]
p1, p2, p3 = self._patches(client)
with p1, p2, p3:
result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
assert result == {"ok": True}
# Same client called twice — no provider fallback needed.
assert client.chat.completions.create.call_count == 2
def test_retries_5xx_once_same_provider(self):
class _Err503(Exception):
status_code = 503
client = MagicMock()
client.base_url = "https://openrouter.ai/api/v1"
client.chat.completions.create.side_effect = [_Err503("upstream"), {"ok": True}]
p1, p2, p3 = self._patches(client)
with p1, p2, p3:
result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
assert result == {"ok": True}
assert client.chat.completions.create.call_count == 2
def test_does_not_retry_non_transient_400(self):
class _Err400(Exception):
status_code = 400
client = MagicMock()
client.base_url = "https://openrouter.ai/api/v1"
client.chat.completions.create.side_effect = _Err400("bad request")
p1, p2, p3 = self._patches(client)
with p1, p2, p3, pytest.raises(_Err400):
call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
# Non-transient: single attempt, no same-target retry.
assert client.chat.completions.create.call_count == 1
def test_second_transient_failure_escalates_to_fallback(self):
"""Two transient failures in a row exhaust the same-target retry and
fall through to the existing connection-error provider fallback."""
primary = MagicMock()
primary.base_url = "https://openrouter.ai/api/v1"
primary.chat.completions.create.side_effect = Exception(
"peer closed connection without sending complete message body"
)
fb_client = MagicMock()
fb_client.base_url = "https://api.openai.com/v1"
fb_client.chat.completions.create.return_value = {"fallback": True}
p1, p2, p3 = self._patches(primary)
with (
p1, p2, p3,
patch(
"agent.auxiliary_client._try_configured_fallback_chain",
return_value=(None, None, ""),
),
patch(
"agent.auxiliary_client._try_main_agent_model_fallback",
return_value=(fb_client, "fb-model", "openai"),
),
):
result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
assert result == {"fallback": True}
# Primary tried twice (initial + same-target retry), then fallback.
assert primary.chat.completions.create.call_count == 2
assert fb_client.chat.completions.create.call_count == 1
class TestIsConnectionError:
"""Tests for _is_connection_error detection."""