fix(auxiliary): stop SDK retries from multiplying compression stall (#54465) (#55544)

The auxiliary OpenAI clients were built without overriding the SDK's
default max_retries=2, so every aux call silently made up to 3 attempts
against a slow/hung endpoint — a 120s timeout could stall ~360s before
Hermes saw a single failure. On the critical compression preflight path,
Hermes then added its own same-provider timeout retry on top, roughly
doubling the user-visible stall again before fallback.

- Build both the sync (_create_openai_client) and async (_to_async_client)
  aux clients with max_retries=0 (setdefault, so explicit callers still
  override). Hermes already owns retry + provider/model fallback policy.
- For task == compression, skip the same-provider transient retry on a
  full-budget timeout and fall straight through to fallback. Fast blips
  (streaming-close, 5xx) still retry, since those are cheap.
- Add _is_timeout_error to distinguish a full-budget timeout from a fast
  connection drop.

Addresses the retry-multiplication root cause of #54465 (the resume-wedge
persistence half landed in #55499).
This commit is contained in:
Teknium 2026-06-30 02:54:08 -07:00 committed by GitHub
parent e6f66bc0f0
commit c8376e0dc6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 196 additions and 0 deletions

View file

@ -124,6 +124,15 @@ def _openai_http_client_kwargs(
def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any:
kwargs = {**_openai_http_client_kwargs(base_url), **kwargs}
# Hermes owns auxiliary retry + provider/model fallback policy (the
# same-provider transient retry in call_llm plus the except-chain
# fallback). The OpenAI SDK's own default (max_retries=2 → up to 3
# attempts) silently multiplies the effective wall time of every aux call
# by 3× on a slow/hung endpoint, so a 120s timeout can stall ~360s before
# Hermes sees a single failure (issue #54465). Disable SDK-internal retries
# by default and let Hermes control the budget; explicit callers can still
# override via kwargs.
kwargs.setdefault("max_retries", 0)
return OpenAI(api_key=api_key, base_url=base_url, **kwargs)
@ -2590,6 +2599,27 @@ def _is_rate_limit_error(exc: Exception) -> bool:
return False
def _is_timeout_error(exc: Exception) -> bool:
"""Detect a request timeout — the full-budget stall, distinct from a fast
connection drop.
A timeout burns the entire configured ``timeout`` before surfacing, so a
same-provider retry on the critical compression path doubles the
user-visible wall time (issue #54465). A streaming-close / dropped
connection, by contrast, fails fast and is cheap to retry those stay on
the retry path even for compression.
"""
try:
from openai import APITimeoutError
if isinstance(exc, APITimeoutError):
return True
except ImportError:
pass
if "Timeout" in type(exc).__name__:
return True
return "timed out" in str(exc).lower()
def _is_connection_error(exc: Exception) -> bool:
"""Detect connection/network errors that warrant provider fallback.
@ -3824,6 +3854,9 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
**_openai_http_client_kwargs(sync_base_url, async_mode=True),
**async_kwargs,
}
# See _create_openai_client: disable SDK-internal retries so Hermes owns
# the auxiliary retry/timeout budget (issue #54465).
async_kwargs.setdefault("max_retries", 0)
return AsyncOpenAI(**async_kwargs), model
@ -5785,6 +5818,21 @@ def call_llm(
except Exception as transient_err:
if not _is_transient_transport_error(transient_err):
raise
# Compression is on the critical preflight path: a user cannot
# continue or resume an oversized session until it compacts. A
# same-provider retry on a timeout means another full ``timeout``-
# long wall-clock block before the except-chain below can fall
# back — doubling the user-visible stall (issue #54465). Skip the
# same-provider retry for compression on a full-budget timeout and
# fall straight through to provider/model fallback; fast blips (a
# streaming-close or a 5xx) still retry, since those are cheap.
if task == "compression" and _is_timeout_error(transient_err):
logger.info(
"Auxiliary compression: timeout on the critical path; "
"skipping same-provider retry and falling back: %s",
transient_err,
)
raise
logger.info(
"Auxiliary %s: transient transport error; retrying once on "
"the same provider before fallback: %s",
@ -6310,6 +6358,16 @@ async def async_call_llm(
except Exception as transient_err:
if not _is_transient_transport_error(transient_err):
raise
# See call_llm(): compression is on the critical preflight path,
# so skip the same-provider retry on a full-budget timeout and
# fall straight through to fallback (issue #54465).
if task == "compression" and _is_timeout_error(transient_err):
logger.info(
"Auxiliary compression (async): timeout on the critical "
"path; skipping same-provider retry and falling back: %s",
transient_err,
)
raise
logger.info(
"Auxiliary %s (async): transient transport error; retrying "
"once on the same provider before fallback: %s",

View file

@ -2492,6 +2492,144 @@ class TestTransientTransportRetry:
assert primary.chat.completions.create.call_count == 2
assert fb_client.chat.completions.create.call_count == 1
def test_compression_skips_same_provider_retry_on_timeout(self):
"""A timeout on the critical compression path must NOT retry the same
provider (that doubles the user-visible stall, issue #54465) — it
falls straight through to the fallback chain instead.
"""
class _Timeout(Exception):
pass
_Timeout.__name__ = "APITimeoutError"
primary = MagicMock()
primary.base_url = "https://openrouter.ai/api/v1"
primary.chat.completions.create.side_effect = _Timeout("Request timed out.")
fb_client = MagicMock()
fb_client.base_url = "https://api.openai.com/v1"
fb_client.chat.completions.create.return_value = {"fallback": True}
p1, p2, p3 = self._patches(primary)
with (
p1, p2, p3,
patch(
"agent.auxiliary_client._try_configured_fallback_chain",
return_value=(None, None, ""),
),
patch(
"agent.auxiliary_client._try_main_agent_model_fallback",
return_value=(fb_client, "fb-model", "openai"),
),
):
result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
assert result == {"fallback": True}
# Primary tried ONCE only — no same-provider timeout retry — then fallback.
assert primary.chat.completions.create.call_count == 1
assert fb_client.chat.completions.create.call_count == 1
def test_non_compression_still_retries_same_provider_on_timeout(self):
"""The timeout skip is scoped to compression only; other auxiliary
tasks keep the single same-provider transient retry.
"""
class _Timeout(Exception):
pass
_Timeout.__name__ = "APITimeoutError"
client = MagicMock()
client.base_url = "https://openrouter.ai/api/v1"
client.chat.completions.create.side_effect = [
_Timeout("Request timed out."),
{"ok": True},
]
p1, p2, p3 = self._patches(client)
with p1, p2, p3:
result = call_llm(task="title_generation", messages=[{"role": "user", "content": "hi"}])
assert result == {"ok": True}
assert client.chat.completions.create.call_count == 2
def test_compression_still_retries_streaming_close_on_timeout_path(self):
"""A fast streaming-close (not a full-budget timeout) still retries
same-provider even for compression only timeouts are skipped.
"""
client = MagicMock()
client.base_url = "https://openrouter.ai/api/v1"
client.chat.completions.create.side_effect = [
Exception(
"peer closed connection without sending complete message body "
"(incomplete chunked read)"
),
{"ok": True},
]
p1, p2, p3 = self._patches(client)
with p1, p2, p3:
result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
assert result == {"ok": True}
assert client.chat.completions.create.call_count == 2
class TestAuxClientNoSdkRetries:
"""Auxiliary OpenAI clients are constructed with SDK-internal retries
disabled so Hermes owns the retry/timeout budget (issue #54465). The SDK
default (max_retries=2 3 attempts) silently triples the effective wall
time of every aux call against a slow/hung endpoint.
"""
def test_sync_client_disables_sdk_retries(self):
from agent import auxiliary_client as ac
captured = {}
class _FakeOpenAI:
def __init__(self, **kwargs):
captured.update(kwargs)
with patch.object(ac, "OpenAI", _FakeOpenAI), \
patch.object(ac, "_openai_http_client_kwargs", return_value={}):
ac._create_openai_client(api_key="k", base_url="https://x/v1")
assert captured.get("max_retries") == 0
def test_explicit_max_retries_override_wins(self):
from agent import auxiliary_client as ac
captured = {}
class _FakeOpenAI:
def __init__(self, **kwargs):
captured.update(kwargs)
with patch.object(ac, "OpenAI", _FakeOpenAI), \
patch.object(ac, "_openai_http_client_kwargs", return_value={}):
ac._create_openai_client(api_key="k", base_url="https://x/v1", max_retries=5)
assert captured.get("max_retries") == 5
class TestIsTimeoutError:
"""_is_timeout_error distinguishes a full-budget timeout from a fast
connection drop."""
def test_timed_out_string(self):
from agent.auxiliary_client import _is_timeout_error
assert _is_timeout_error(Exception("Request timed out.")) is True
def test_timeout_typename(self):
from agent.auxiliary_client import _is_timeout_error
class ReadTimeout(Exception):
pass
assert _is_timeout_error(ReadTimeout("slow")) is True
def test_streaming_close_is_not_timeout(self):
from agent.auxiliary_client import _is_timeout_error
err = Exception("peer closed connection (incomplete chunked read)")
assert _is_timeout_error(err) is False
def test_5xx_is_not_timeout(self):
from agent.auxiliary_client import _is_timeout_error
class _Err503(Exception):
status_code = 503
assert _is_timeout_error(_Err503("upstream")) is False
class TestIsConnectionError:
"""Tests for _is_connection_error detection."""