mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
The auxiliary OpenAI clients were built without overriding the SDK's default max_retries=2, so every aux call silently made up to 3 attempts against a slow/hung endpoint — a 120s timeout could stall ~360s before Hermes saw a single failure. On the critical compression preflight path, Hermes then added its own same-provider timeout retry on top, roughly doubling the user-visible stall again before fallback. - Build both the sync (_create_openai_client) and async (_to_async_client) aux clients with max_retries=0 (setdefault, so explicit callers still override). Hermes already owns retry + provider/model fallback policy. - For task == compression, skip the same-provider transient retry on a full-budget timeout and fall straight through to fallback. Fast blips (streaming-close, 5xx) still retry, since those are cheap. - Add _is_timeout_error to distinguish a full-budget timeout from a fast connection drop. Addresses the retry-multiplication root cause of #54465 (the resume-wedge persistence half landed in #55499).
This commit is contained in:
parent
e6f66bc0f0
commit
c8376e0dc6
2 changed files with 196 additions and 0 deletions
|
|
@ -124,6 +124,15 @@ def _openai_http_client_kwargs(
|
|||
|
||||
def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any:
|
||||
kwargs = {**_openai_http_client_kwargs(base_url), **kwargs}
|
||||
# Hermes owns auxiliary retry + provider/model fallback policy (the
|
||||
# same-provider transient retry in call_llm plus the except-chain
|
||||
# fallback). The OpenAI SDK's own default (max_retries=2 → up to 3
|
||||
# attempts) silently multiplies the effective wall time of every aux call
|
||||
# by 3× on a slow/hung endpoint, so a 120s timeout can stall ~360s before
|
||||
# Hermes sees a single failure (issue #54465). Disable SDK-internal retries
|
||||
# by default and let Hermes control the budget; explicit callers can still
|
||||
# override via kwargs.
|
||||
kwargs.setdefault("max_retries", 0)
|
||||
return OpenAI(api_key=api_key, base_url=base_url, **kwargs)
|
||||
|
||||
|
||||
|
|
@ -2590,6 +2599,27 @@ def _is_rate_limit_error(exc: Exception) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def _is_timeout_error(exc: Exception) -> bool:
|
||||
"""Detect a request timeout — the full-budget stall, distinct from a fast
|
||||
connection drop.
|
||||
|
||||
A timeout burns the entire configured ``timeout`` before surfacing, so a
|
||||
same-provider retry on the critical compression path doubles the
|
||||
user-visible wall time (issue #54465). A streaming-close / dropped
|
||||
connection, by contrast, fails fast and is cheap to retry — those stay on
|
||||
the retry path even for compression.
|
||||
"""
|
||||
try:
|
||||
from openai import APITimeoutError
|
||||
if isinstance(exc, APITimeoutError):
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
if "Timeout" in type(exc).__name__:
|
||||
return True
|
||||
return "timed out" in str(exc).lower()
|
||||
|
||||
|
||||
def _is_connection_error(exc: Exception) -> bool:
|
||||
"""Detect connection/network errors that warrant provider fallback.
|
||||
|
||||
|
|
@ -3824,6 +3854,9 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
|
|||
**_openai_http_client_kwargs(sync_base_url, async_mode=True),
|
||||
**async_kwargs,
|
||||
}
|
||||
# See _create_openai_client: disable SDK-internal retries so Hermes owns
|
||||
# the auxiliary retry/timeout budget (issue #54465).
|
||||
async_kwargs.setdefault("max_retries", 0)
|
||||
return AsyncOpenAI(**async_kwargs), model
|
||||
|
||||
|
||||
|
|
@ -5785,6 +5818,21 @@ def call_llm(
|
|||
except Exception as transient_err:
|
||||
if not _is_transient_transport_error(transient_err):
|
||||
raise
|
||||
# Compression is on the critical preflight path: a user cannot
|
||||
# continue or resume an oversized session until it compacts. A
|
||||
# same-provider retry on a timeout means another full ``timeout``-
|
||||
# long wall-clock block before the except-chain below can fall
|
||||
# back — doubling the user-visible stall (issue #54465). Skip the
|
||||
# same-provider retry for compression on a full-budget timeout and
|
||||
# fall straight through to provider/model fallback; fast blips (a
|
||||
# streaming-close or a 5xx) still retry, since those are cheap.
|
||||
if task == "compression" and _is_timeout_error(transient_err):
|
||||
logger.info(
|
||||
"Auxiliary compression: timeout on the critical path; "
|
||||
"skipping same-provider retry and falling back: %s",
|
||||
transient_err,
|
||||
)
|
||||
raise
|
||||
logger.info(
|
||||
"Auxiliary %s: transient transport error; retrying once on "
|
||||
"the same provider before fallback: %s",
|
||||
|
|
@ -6310,6 +6358,16 @@ async def async_call_llm(
|
|||
except Exception as transient_err:
|
||||
if not _is_transient_transport_error(transient_err):
|
||||
raise
|
||||
# See call_llm(): compression is on the critical preflight path,
|
||||
# so skip the same-provider retry on a full-budget timeout and
|
||||
# fall straight through to fallback (issue #54465).
|
||||
if task == "compression" and _is_timeout_error(transient_err):
|
||||
logger.info(
|
||||
"Auxiliary compression (async): timeout on the critical "
|
||||
"path; skipping same-provider retry and falling back: %s",
|
||||
transient_err,
|
||||
)
|
||||
raise
|
||||
logger.info(
|
||||
"Auxiliary %s (async): transient transport error; retrying "
|
||||
"once on the same provider before fallback: %s",
|
||||
|
|
|
|||
|
|
@ -2492,6 +2492,144 @@ class TestTransientTransportRetry:
|
|||
assert primary.chat.completions.create.call_count == 2
|
||||
assert fb_client.chat.completions.create.call_count == 1
|
||||
|
||||
def test_compression_skips_same_provider_retry_on_timeout(self):
|
||||
"""A timeout on the critical compression path must NOT retry the same
|
||||
provider (that doubles the user-visible stall, issue #54465) — it
|
||||
falls straight through to the fallback chain instead.
|
||||
"""
|
||||
class _Timeout(Exception):
|
||||
pass
|
||||
_Timeout.__name__ = "APITimeoutError"
|
||||
|
||||
primary = MagicMock()
|
||||
primary.base_url = "https://openrouter.ai/api/v1"
|
||||
primary.chat.completions.create.side_effect = _Timeout("Request timed out.")
|
||||
|
||||
fb_client = MagicMock()
|
||||
fb_client.base_url = "https://api.openai.com/v1"
|
||||
fb_client.chat.completions.create.return_value = {"fallback": True}
|
||||
|
||||
p1, p2, p3 = self._patches(primary)
|
||||
with (
|
||||
p1, p2, p3,
|
||||
patch(
|
||||
"agent.auxiliary_client._try_configured_fallback_chain",
|
||||
return_value=(None, None, ""),
|
||||
),
|
||||
patch(
|
||||
"agent.auxiliary_client._try_main_agent_model_fallback",
|
||||
return_value=(fb_client, "fb-model", "openai"),
|
||||
),
|
||||
):
|
||||
result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
|
||||
assert result == {"fallback": True}
|
||||
# Primary tried ONCE only — no same-provider timeout retry — then fallback.
|
||||
assert primary.chat.completions.create.call_count == 1
|
||||
assert fb_client.chat.completions.create.call_count == 1
|
||||
|
||||
def test_non_compression_still_retries_same_provider_on_timeout(self):
|
||||
"""The timeout skip is scoped to compression only; other auxiliary
|
||||
tasks keep the single same-provider transient retry.
|
||||
"""
|
||||
class _Timeout(Exception):
|
||||
pass
|
||||
_Timeout.__name__ = "APITimeoutError"
|
||||
|
||||
client = MagicMock()
|
||||
client.base_url = "https://openrouter.ai/api/v1"
|
||||
client.chat.completions.create.side_effect = [
|
||||
_Timeout("Request timed out."),
|
||||
{"ok": True},
|
||||
]
|
||||
p1, p2, p3 = self._patches(client)
|
||||
with p1, p2, p3:
|
||||
result = call_llm(task="title_generation", messages=[{"role": "user", "content": "hi"}])
|
||||
assert result == {"ok": True}
|
||||
assert client.chat.completions.create.call_count == 2
|
||||
|
||||
def test_compression_still_retries_streaming_close_on_timeout_path(self):
|
||||
"""A fast streaming-close (not a full-budget timeout) still retries
|
||||
same-provider even for compression — only timeouts are skipped.
|
||||
"""
|
||||
client = MagicMock()
|
||||
client.base_url = "https://openrouter.ai/api/v1"
|
||||
client.chat.completions.create.side_effect = [
|
||||
Exception(
|
||||
"peer closed connection without sending complete message body "
|
||||
"(incomplete chunked read)"
|
||||
),
|
||||
{"ok": True},
|
||||
]
|
||||
p1, p2, p3 = self._patches(client)
|
||||
with p1, p2, p3:
|
||||
result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
|
||||
assert result == {"ok": True}
|
||||
assert client.chat.completions.create.call_count == 2
|
||||
|
||||
|
||||
class TestAuxClientNoSdkRetries:
|
||||
"""Auxiliary OpenAI clients are constructed with SDK-internal retries
|
||||
disabled so Hermes owns the retry/timeout budget (issue #54465). The SDK
|
||||
default (max_retries=2 → 3 attempts) silently triples the effective wall
|
||||
time of every aux call against a slow/hung endpoint.
|
||||
"""
|
||||
|
||||
def test_sync_client_disables_sdk_retries(self):
|
||||
from agent import auxiliary_client as ac
|
||||
captured = {}
|
||||
|
||||
class _FakeOpenAI:
|
||||
def __init__(self, **kwargs):
|
||||
captured.update(kwargs)
|
||||
|
||||
with patch.object(ac, "OpenAI", _FakeOpenAI), \
|
||||
patch.object(ac, "_openai_http_client_kwargs", return_value={}):
|
||||
ac._create_openai_client(api_key="k", base_url="https://x/v1")
|
||||
assert captured.get("max_retries") == 0
|
||||
|
||||
def test_explicit_max_retries_override_wins(self):
|
||||
from agent import auxiliary_client as ac
|
||||
captured = {}
|
||||
|
||||
class _FakeOpenAI:
|
||||
def __init__(self, **kwargs):
|
||||
captured.update(kwargs)
|
||||
|
||||
with patch.object(ac, "OpenAI", _FakeOpenAI), \
|
||||
patch.object(ac, "_openai_http_client_kwargs", return_value={}):
|
||||
ac._create_openai_client(api_key="k", base_url="https://x/v1", max_retries=5)
|
||||
assert captured.get("max_retries") == 5
|
||||
|
||||
|
||||
class TestIsTimeoutError:
|
||||
"""_is_timeout_error distinguishes a full-budget timeout from a fast
|
||||
connection drop."""
|
||||
|
||||
def test_timed_out_string(self):
|
||||
from agent.auxiliary_client import _is_timeout_error
|
||||
assert _is_timeout_error(Exception("Request timed out.")) is True
|
||||
|
||||
def test_timeout_typename(self):
|
||||
from agent.auxiliary_client import _is_timeout_error
|
||||
|
||||
class ReadTimeout(Exception):
|
||||
pass
|
||||
|
||||
assert _is_timeout_error(ReadTimeout("slow")) is True
|
||||
|
||||
def test_streaming_close_is_not_timeout(self):
|
||||
from agent.auxiliary_client import _is_timeout_error
|
||||
err = Exception("peer closed connection (incomplete chunked read)")
|
||||
assert _is_timeout_error(err) is False
|
||||
|
||||
def test_5xx_is_not_timeout(self):
|
||||
from agent.auxiliary_client import _is_timeout_error
|
||||
|
||||
class _Err503(Exception):
|
||||
status_code = 503
|
||||
|
||||
assert _is_timeout_error(_Err503("upstream")) is False
|
||||
|
||||
|
||||
class TestIsConnectionError:
|
||||
"""Tests for _is_connection_error detection."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue