diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 1807e7be2ee..459ec8da2ac 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -124,6 +124,15 @@ def _openai_http_client_kwargs( def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any: kwargs = {**_openai_http_client_kwargs(base_url), **kwargs} + # Hermes owns auxiliary retry + provider/model fallback policy (the + # same-provider transient retry in call_llm plus the except-chain + # fallback). The OpenAI SDK's own default (max_retries=2 → up to 3 + # attempts) silently multiplies the effective wall time of every aux call + # by 3× on a slow/hung endpoint, so a 120s timeout can stall ~360s before + # Hermes sees a single failure (issue #54465). Disable SDK-internal retries + # by default and let Hermes control the budget; explicit callers can still + # override via kwargs. + kwargs.setdefault("max_retries", 0) return OpenAI(api_key=api_key, base_url=base_url, **kwargs) @@ -2590,6 +2599,27 @@ def _is_rate_limit_error(exc: Exception) -> bool: return False +def _is_timeout_error(exc: Exception) -> bool: + """Detect a request timeout — the full-budget stall, distinct from a fast + connection drop. + + A timeout burns the entire configured ``timeout`` before surfacing, so a + same-provider retry on the critical compression path doubles the + user-visible wall time (issue #54465). A streaming-close / dropped + connection, by contrast, fails fast and is cheap to retry — those stay on + the retry path even for compression. + """ + try: + from openai import APITimeoutError + if isinstance(exc, APITimeoutError): + return True + except ImportError: + pass + if "Timeout" in type(exc).__name__: + return True + return "timed out" in str(exc).lower() + + def _is_connection_error(exc: Exception) -> bool: """Detect connection/network errors that warrant provider fallback. @@ -3824,6 +3854,9 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False): **_openai_http_client_kwargs(sync_base_url, async_mode=True), **async_kwargs, } + # See _create_openai_client: disable SDK-internal retries so Hermes owns + # the auxiliary retry/timeout budget (issue #54465). + async_kwargs.setdefault("max_retries", 0) return AsyncOpenAI(**async_kwargs), model @@ -5785,6 +5818,21 @@ def call_llm( except Exception as transient_err: if not _is_transient_transport_error(transient_err): raise + # Compression is on the critical preflight path: a user cannot + # continue or resume an oversized session until it compacts. A + # same-provider retry on a timeout means another full ``timeout``- + # long wall-clock block before the except-chain below can fall + # back — doubling the user-visible stall (issue #54465). Skip the + # same-provider retry for compression on a full-budget timeout and + # fall straight through to provider/model fallback; fast blips (a + # streaming-close or a 5xx) still retry, since those are cheap. + if task == "compression" and _is_timeout_error(transient_err): + logger.info( + "Auxiliary compression: timeout on the critical path; " + "skipping same-provider retry and falling back: %s", + transient_err, + ) + raise logger.info( "Auxiliary %s: transient transport error; retrying once on " "the same provider before fallback: %s", @@ -6310,6 +6358,16 @@ async def async_call_llm( except Exception as transient_err: if not _is_transient_transport_error(transient_err): raise + # See call_llm(): compression is on the critical preflight path, + # so skip the same-provider retry on a full-budget timeout and + # fall straight through to fallback (issue #54465). + if task == "compression" and _is_timeout_error(transient_err): + logger.info( + "Auxiliary compression (async): timeout on the critical " + "path; skipping same-provider retry and falling back: %s", + transient_err, + ) + raise logger.info( "Auxiliary %s (async): transient transport error; retrying " "once on the same provider before fallback: %s", diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 8d82bdeb573..ac8cddd83c0 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -2492,6 +2492,144 @@ class TestTransientTransportRetry: assert primary.chat.completions.create.call_count == 2 assert fb_client.chat.completions.create.call_count == 1 + def test_compression_skips_same_provider_retry_on_timeout(self): + """A timeout on the critical compression path must NOT retry the same + provider (that doubles the user-visible stall, issue #54465) — it + falls straight through to the fallback chain instead. + """ + class _Timeout(Exception): + pass + _Timeout.__name__ = "APITimeoutError" + + primary = MagicMock() + primary.base_url = "https://openrouter.ai/api/v1" + primary.chat.completions.create.side_effect = _Timeout("Request timed out.") + + fb_client = MagicMock() + fb_client.base_url = "https://api.openai.com/v1" + fb_client.chat.completions.create.return_value = {"fallback": True} + + p1, p2, p3 = self._patches(primary) + with ( + p1, p2, p3, + patch( + "agent.auxiliary_client._try_configured_fallback_chain", + return_value=(None, None, ""), + ), + patch( + "agent.auxiliary_client._try_main_agent_model_fallback", + return_value=(fb_client, "fb-model", "openai"), + ), + ): + result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}]) + assert result == {"fallback": True} + # Primary tried ONCE only — no same-provider timeout retry — then fallback. + assert primary.chat.completions.create.call_count == 1 + assert fb_client.chat.completions.create.call_count == 1 + + def test_non_compression_still_retries_same_provider_on_timeout(self): + """The timeout skip is scoped to compression only; other auxiliary + tasks keep the single same-provider transient retry. + """ + class _Timeout(Exception): + pass + _Timeout.__name__ = "APITimeoutError" + + client = MagicMock() + client.base_url = "https://openrouter.ai/api/v1" + client.chat.completions.create.side_effect = [ + _Timeout("Request timed out."), + {"ok": True}, + ] + p1, p2, p3 = self._patches(client) + with p1, p2, p3: + result = call_llm(task="title_generation", messages=[{"role": "user", "content": "hi"}]) + assert result == {"ok": True} + assert client.chat.completions.create.call_count == 2 + + def test_compression_still_retries_streaming_close_on_timeout_path(self): + """A fast streaming-close (not a full-budget timeout) still retries + same-provider even for compression — only timeouts are skipped. + """ + client = MagicMock() + client.base_url = "https://openrouter.ai/api/v1" + client.chat.completions.create.side_effect = [ + Exception( + "peer closed connection without sending complete message body " + "(incomplete chunked read)" + ), + {"ok": True}, + ] + p1, p2, p3 = self._patches(client) + with p1, p2, p3: + result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}]) + assert result == {"ok": True} + assert client.chat.completions.create.call_count == 2 + + +class TestAuxClientNoSdkRetries: + """Auxiliary OpenAI clients are constructed with SDK-internal retries + disabled so Hermes owns the retry/timeout budget (issue #54465). The SDK + default (max_retries=2 → 3 attempts) silently triples the effective wall + time of every aux call against a slow/hung endpoint. + """ + + def test_sync_client_disables_sdk_retries(self): + from agent import auxiliary_client as ac + captured = {} + + class _FakeOpenAI: + def __init__(self, **kwargs): + captured.update(kwargs) + + with patch.object(ac, "OpenAI", _FakeOpenAI), \ + patch.object(ac, "_openai_http_client_kwargs", return_value={}): + ac._create_openai_client(api_key="k", base_url="https://x/v1") + assert captured.get("max_retries") == 0 + + def test_explicit_max_retries_override_wins(self): + from agent import auxiliary_client as ac + captured = {} + + class _FakeOpenAI: + def __init__(self, **kwargs): + captured.update(kwargs) + + with patch.object(ac, "OpenAI", _FakeOpenAI), \ + patch.object(ac, "_openai_http_client_kwargs", return_value={}): + ac._create_openai_client(api_key="k", base_url="https://x/v1", max_retries=5) + assert captured.get("max_retries") == 5 + + +class TestIsTimeoutError: + """_is_timeout_error distinguishes a full-budget timeout from a fast + connection drop.""" + + def test_timed_out_string(self): + from agent.auxiliary_client import _is_timeout_error + assert _is_timeout_error(Exception("Request timed out.")) is True + + def test_timeout_typename(self): + from agent.auxiliary_client import _is_timeout_error + + class ReadTimeout(Exception): + pass + + assert _is_timeout_error(ReadTimeout("slow")) is True + + def test_streaming_close_is_not_timeout(self): + from agent.auxiliary_client import _is_timeout_error + err = Exception("peer closed connection (incomplete chunked read)") + assert _is_timeout_error(err) is False + + def test_5xx_is_not_timeout(self): + from agent.auxiliary_client import _is_timeout_error + + class _Err503(Exception): + status_code = 503 + + assert _is_timeout_error(_Err503("upstream")) is False + class TestIsConnectionError: """Tests for _is_connection_error detection."""