fix(auxiliary): stop SDK retries from multiplying compression stall (#54465) (#55544)

The auxiliary OpenAI clients were built without overriding the SDK's default max_retries=2, so every aux call silently made up to 3 attempts against a slow/hung endpoint — a 120s timeout could stall ~360s before Hermes saw a single failure. On the critical compression preflight path, Hermes then added its own same-provider timeout retry on top, roughly doubling the user-visible stall again before fallback. - Build both the sync (_create_openai_client) and async (_to_async_client) aux clients with max_retries=0 (setdefault, so explicit callers still override). Hermes already owns retry + provider/model fallback policy. - For task == compression, skip the same-provider transient retry on a full-budget timeout and fall straight through to fallback. Fast blips (streaming-close, 5xx) still retry, since those are cheap. - Add _is_timeout_error to distinguish a full-budget timeout from a fast connection drop. Addresses the retry-multiplication root cause of #54465 (the resume-wedge persistence half landed in #55499).
2026-07-01 12:02:05 +00:00 · 2026-06-30 02:54:08 -07:00 · 2026-06-30 02:54:08 -07:00 · c8376e0dc6
commit c8376e0dc6
parent e6f66bc0f0
2 changed files with 196 additions and 0 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -124,6 +124,15 @@ def _openai_http_client_kwargs(

 def _create_openai_client(*, api_key: str, base_url: str, **kwargs: Any) -> Any:
    kwargs = {**_openai_http_client_kwargs(base_url), **kwargs}
+    # Hermes owns auxiliary retry + provider/model fallback policy (the
+    # same-provider transient retry in call_llm plus the except-chain
+    # fallback). The OpenAI SDK's own default (max_retries=2 → up to 3
+    # attempts) silently multiplies the effective wall time of every aux call
+    # by 3× on a slow/hung endpoint, so a 120s timeout can stall ~360s before
+    # Hermes sees a single failure (issue #54465). Disable SDK-internal retries
+    # by default and let Hermes control the budget; explicit callers can still
+    # override via kwargs.
+    kwargs.setdefault("max_retries", 0)
    return OpenAI(api_key=api_key, base_url=base_url, **kwargs)


@ -2590,6 +2599,27 @@ def _is_rate_limit_error(exc: Exception) -> bool:
    return False


+def _is_timeout_error(exc: Exception) -> bool:
+    """Detect a request timeout — the full-budget stall, distinct from a fast
+    connection drop.
+
+    A timeout burns the entire configured ``timeout`` before surfacing, so a
+    same-provider retry on the critical compression path doubles the
+    user-visible wall time (issue #54465). A streaming-close / dropped
+    connection, by contrast, fails fast and is cheap to retry — those stay on
+    the retry path even for compression.
+    """
+    try:
+        from openai import APITimeoutError
+        if isinstance(exc, APITimeoutError):
+            return True
+    except ImportError:
+        pass
+    if "Timeout" in type(exc).__name__:
+        return True
+    return "timed out" in str(exc).lower()
+
+
 def _is_connection_error(exc: Exception) -> bool:
    """Detect connection/network errors that warrant provider fallback.

@ -3824,6 +3854,9 @@ def _to_async_client(sync_client, model: str, is_vision: bool = False):
        **_openai_http_client_kwargs(sync_base_url, async_mode=True),
        **async_kwargs,
    }
+    # See _create_openai_client: disable SDK-internal retries so Hermes owns
+    # the auxiliary retry/timeout budget (issue #54465).
+    async_kwargs.setdefault("max_retries", 0)
    return AsyncOpenAI(**async_kwargs), model


@ -5785,6 +5818,21 @@ def call_llm(
        except Exception as transient_err:
            if not _is_transient_transport_error(transient_err):
                raise
+            # Compression is on the critical preflight path: a user cannot
+            # continue or resume an oversized session until it compacts. A
+            # same-provider retry on a timeout means another full ``timeout``-
+            # long wall-clock block before the except-chain below can fall
+            # back — doubling the user-visible stall (issue #54465). Skip the
+            # same-provider retry for compression on a full-budget timeout and
+            # fall straight through to provider/model fallback; fast blips (a
+            # streaming-close or a 5xx) still retry, since those are cheap.
+            if task == "compression" and _is_timeout_error(transient_err):
+                logger.info(
+                    "Auxiliary compression: timeout on the critical path; "
+                    "skipping same-provider retry and falling back: %s",
+                    transient_err,
+                )
+                raise
            logger.info(
                "Auxiliary %s: transient transport error; retrying once on "
                "the same provider before fallback: %s",
@ -6310,6 +6358,16 @@ async def async_call_llm(
        except Exception as transient_err:
            if not _is_transient_transport_error(transient_err):
                raise
+            # See call_llm(): compression is on the critical preflight path,
+            # so skip the same-provider retry on a full-budget timeout and
+            # fall straight through to fallback (issue #54465).
+            if task == "compression" and _is_timeout_error(transient_err):
+                logger.info(
+                    "Auxiliary compression (async): timeout on the critical "
+                    "path; skipping same-provider retry and falling back: %s",
+                    transient_err,
+                )
+                raise
            logger.info(
                "Auxiliary %s (async): transient transport error; retrying "
                "once on the same provider before fallback: %s",
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@ -2492,6 +2492,144 @@ class TestTransientTransportRetry:
        assert primary.chat.completions.create.call_count == 2
        assert fb_client.chat.completions.create.call_count == 1

+    def test_compression_skips_same_provider_retry_on_timeout(self):
+        """A timeout on the critical compression path must NOT retry the same
+        provider (that doubles the user-visible stall, issue #54465) — it
+        falls straight through to the fallback chain instead.
+        """
+        class _Timeout(Exception):
+            pass
+        _Timeout.__name__ = "APITimeoutError"
+
+        primary = MagicMock()
+        primary.base_url = "https://openrouter.ai/api/v1"
+        primary.chat.completions.create.side_effect = _Timeout("Request timed out.")
+
+        fb_client = MagicMock()
+        fb_client.base_url = "https://api.openai.com/v1"
+        fb_client.chat.completions.create.return_value = {"fallback": True}
+
+        p1, p2, p3 = self._patches(primary)
+        with (
+            p1, p2, p3,
+            patch(
+                "agent.auxiliary_client._try_configured_fallback_chain",
+                return_value=(None, None, ""),
+            ),
+            patch(
+                "agent.auxiliary_client._try_main_agent_model_fallback",
+                return_value=(fb_client, "fb-model", "openai"),
+            ),
+        ):
+            result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
+        assert result == {"fallback": True}
+        # Primary tried ONCE only — no same-provider timeout retry — then fallback.
+        assert primary.chat.completions.create.call_count == 1
+        assert fb_client.chat.completions.create.call_count == 1
+
+    def test_non_compression_still_retries_same_provider_on_timeout(self):
+        """The timeout skip is scoped to compression only; other auxiliary
+        tasks keep the single same-provider transient retry.
+        """
+        class _Timeout(Exception):
+            pass
+        _Timeout.__name__ = "APITimeoutError"
+
+        client = MagicMock()
+        client.base_url = "https://openrouter.ai/api/v1"
+        client.chat.completions.create.side_effect = [
+            _Timeout("Request timed out."),
+            {"ok": True},
+        ]
+        p1, p2, p3 = self._patches(client)
+        with p1, p2, p3:
+            result = call_llm(task="title_generation", messages=[{"role": "user", "content": "hi"}])
+        assert result == {"ok": True}
+        assert client.chat.completions.create.call_count == 2
+
+    def test_compression_still_retries_streaming_close_on_timeout_path(self):
+        """A fast streaming-close (not a full-budget timeout) still retries
+        same-provider even for compression — only timeouts are skipped.
+        """
+        client = MagicMock()
+        client.base_url = "https://openrouter.ai/api/v1"
+        client.chat.completions.create.side_effect = [
+            Exception(
+                "peer closed connection without sending complete message body "
+                "(incomplete chunked read)"
+            ),
+            {"ok": True},
+        ]
+        p1, p2, p3 = self._patches(client)
+        with p1, p2, p3:
+            result = call_llm(task="compression", messages=[{"role": "user", "content": "hi"}])
+        assert result == {"ok": True}
+        assert client.chat.completions.create.call_count == 2
+
+
+class TestAuxClientNoSdkRetries:
+    """Auxiliary OpenAI clients are constructed with SDK-internal retries
+    disabled so Hermes owns the retry/timeout budget (issue #54465). The SDK
+    default (max_retries=2 → 3 attempts) silently triples the effective wall
+    time of every aux call against a slow/hung endpoint.
+    """
+
+    def test_sync_client_disables_sdk_retries(self):
+        from agent import auxiliary_client as ac
+        captured = {}
+
+        class _FakeOpenAI:
+            def __init__(self, **kwargs):
+                captured.update(kwargs)
+
+        with patch.object(ac, "OpenAI", _FakeOpenAI), \
+             patch.object(ac, "_openai_http_client_kwargs", return_value={}):
+            ac._create_openai_client(api_key="k", base_url="https://x/v1")
+        assert captured.get("max_retries") == 0
+
+    def test_explicit_max_retries_override_wins(self):
+        from agent import auxiliary_client as ac
+        captured = {}
+
+        class _FakeOpenAI:
+            def __init__(self, **kwargs):
+                captured.update(kwargs)
+
+        with patch.object(ac, "OpenAI", _FakeOpenAI), \
+             patch.object(ac, "_openai_http_client_kwargs", return_value={}):
+            ac._create_openai_client(api_key="k", base_url="https://x/v1", max_retries=5)
+        assert captured.get("max_retries") == 5
+
+
+class TestIsTimeoutError:
+    """_is_timeout_error distinguishes a full-budget timeout from a fast
+    connection drop."""
+
+    def test_timed_out_string(self):
+        from agent.auxiliary_client import _is_timeout_error
+        assert _is_timeout_error(Exception("Request timed out.")) is True
+
+    def test_timeout_typename(self):
+        from agent.auxiliary_client import _is_timeout_error
+
+        class ReadTimeout(Exception):
+            pass
+
+        assert _is_timeout_error(ReadTimeout("slow")) is True
+
+    def test_streaming_close_is_not_timeout(self):
+        from agent.auxiliary_client import _is_timeout_error
+        err = Exception("peer closed connection (incomplete chunked read)")
+        assert _is_timeout_error(err) is False
+
+    def test_5xx_is_not_timeout(self):
+        from agent.auxiliary_client import _is_timeout_error
+
+        class _Err503(Exception):
+            status_code = 503
+
+        assert _is_timeout_error(_Err503("upstream")) is False
+

 class TestIsConnectionError:
    """Tests for _is_connection_error detection."""