From 55d6a1636bb1f38b01b708582c527b91cc9fe578 Mon Sep 17 00:00:00 2001
From: teknium1 <127238744+teknium1@users.noreply.github.com>
Date: Sun, 17 May 2026 11:36:15 -0700
Subject: [PATCH] fix(agent): honor provider timeout config in streaming API
 calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes #25249 (and supersedes PR #25260) in spirit.

Two bugs in the streaming chat-completions path caused provider timeout
configuration to be silently ignored:

1. Hardcoded connect/pool timeout. The httpx.Timeout for streaming
   calls used hardcoded connect=30.0 and pool=30.0 regardless of the
   user's providers.<id>.request_timeout_seconds config. If the custom
   provider (e.g. Ollama) was unreachable, the call always waited
   exactly 30s before failing, ignoring any configured timeout.

   Fix: use min(_base_timeout, 60.0) for connect and pool when a
   provider timeout is configured, falling back to 30.0 otherwise.
   The 60s cap addresses review feedback (TCP handshake shouldn't
   wait the inference timeout — connect/pool cover the connection
   layer, not model latency).

2. Streaming stale-stream detector ignored provider config. The
   stale detector read only HERMES_STREAM_STALE_TIMEOUT (env default
   180s). The providers.<id>.stale_timeout_seconds key (correctly
   used in the non-streaming path) was never consulted.

   Fix: check get_provider_stale_timeout(provider, model) first,
   then fall back to the env var. Aligns the streaming path with
   the non-streaming path's priority chain (config > env > default).

Salvage shape diverged from PR #25260: the function moved to
agent/chat_completion_helpers.py and the contributor's two commits
(initial fix + 60s-cap review follow-up) are squashed into one final
commit applied at the new location.

Original diagnosis, fix shape, AND the 60s-cap review response from
@zccyman in PR #25260; credited via Co-authored-by.

Co-authored-by: zccyman <16263913+zccyman@users.noreply.github.com>
---
 agent/chat_completion_helpers.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py
index 1bf1ebc651e..e536db95eb1 100644
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -33,7 +33,7 @@ from types import SimpleNamespace
 from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urlparse, parse_qs, urlunparse
 
-from hermes_cli.timeouts import get_provider_request_timeout
+from hermes_cli.timeouts import get_provider_request_timeout, get_provider_stale_timeout
 from agent.error_classifier import classify_api_error, FailoverReason
 from agent.model_metadata import is_local_endpoint
 from agent.message_sanitization import (
@@ -1272,15 +1272,18 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                     "Local provider detected (%s) — stream read timeout raised to %.0fs",
                     agent.base_url, _stream_read_timeout,
                 )
+        # Cap connect/pool at 60s even when provider timeout is higher.
+        # connect/pool cover TCP handshake, not model inference.
+        _conn_cap = min(_base_timeout, 60.0) if _provider_timeout_cfg is not None else 30.0
         stream_kwargs = {
             **api_kwargs,
             "stream": True,
             "stream_options": {"include_usage": True},
             "timeout": _httpx.Timeout(
-                connect=30.0,
+                connect=_conn_cap,
                 read=_stream_read_timeout,
                 write=_base_timeout,
-                pool=30.0,
+                pool=_conn_cap,
             ),
         }
         request_client_holder["client"] = agent._create_request_openai_client(
@@ -1868,7 +1871,12 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
             if request_client is not None:
                 agent._close_request_openai_client(request_client, reason="stream_request_complete")
 
-    _stream_stale_timeout_base = float(os.getenv("HERMES_STREAM_STALE_TIMEOUT", 180.0))
+    # Provider-configured stale timeout takes priority over env default.
+    _cfg_stale = get_provider_stale_timeout(agent.provider, agent.model)
+    if _cfg_stale is not None:
+        _stream_stale_timeout_base = _cfg_stale
+    else:
+        _stream_stale_timeout_base = float(os.getenv("HERMES_STREAM_STALE_TIMEOUT", 180.0))
     # Local providers (Ollama, oMLX, llama-cpp) can take 300+ seconds
     # for prefill on large contexts.  Disable the stale detector unless
     # the user explicitly set HERMES_STREAM_STALE_TIMEOUT.