From ae4a884e8dfc5cccf7303f1270d3d913791ae960 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 8 Apr 2026 19:53:39 -0700 Subject: [PATCH] fix(agent): disable stale stream timeout for local providers (#6368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local inference providers (Ollama, oMLX, llama-cpp) can take 300+ seconds for prefill on large contexts. The 180s stale stream detector was killing these connections while the provider was still processing. Uses the existing is_local_endpoint() (proper URL parsing with RFC-1918, localhost, WSL detection) instead of ad-hoc substring matching. The stale timeout is only disabled when HERMES_STREAM_STALE_TIMEOUT resolves to its 180s default — any other explicitly configured value is honored. Note that an explicit value of exactly 180 is indistinguishable from the default and is therefore also treated as unset. Fixes #5889 --- run_agent.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/run_agent.py b/run_agent.py index e511f4088ea..10932b4bafa 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4728,18 +4728,25 @@ class AIAgent: self._close_request_openai_client(request_client, reason="stream_request_complete") _stream_stale_timeout_base = float(os.getenv("HERMES_STREAM_STALE_TIMEOUT", 180.0)) - # Scale the stale timeout for large contexts: slow models (like Opus) - # can legitimately think for minutes before producing the first token - # when the context is large. Without this, the stale detector kills - # healthy connections during the model's thinking phase, producing - # spurious RemoteProtocolError ("peer closed connection"). - _est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4 - if _est_tokens > 100_000: - _stream_stale_timeout = max(_stream_stale_timeout_base, 300.0) - elif _est_tokens > 50_000: - _stream_stale_timeout = max(_stream_stale_timeout_base, 240.0) + # Local providers (Ollama, oMLX, llama-cpp) can take 300+ seconds + # for prefill on large contexts. 
Disable the stale detector unless + # the user explicitly set HERMES_STREAM_STALE_TIMEOUT. + if _stream_stale_timeout_base == 180.0 and self.base_url and is_local_endpoint(self.base_url): + _stream_stale_timeout = float("inf") + logger.debug("Local provider detected (%s) — stale stream timeout disabled", self.base_url) else: - _stream_stale_timeout = _stream_stale_timeout_base + # Scale the stale timeout for large contexts: slow models (like Opus) + # can legitimately think for minutes before producing the first token + # when the context is large. Without this, the stale detector kills + # healthy connections during the model's thinking phase, producing + # spurious RemoteProtocolError ("peer closed connection"). + _est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4 + if _est_tokens > 100_000: + _stream_stale_timeout = max(_stream_stale_timeout_base, 300.0) + elif _est_tokens > 50_000: + _stream_stale_timeout = max(_stream_stale_timeout_base, 240.0) + else: + _stream_stale_timeout = _stream_stale_timeout_base t = threading.Thread(target=_call, daemon=True) t.start()