mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: increase stream read timeout default to 120s, auto-raise for local LLMs (#6967)
Raise the default httpx stream read timeout from 60s to 120s for all providers. Additionally, auto-detect local LLM endpoints (Ollama, llama.cpp, vLLM) and raise the read timeout to HERMES_API_TIMEOUT (1800s) since local models can take minutes for prefill on large contexts before producing the first token. The stale stream timeout already had this local auto-detection pattern; the httpx read timeout was missing it — causing a hard 60s wall that users couldn't find (HERMES_STREAM_READ_TIMEOUT was undocumented). Changes: - Default HERMES_STREAM_READ_TIMEOUT: 60s -> 120s - Auto-detect local endpoints -> raise to 1800s (user override respected) - Document HERMES_STREAM_READ_TIMEOUT and HERMES_STREAM_STALE_TIMEOUT - Add 10 parametrized tests Reported-by: Pavan Srinivas (@pavanandums)
This commit is contained in:
parent
bda9aa17cb
commit
f783986f5a
3 changed files with 83 additions and 1 deletions
12
run_agent.py
12
run_agent.py
|
|
@@ -4433,7 +4433,17 @@ class AIAgent:
|
|||
"""Stream a chat completions response."""
|
||||
import httpx as _httpx
|
||||
_base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
|
||||
_stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 60.0))
|
||||
_stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0))
|
||||
# Local providers (Ollama, llama.cpp, vLLM) can take minutes for
|
||||
# prefill on large contexts before producing the first token.
|
||||
# Auto-increase the httpx read timeout unless the user explicitly
|
||||
# overrode HERMES_STREAM_READ_TIMEOUT.
|
||||
if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url):
|
||||
_stream_read_timeout = _base_timeout
|
||||
logger.debug(
|
||||
"Local provider detected (%s) — stream read timeout raised to %.0fs",
|
||||
self.base_url, _stream_read_timeout,
|
||||
)
|
||||
stream_kwargs = {
|
||||
**api_kwargs,
|
||||
"stream": True,
|
||||
|
|
|
|||
70
tests/agent/test_local_stream_timeout.py
Normal file
70
tests/agent/test_local_stream_timeout.py
Normal file
|
|
@@ -0,0 +1,70 @@
|
|||
"""Tests for local provider stream read timeout auto-detection.
|
||||
|
||||
When a local LLM provider is detected (Ollama, llama.cpp, vLLM, etc.),
|
||||
the httpx stream read timeout should be automatically increased from the
|
||||
default 60s to HERMES_API_TIMEOUT (1800s) to avoid premature connection
|
||||
kills during long prefill phases.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
from agent.model_metadata import is_local_endpoint
|
||||
|
||||
|
||||
class TestLocalStreamReadTimeout:
    """Verify stream read timeout auto-detection logic.

    Each test mirrors the resolution logic used by the agent's streaming
    path: read ``HERMES_STREAM_READ_TIMEOUT`` (default 120s) and, when it
    is still the default and the endpoint is local, raise it to
    ``HERMES_API_TIMEOUT`` (default 1800s). A user-supplied override is
    always respected.
    """

    @staticmethod
    def _resolve_timeout(base_url):
        """Replicate the production stream-read-timeout resolution for *base_url*.

        Returns the effective read timeout in seconds. Note the short-circuit:
        ``is_local_endpoint`` is never called for an empty ``base_url``.
        """
        base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
        read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0))
        if read_timeout == 120.0 and base_url and is_local_endpoint(base_url):
            read_timeout = base_timeout
        return read_timeout

    @staticmethod
    def _clear_timeout_env():
        """Strip ambient timeout variables so the defaults under test apply.

        Must be called inside a ``patch.dict(os.environ, ...)`` context,
        which restores the popped keys on exit. Without popping
        HERMES_API_TIMEOUT too, a value set in the developer's shell would
        make the 1800.0 assertions fail spuriously.
        """
        os.environ.pop("HERMES_STREAM_READ_TIMEOUT", None)
        os.environ.pop("HERMES_API_TIMEOUT", None)

    @pytest.mark.parametrize("base_url", [
        "http://localhost:11434",
        "http://127.0.0.1:8080",
        "http://0.0.0.0:5000",
        "http://192.168.1.100:8000",
        "http://10.0.0.5:1234",
    ])
    def test_local_endpoint_bumps_read_timeout(self, base_url):
        """Local endpoint + default timeout -> bumps to base_timeout."""
        with patch.dict(os.environ, {}, clear=False):
            self._clear_timeout_env()
            assert self._resolve_timeout(base_url) == 1800.0

    def test_user_override_respected_for_local(self):
        """User sets HERMES_STREAM_READ_TIMEOUT -> keep their value even for local."""
        with patch.dict(os.environ, {"HERMES_STREAM_READ_TIMEOUT": "300"}, clear=False):
            os.environ.pop("HERMES_API_TIMEOUT", None)
            assert self._resolve_timeout("http://localhost:11434") == 300.0

    @pytest.mark.parametrize("base_url", [
        "https://api.openai.com",
        "https://openrouter.ai/api",
        "https://api.anthropic.com",
    ])
    def test_remote_endpoint_keeps_default(self, base_url):
        """Remote endpoint -> keep 120s default."""
        with patch.dict(os.environ, {}, clear=False):
            self._clear_timeout_env()
            assert self._resolve_timeout(base_url) == 120.0

    def test_empty_base_url_keeps_default(self):
        """No base_url set -> keep 120s default (local detection never runs)."""
        with patch.dict(os.environ, {}, clear=False):
            self._clear_timeout_env()
            assert self._resolve_timeout("") == 120.0
|
||||
|
|
@@ -278,6 +278,8 @@ For cloud sandbox backends, persistence is filesystem-oriented. `TERMINAL_LIFETIME
|
|||
| `HERMES_HUMAN_DELAY_MAX_MS` | Custom delay range maximum (ms) |
|
||||
| `HERMES_QUIET` | Suppress non-essential output (`true`/`false`) |
|
||||
| `HERMES_API_TIMEOUT` | LLM API call timeout in seconds (default: `1800`) |
|
||||
| `HERMES_STREAM_READ_TIMEOUT` | Streaming socket read timeout in seconds (default: `120`). Auto-increased to `HERMES_API_TIMEOUT` for local providers. Increase if local LLMs time out during long code generation. |
|
||||
| `HERMES_STREAM_STALE_TIMEOUT` | Stale stream detection timeout in seconds (default: `180`). Auto-disabled for local providers. Triggers connection kill if no chunks arrive within this window. |
|
||||
| `HERMES_EXEC_ASK` | Enable execution approval prompts in gateway mode (`true`/`false`) |
|
||||
| `HERMES_ENABLE_PROJECT_PLUGINS` | Enable auto-discovery of repo-local plugins from `./.hermes/plugins/` (`true`/`false`, default: `false`) |
|
||||
| `HERMES_BACKGROUND_NOTIFICATIONS` | Background process notification mode in gateway: `all` (default), `result`, `error`, `off` |
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue