diff --git a/run_agent.py b/run_agent.py index 9a684d17f3..3e7ddc6870 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4433,7 +4433,17 @@ class AIAgent: """Stream a chat completions response.""" import httpx as _httpx _base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) - _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 60.0)) + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + # Local providers (Ollama, llama.cpp, vLLM) can take minutes for + # prefill on large contexts before producing the first token. + # Auto-increase the httpx read timeout unless the user explicitly + # overrode HERMES_STREAM_READ_TIMEOUT. + if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url): + _stream_read_timeout = _base_timeout + logger.debug( + "Local provider detected (%s) — stream read timeout raised to %.0fs", + self.base_url, _stream_read_timeout, + ) stream_kwargs = { **api_kwargs, "stream": True, diff --git a/tests/agent/test_local_stream_timeout.py b/tests/agent/test_local_stream_timeout.py new file mode 100644 index 0000000000..929f2e3c84 --- /dev/null +++ b/tests/agent/test_local_stream_timeout.py @@ -0,0 +1,70 @@ +"""Tests for local provider stream read timeout auto-detection. + +When a local LLM provider is detected (Ollama, llama.cpp, vLLM, etc.), +the httpx stream read timeout should be automatically increased from the +default 120s to HERMES_API_TIMEOUT (1800s) to avoid premature connection +kills during long prefill phases. 
+""" + +import os +import pytest +from unittest.mock import patch + +from agent.model_metadata import is_local_endpoint + + +class TestLocalStreamReadTimeout: + """Verify stream read timeout auto-detection logic.""" + + @pytest.mark.parametrize("base_url", [ + "http://localhost:11434", + "http://127.0.0.1:8080", + "http://0.0.0.0:5000", + "http://192.168.1.100:8000", + "http://10.0.0.5:1234", + ]) + def test_local_endpoint_bumps_read_timeout(self, base_url): + """Local endpoint + default timeout -> bumps to base_timeout.""" + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("HERMES_STREAM_READ_TIMEOUT", None) + _base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + if _stream_read_timeout == 120.0 and base_url and is_local_endpoint(base_url): + _stream_read_timeout = _base_timeout + assert _stream_read_timeout == 1800.0 + + def test_user_override_respected_for_local(self): + """User sets HERMES_STREAM_READ_TIMEOUT -> keep their value even for local.""" + with patch.dict(os.environ, {"HERMES_STREAM_READ_TIMEOUT": "300"}, clear=False): + _base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + base_url = "http://localhost:11434" + if _stream_read_timeout == 120.0 and base_url and is_local_endpoint(base_url): + _stream_read_timeout = _base_timeout + assert _stream_read_timeout == 300.0 + + @pytest.mark.parametrize("base_url", [ + "https://api.openai.com", + "https://openrouter.ai/api", + "https://api.anthropic.com", + ]) + def test_remote_endpoint_keeps_default(self, base_url): + """Remote endpoint -> keep 120s default.""" + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("HERMES_STREAM_READ_TIMEOUT", None) + _base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + if 
_stream_read_timeout == 120.0 and base_url and is_local_endpoint(base_url): + _stream_read_timeout = _base_timeout + assert _stream_read_timeout == 120.0 + + def test_empty_base_url_keeps_default(self): + """No base_url set -> keep 120s default.""" + with patch.dict(os.environ, {}, clear=False): + os.environ.pop("HERMES_STREAM_READ_TIMEOUT", None) + _base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + base_url = "" + if _stream_read_timeout == 120.0 and base_url and is_local_endpoint(base_url): + _stream_read_timeout = _base_timeout + assert _stream_read_timeout == 120.0 diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index 0d5823bf6c..f881074780 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -278,6 +278,8 @@ For cloud sandbox backends, persistence is filesystem-oriented. `TERMINAL_LIFETI | `HERMES_HUMAN_DELAY_MAX_MS` | Custom delay range maximum (ms) | | `HERMES_QUIET` | Suppress non-essential output (`true`/`false`) | | `HERMES_API_TIMEOUT` | LLM API call timeout in seconds (default: `1800`) | +| `HERMES_STREAM_READ_TIMEOUT` | Streaming socket read timeout in seconds (default: `120`). Auto-increased to `HERMES_API_TIMEOUT` for local providers. Increase if local LLMs time out during long code generation. | +| `HERMES_STREAM_STALE_TIMEOUT` | Stale stream detection timeout in seconds (default: `180`). Auto-disabled for local providers. Triggers connection kill if no chunks arrive within this window. 
| | `HERMES_EXEC_ASK` | Enable execution approval prompts in gateway mode (`true`/`false`) | | `HERMES_ENABLE_PROJECT_PLUGINS` | Enable auto-discovery of repo-local plugins from `./.hermes/plugins/` (`true`/`false`, default: `false`) | | `HERMES_BACKGROUND_NOTIFICATIONS` | Background process notification mode in gateway: `all` (default), `result`, `error`, `off` |