From c11ab6f64df6754c68421e3c99568d1259ea63aa Mon Sep 17 00:00:00 2001 From: Teknium Date: Sun, 19 Apr 2026 11:10:47 -0700 Subject: [PATCH] feat(providers): enforce request_timeout_seconds on OpenAI-wire primary calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live test with timeout_seconds: 0.5 on claude-sonnet-4.6 proved the initial wiring was insufficient: run_agent.py was overriding the client-level timeout on every call via hardcoded per-request kwargs. Root cause: run_agent.py had two sites that pass an explicit timeout= kwarg into chat.completions.create() — api_kwargs['timeout'] at line 7075 (HERMES_API_TIMEOUT=1800s default) and the streaming path's _httpx.Timeout(..., read=HERMES_STREAM_READ_TIMEOUT=120s, ...) at line 5760. Both override the per-provider config value the client was constructed with, so a 0.5s config timeout would silently not enforce. This commit: - Adds AIAgent._resolved_api_call_timeout() — config > HERMES_API_TIMEOUT env > 1800s default. - Uses it for the non-streaming api_kwargs['timeout'] field. - Uses it for the streaming path's httpx.Timeout(connect, read, write, pool) so both connect and read respect the configured value when set. Local-provider auto-bump (Ollama/vLLM cold-start) only applies when no explicit config value is set. - New test: test_resolved_api_call_timeout_priority covers all three precedence cases (config, env, default). Live verified: 0.5s config on claude-sonnet-4.6 now triggers APITimeoutError at ~3s per retry, exhausts 3 retries in ~15s total (was: 29-47s success with timeout ignored). Positive case (60s config + gpt-4o-mini) still succeeds at 1.3s. 
--- cli-config.yaml.example | 8 ++- run_agent.py | 58 +++++++++++++++++----- tests/hermes_cli/test_timeouts.py | 63 ++++++++++++++++++++++++ website/docs/user-guide/configuration.md | 2 +- 4 files changed, 115 insertions(+), 16 deletions(-) diff --git a/cli-config.yaml.example b/cli-config.yaml.example index a7d1e9935..34a3fc7e5 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -69,8 +69,12 @@ model: # Use this for per-provider request timeouts and per-model exceptions. # Applies to the primary turn client on every api_mode (OpenAI-wire, native # Anthropic, and Anthropic-compatible providers), the fallback chain, and -# client rebuilds during credential rotation. Leaving these unset keeps the -# SDK defaults (OpenAI ≈ 600s, native Anthropic 900s). +# client rebuilds during credential rotation. For OpenAI-wire chat +# completions (streaming and non-streaming) the configured value is also +# used as the per-request ``timeout=`` kwarg so it wins over the legacy +# HERMES_API_TIMEOUT env var (which still applies when no config is set). +# Leaving these unset keeps the legacy defaults (HERMES_API_TIMEOUT=1800s, +# native Anthropic 900s). # # providers: # ollama-local: diff --git a/run_agent.py b/run_agent.py index ec5922f98..44d55d298 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2102,6 +2102,26 @@ class AIAgent: url = (base_url or self._base_url_lower).lower() return "api.openai.com" in url and "openrouter" not in url + def _resolved_api_call_timeout(self) -> float: + """Resolve the effective per-call request timeout in seconds. + + Priority: + 1. ``providers..models..timeout_seconds`` (per-model override) + 2. ``providers..request_timeout_seconds`` (provider-wide) + 3. ``HERMES_API_TIMEOUT`` env var (legacy escape hatch) + 4. 1800.0s default + + Used by OpenAI-wire chat completions (streaming and non-streaming) so + the per-provider config knob wins over the 1800s default. 
Without this + helper, the hardcoded ``HERMES_API_TIMEOUT`` fallback would always be + passed as a per-call ``timeout=`` kwarg, overriding the client-level + timeout the AIAgent.__init__ path configured. + """ + cfg = get_provider_request_timeout(self.provider, self.model) + if cfg is not None: + return cfg + return float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + def _is_openrouter_url(self) -> bool: """Return True when the base URL targets OpenRouter.""" return "openrouter" in self._base_url_lower @@ -5754,18 +5774,30 @@ class AIAgent: def _call_chat_completions(): """Stream a chat completions response.""" import httpx as _httpx - _base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) - _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) - # Local providers (Ollama, llama.cpp, vLLM) can take minutes for - # prefill on large contexts before producing the first token. - # Auto-increase the httpx read timeout unless the user explicitly - # overrode HERMES_STREAM_READ_TIMEOUT. - if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url): - _stream_read_timeout = _base_timeout - logger.debug( - "Local provider detected (%s) — stream read timeout raised to %.0fs", - self.base_url, _stream_read_timeout, - ) + # Per-provider / per-model request_timeout_seconds (from config.yaml) + # wins over the HERMES_API_TIMEOUT env default if the user set it. + _provider_timeout_cfg = get_provider_request_timeout(self.provider, self.model) + _base_timeout = ( + _provider_timeout_cfg + if _provider_timeout_cfg is not None + else float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + ) + # Read timeout: config wins here too. Otherwise use + # HERMES_STREAM_READ_TIMEOUT (default 120s) for cloud providers. 
+ if _provider_timeout_cfg is not None: + _stream_read_timeout = _provider_timeout_cfg + else: + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + # Local providers (Ollama, llama.cpp, vLLM) can take minutes for + # prefill on large contexts before producing the first token. + # Auto-increase the httpx read timeout unless the user explicitly + # overrode HERMES_STREAM_READ_TIMEOUT. + if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url): + _stream_read_timeout = _base_timeout + logger.debug( + "Local provider detected (%s) — stream read timeout raised to %.0fs", + self.base_url, _stream_read_timeout, + ) stream_kwargs = { **api_kwargs, "stream": True, @@ -7081,7 +7113,7 @@ class AIAgent: api_kwargs = { "model": self.model, "messages": sanitized_messages, - "timeout": float(os.getenv("HERMES_API_TIMEOUT", 1800.0)), + "timeout": self._resolved_api_call_timeout(), } try: from agent.auxiliary_client import _fixed_temperature_for_model diff --git a/tests/hermes_cli/test_timeouts.py b/tests/hermes_cli/test_timeouts.py index e551f0c6c..da8f2a4c2 100644 --- a/tests/hermes_cli/test_timeouts.py +++ b/tests/hermes_cli/test_timeouts.py @@ -95,3 +95,66 @@ def test_anthropic_adapter_honors_timeout_kwarg(): # Connect timeout always stays at 10s regardless assert c_default.timeout.connect == 10.0 assert c_custom.timeout.connect == 10.0 + + +def test_resolved_api_call_timeout_priority(monkeypatch, tmp_path): + """AIAgent._resolved_api_call_timeout() honors config > env > default priority.""" + # Isolate HERMES_HOME + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + (tmp_path / ".env").write_text("", encoding="utf-8") + + # Case A: config wins over env var + _write_config(tmp_path, """\ + providers: + openrouter: + request_timeout_seconds: 77 + models: + openai/gpt-4o-mini: + timeout_seconds: 42 + """) + monkeypatch.setenv("HERMES_API_TIMEOUT", "999") + + from run_agent import AIAgent + agent = AIAgent( + 
model="openai/gpt-4o-mini", + provider="openrouter", + api_key="sk-dummy", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + platform="cli", + ) + # Per-model override wins + assert agent._resolved_api_call_timeout() == 42.0 + + # Provider-level (different model, no per-model override) + agent.model = "some/other-model" + assert agent._resolved_api_call_timeout() == 77.0 + + # Case B: no config → env wins + _write_config(tmp_path, "") + # Clear the cached config load + import importlib + from hermes_cli import config as cfg_mod + importlib.reload(cfg_mod) + from hermes_cli import timeouts as to_mod + importlib.reload(to_mod) + import run_agent as ra_mod + importlib.reload(ra_mod) + + agent2 = ra_mod.AIAgent( + model="some/model", + provider="openrouter", + api_key="sk-dummy", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + platform="cli", + ) + assert agent2._resolved_api_call_timeout() == 999.0 + + # Case C: no config, no env → 1800.0 default + monkeypatch.delenv("HERMES_API_TIMEOUT", raising=False) + assert agent2._resolved_api_call_timeout() == 1800.0 diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index ebba920dc..007794554 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -75,7 +75,7 @@ For AI provider setup (OpenRouter, Anthropic, Copilot, custom endpoints, self-ho ### Provider Request Timeouts -You can set `providers..request_timeout_seconds` for a provider-wide timeout, plus `providers..models..timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, and rebuilds after credential rotation. Leaving these unset keeps SDK defaults (OpenAI ≈ 600s, native Anthropic 900s). 
See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example). +You can set `providers.<provider>.request_timeout_seconds` for a provider-wide timeout, plus `providers.<provider>.models.<model>.timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, rebuilds after credential rotation, and (for OpenAI-wire) the per-request timeout kwarg — so the configured value wins over the legacy `HERMES_API_TIMEOUT` env var. Leaving these unset keeps legacy defaults (`HERMES_API_TIMEOUT=1800`s, native Anthropic 900s). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example). ## Terminal Backend Configuration