From c11ab6f64df6754c68421e3c99568d1259ea63aa Mon Sep 17 00:00:00 2001 From: Teknium Date: Sun, 19 Apr 2026 11:10:47 -0700 Subject: [PATCH] feat(providers): enforce request_timeout_seconds on OpenAI-wire primary calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live test with timeout_seconds: 0.5 on claude-sonnet-4.6 proved the initial wiring was insufficient: run_agent.py was overriding the client-level timeout on every call via hardcoded per-request kwargs. Root cause: run_agent.py had two sites that pass an explicit timeout= kwarg into chat.completions.create() — api_kwargs['timeout'] at line 7075 (HERMES_API_TIMEOUT=1800s default) and the streaming path's _httpx.Timeout(..., read=HERMES_STREAM_READ_TIMEOUT=120s, ...) at line 5760. Both override the per-provider config value the client was constructed with, so a 0.5s config timeout would silently not enforce. This commit: - Adds AIAgent._resolved_api_call_timeout() — config > HERMES_API_TIMEOUT env > 1800s default. - Uses it for the non-streaming api_kwargs['timeout'] field. - Uses it for the streaming path's httpx.Timeout(connect, read, write, pool) so both connect and read respect the configured value when set. Local-provider auto-bump (Ollama/vLLM cold-start) only applies when no explicit config value is set. - New test: test_resolved_api_call_timeout_priority covers all three precedence cases (config, env, default). Live verified: 0.5s config on claude-sonnet-4.6 now triggers APITimeoutError at ~3s per retry, exhausts 3 retries in ~15s total (was: 29-47s success with timeout ignored). Positive case (60s config + gpt-4o-mini) still succeeds at 1.3s. 
--- cli-config.yaml.example | 8 ++- run_agent.py | 58 +++++++++++++++++----- tests/hermes_cli/test_timeouts.py | 63 ++++++++++++++++++++++++ website/docs/user-guide/configuration.md | 2 +- 4 files changed, 115 insertions(+), 16 deletions(-) diff --git a/cli-config.yaml.example b/cli-config.yaml.example index a7d1e9935..34a3fc7e5 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -69,8 +69,12 @@ model: # Use this for per-provider request timeouts and per-model exceptions. # Applies to the primary turn client on every api_mode (OpenAI-wire, native # Anthropic, and Anthropic-compatible providers), the fallback chain, and -# client rebuilds during credential rotation. Leaving these unset keeps the -# SDK defaults (OpenAI ≈ 600s, native Anthropic 900s). +# client rebuilds during credential rotation. For OpenAI-wire chat +# completions (streaming and non-streaming) the configured value is also +# used as the per-request ``timeout=`` kwarg so it wins over the legacy +# HERMES_API_TIMEOUT env var (which still applies when no config is set). +# Leaving these unset keeps the legacy defaults (HERMES_API_TIMEOUT=1800s, +# native Anthropic 900s). # # providers: # ollama-local: diff --git a/run_agent.py b/run_agent.py index ec5922f98..44d55d298 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2102,6 +2102,26 @@ class AIAgent: url = (base_url or self._base_url_lower).lower() return "api.openai.com" in url and "openrouter" not in url + def _resolved_api_call_timeout(self) -> float: + """Resolve the effective per-call request timeout in seconds. + + Priority: + 1. ``providers..models..timeout_seconds`` (per-model override) + 2. ``providers..request_timeout_seconds`` (provider-wide) + 3. ``HERMES_API_TIMEOUT`` env var (legacy escape hatch) + 4. 1800.0s default + + Used by OpenAI-wire chat completions (streaming and non-streaming) so + the per-provider config knob wins over the 1800s default. 
Without this + helper, the hardcoded ``HERMES_API_TIMEOUT`` fallback would always be + passed as a per-call ``timeout=`` kwarg, overriding the client-level + timeout the AIAgent.__init__ path configured. + """ + cfg = get_provider_request_timeout(self.provider, self.model) + if cfg is not None: + return cfg + return float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + def _is_openrouter_url(self) -> bool: """Return True when the base URL targets OpenRouter.""" return "openrouter" in self._base_url_lower @@ -5754,18 +5774,30 @@ class AIAgent: def _call_chat_completions(): """Stream a chat completions response.""" import httpx as _httpx - _base_timeout = float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) - _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) - # Local providers (Ollama, llama.cpp, vLLM) can take minutes for - # prefill on large contexts before producing the first token. - # Auto-increase the httpx read timeout unless the user explicitly - # overrode HERMES_STREAM_READ_TIMEOUT. - if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url): - _stream_read_timeout = _base_timeout - logger.debug( - "Local provider detected (%s) — stream read timeout raised to %.0fs", - self.base_url, _stream_read_timeout, - ) + # Per-provider / per-model request_timeout_seconds (from config.yaml) + # wins over the HERMES_API_TIMEOUT env default if the user set it. + _provider_timeout_cfg = get_provider_request_timeout(self.provider, self.model) + _base_timeout = ( + _provider_timeout_cfg + if _provider_timeout_cfg is not None + else float(os.getenv("HERMES_API_TIMEOUT", 1800.0)) + ) + # Read timeout: config wins here too. Otherwise use + # HERMES_STREAM_READ_TIMEOUT (default 120s) for cloud providers. 
+ if _provider_timeout_cfg is not None: + _stream_read_timeout = _provider_timeout_cfg + else: + _stream_read_timeout = float(os.getenv("HERMES_STREAM_READ_TIMEOUT", 120.0)) + # Local providers (Ollama, llama.cpp, vLLM) can take minutes for + # prefill on large contexts before producing the first token. + # Auto-increase the httpx read timeout unless the user explicitly + # overrode HERMES_STREAM_READ_TIMEOUT. + if _stream_read_timeout == 120.0 and self.base_url and is_local_endpoint(self.base_url): + _stream_read_timeout = _base_timeout + logger.debug( + "Local provider detected (%s) — stream read timeout raised to %.0fs", + self.base_url, _stream_read_timeout, + ) stream_kwargs = { **api_kwargs, "stream": True, @@ -7081,7 +7113,7 @@ class AIAgent: api_kwargs = { "model": self.model, "messages": sanitized_messages, - "timeout": float(os.getenv("HERMES_API_TIMEOUT", 1800.0)), + "timeout": self._resolved_api_call_timeout(), } try: from agent.auxiliary_client import _fixed_temperature_for_model diff --git a/tests/hermes_cli/test_timeouts.py b/tests/hermes_cli/test_timeouts.py index e551f0c6c..da8f2a4c2 100644 --- a/tests/hermes_cli/test_timeouts.py +++ b/tests/hermes_cli/test_timeouts.py @@ -95,3 +95,66 @@ def test_anthropic_adapter_honors_timeout_kwarg(): # Connect timeout always stays at 10s regardless assert c_default.timeout.connect == 10.0 assert c_custom.timeout.connect == 10.0 + + +def test_resolved_api_call_timeout_priority(monkeypatch, tmp_path): + """AIAgent._resolved_api_call_timeout() honors config > env > default priority.""" + # Isolate HERMES_HOME + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + (tmp_path / ".env").write_text("", encoding="utf-8") + + # Case A: config wins over env var + _write_config(tmp_path, """\ + providers: + openrouter: + request_timeout_seconds: 77 + models: + openai/gpt-4o-mini: + timeout_seconds: 42 + """) + monkeypatch.setenv("HERMES_API_TIMEOUT", "999") + + from run_agent import AIAgent + agent = AIAgent( + 
model="openai/gpt-4o-mini", + provider="openrouter", + api_key="sk-dummy", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + platform="cli", + ) + # Per-model override wins + assert agent._resolved_api_call_timeout() == 42.0 + + # Provider-level (different model, no per-model override) + agent.model = "some/other-model" + assert agent._resolved_api_call_timeout() == 77.0 + + # Case B: no config → env wins + _write_config(tmp_path, "") + # Clear the cached config load + import importlib + from hermes_cli import config as cfg_mod + importlib.reload(cfg_mod) + from hermes_cli import timeouts as to_mod + importlib.reload(to_mod) + import run_agent as ra_mod + importlib.reload(ra_mod) + + agent2 = ra_mod.AIAgent( + model="some/model", + provider="openrouter", + api_key="sk-dummy", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + platform="cli", + ) + assert agent2._resolved_api_call_timeout() == 999.0 + + # Case C: no config, no env → 1800.0 default + monkeypatch.delenv("HERMES_API_TIMEOUT", raising=False) + assert agent2._resolved_api_call_timeout() == 1800.0 diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index ebba920dc..007794554 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -75,7 +75,7 @@ For AI provider setup (OpenRouter, Anthropic, Copilot, custom endpoints, self-ho ### Provider Request Timeouts -You can set `providers..request_timeout_seconds` for a provider-wide timeout, plus `providers..models..timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, and rebuilds after credential rotation. Leaving these unset keeps SDK defaults (OpenAI ≈ 600s, native Anthropic 900s). 
See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example). +You can set `providers.<provider>.request_timeout_seconds` for a provider-wide timeout, plus `providers.<provider>.models.<model>.timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, rebuilds after credential rotation, and (for OpenAI-wire) the per-request timeout kwarg — so the configured value wins over the legacy `HERMES_API_TIMEOUT` env var. Leaving these unset keeps legacy defaults (`HERMES_API_TIMEOUT=1800`s, native Anthropic 900s). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example). ## Terminal Backend Configuration