fix(config): add stale timeout settings

2026-04-25 00:51:20 +00:00 · 2026-04-19 13:40:09 -06:00 · 2026-04-19 13:40:09 -06:00 · 03e3c22e86
commit 03e3c22e86
parent 440764e013
6 changed files with 267 additions and 31 deletions
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@ -66,15 +66,18 @@ model:
 # max_tokens: 8192
 # Named provider overrides (optional)
-# Use this for per-provider request timeouts and per-model exceptions.
+# Use this for per-provider request timeouts, non-stream stale timeouts,
 # and per-model exceptions.
 # Applies to the primary turn client on every api_mode (OpenAI-wire, native
 # Anthropic, and Anthropic-compatible providers), the fallback chain, and
 # client rebuilds during credential rotation.  For OpenAI-wire chat
 # completions (streaming and non-streaming) the configured value is also
 # used as the per-request ``timeout=`` kwarg so it wins over the legacy
 # HERMES_API_TIMEOUT env var (which still applies when no config is set).
-# Leaving these unset keeps the legacy defaults (HERMES_API_TIMEOUT=1800s,
+# ``stale_timeout_seconds`` controls the non-streaming stale-call detector and
-# native Anthropic 900s).
+# wins over the legacy HERMES_API_CALL_STALE_TIMEOUT env var. Leaving these
 # unset keeps the legacy defaults (HERMES_API_TIMEOUT=1800s,
 # HERMES_API_CALL_STALE_TIMEOUT=300s, native Anthropic 900s).
 #
 # Not currently wired for AWS Bedrock (bedrock_converse + AnthropicBedrock
 # SDK paths) — those use boto3 with its own timeout configuration.
@ -82,11 +85,16 @@ model:
 # providers:
 #   ollama-local:
 #     request_timeout_seconds: 300   # Longer timeout for local cold-starts
 #     stale_timeout_seconds: 900     # Explicitly re-enable stale detection on local endpoints
 #   anthropic:
 #     request_timeout_seconds: 30    # Fast-fail cloud requests
 #     models:
 #       claude-opus-4.6:
 #         timeout_seconds: 600       # Longer timeout for extended-thinking Opus calls
 #   openai-codex:
 #     models:
 #       gpt-5.4:
 #         stale_timeout_seconds: 1800  # Longer non-stream stale timeout for slow large-context turns
 # =============================================================================
 # OpenRouter Provider Routing (only applies when using OpenRouter)
--- a/hermes_cli/timeouts.py
+++ b/hermes_cli/timeouts.py
@ -31,12 +31,52 @@ def get_provider_request_timeout(
    if not isinstance(provider_config, dict):
        return None
-    if model:
+    model_config = _get_model_config(provider_config, model)
-        models = provider_config.get("models", {})
+    if model_config is not None:
-        model_config = models.get(model, {}) if isinstance(models, dict) else {}
+        timeout = _coerce_timeout(model_config.get("timeout_seconds"))
-        if isinstance(model_config, dict):
+        if timeout is not None:
-            timeout = _coerce_timeout(model_config.get("timeout_seconds"))
+            return timeout
            if timeout is not None:
                return timeout
    return _coerce_timeout(provider_config.get("request_timeout_seconds"))
 def get_provider_stale_timeout(
    provider_id: str, model: str | None = None
 ) -> float | None:
    """Return a configured non-stream stale timeout in seconds, if any."""
    if not provider_id:
        return None
    try:
        from hermes_cli.config import load_config
    except ImportError:
        return None
    config = load_config()
    providers = config.get("providers", {}) if isinstance(config, dict) else {}
    provider_config = (
        providers.get(provider_id, {}) if isinstance(providers, dict) else {}
    )
    if not isinstance(provider_config, dict):
        return None
    model_config = _get_model_config(provider_config, model)
    if model_config is not None:
        timeout = _coerce_timeout(model_config.get("stale_timeout_seconds"))
        if timeout is not None:
            return timeout
    return _coerce_timeout(provider_config.get("stale_timeout_seconds"))
 def _get_model_config(
    provider_config: dict[str, object], model: str | None
 ) -> dict[str, object] | None:
    if not model:
        return None
    models = provider_config.get("models", {})
    model_config = models.get(model, {}) if isinstance(models, dict) else {}
    if isinstance(model_config, dict):
        return model_config
    return None
--- a/run_agent.py
+++ b/run_agent.py
@ -48,7 +48,10 @@ from hermes_constants import get_hermes_home
 # Load .env from ~/.hermes/.env first, then project root as dev fallback.
 # User-managed env files should override stale shell exports on restart.
 from hermes_cli.env_loader import load_hermes_dotenv
-from hermes_cli.timeouts import get_provider_request_timeout
+from hermes_cli.timeouts import (
    get_provider_request_timeout,
    get_provider_stale_timeout,
 )
 _hermes_home = get_hermes_home()
 _project_env = Path(__file__).parent / '.env'
@ -2158,6 +2161,44 @@ class AIAgent:
            return cfg
        return float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
    def _resolved_api_call_stale_timeout_base(self) -> tuple[float, bool]:
        """Resolve the base non-stream stale timeout and whether it is implicit.
        Priority:
          1. ``providers.<id>.models.<model>.stale_timeout_seconds``
          2. ``providers.<id>.stale_timeout_seconds``
          3. ``HERMES_API_CALL_STALE_TIMEOUT`` env var (skipped when the value
             is blank or cannot be parsed as a number)
          4. 300.0s default
        Returns ``(timeout_seconds, uses_implicit_default)`` so the caller can
        preserve legacy behaviors that only apply when the user has *not*
        explicitly configured a stale timeout, such as auto-disabling the
        detector for local endpoints.
        """
        cfg = get_provider_stale_timeout(self.provider, self.model)
        if cfg is not None:
            return cfg, False
        env_timeout = os.getenv("HERMES_API_CALL_STALE_TIMEOUT")
        if env_timeout is not None:
            try:
                return float(env_timeout), False
            except ValueError:
                # A blank or malformed value (e.g. ``VAR=`` in a .env file)
                # used to raise here and abort the API call.  Treat it like
                # an unset variable and fall through to the implicit default.
                pass
        return 300.0, True
    def _compute_non_stream_stale_timeout(self, messages: list[dict[str, Any]]) -> float:
        """Compute the effective non-stream stale timeout for this request."""
        stale_base, uses_implicit_default = self._resolved_api_call_stale_timeout_base()
        base_url = getattr(self, "_base_url", None) or self.base_url or ""
        if uses_implicit_default and base_url and is_local_endpoint(base_url):
            return float("inf")
        est_tokens = sum(len(str(v)) for v in messages) // 4
        if est_tokens > 100_000:
            return max(stale_base, 600.0)
        if est_tokens > 50_000:
            return max(stale_base, 450.0)
        return stale_base
    def _is_openrouter_url(self) -> bool:
        """Return True when the base URL targets OpenRouter."""
        return "openrouter" in self._base_url_lower
@ -5594,18 +5635,9 @@ class AIAgent:
        # httpx timeout (default 1800s) with zero feedback.  The stale
        # detector kills the connection early so the main retry loop can
        # apply richer recovery (credential rotation, provider fallback).
-        _stale_base = float(os.getenv("HERMES_API_CALL_STALE_TIMEOUT", 300.0))
+        _stale_timeout = self._compute_non_stream_stale_timeout(
-        _base_url = getattr(self, "_base_url", None) or ""
+            api_kwargs.get("messages", [])
-        if _stale_base == 300.0 and _base_url and is_local_endpoint(_base_url):
+        )
            _stale_timeout = float("inf")
        else:
            _est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
            if _est_tokens > 100_000:
                _stale_timeout = max(_stale_base, 600.0)
            elif _est_tokens > 50_000:
                _stale_timeout = max(_stale_base, 450.0)
            else:
                _stale_timeout = _stale_base
        _call_start = time.time()
        self._touch_activity("waiting for non-streaming API response")
--- a/tests/hermes_cli/test_timeouts.py
+++ b/tests/hermes_cli/test_timeouts.py
@ -2,7 +2,10 @@ from __future__ import annotations
 import textwrap
-from hermes_cli.timeouts import get_provider_request_timeout
+from hermes_cli.timeouts import (
    get_provider_request_timeout,
    get_provider_stale_timeout,
 )
 def _write_config(tmp_path, body: str) -> None:
@ -40,6 +43,37 @@ def test_provider_timeout_used_when_no_model_override(monkeypatch, tmp_path):
    assert get_provider_request_timeout("ollama-local", "qwen3:32b") == 300.0
 def test_model_stale_timeout_override_wins(monkeypatch, tmp_path):
    """A model-level stale timeout takes precedence over the provider-level one."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    config_body = """\
        providers:
          openai-codex:
            stale_timeout_seconds: 600
            models:
              gpt-5.4:
                stale_timeout_seconds: 1800
        """
    _write_config(tmp_path, config_body)
    resolved = get_provider_stale_timeout("openai-codex", "gpt-5.4")
    assert resolved == 1800.0
 def test_provider_stale_timeout_used_when_no_model_override(monkeypatch, tmp_path):
    """With no per-model entry, the provider-level stale timeout is returned."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    config_body = """\
        providers:
          openai-codex:
            stale_timeout_seconds: 900
        """
    _write_config(tmp_path, config_body)
    resolved = get_provider_stale_timeout("openai-codex", "gpt-5.4")
    assert resolved == 900.0
 def test_missing_timeout_returns_none(monkeypatch, tmp_path):
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    _write_config(
@ -78,6 +112,24 @@ def test_invalid_timeout_values_return_none(monkeypatch, tmp_path):
    assert get_provider_request_timeout("ollama-local") is None
 def test_invalid_stale_timeout_values_return_none(monkeypatch, tmp_path):
    """Non-numeric and negative stale timeouts are both ignored."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    config_body = """\
        providers:
          openai-codex:
            stale_timeout_seconds: "slow"
            models:
              gpt-5.4:
                stale_timeout_seconds: -1
        """
    _write_config(tmp_path, config_body)
    # gpt-5.4 has an invalid model override; gpt-5.5 has no override at all.
    # Both end up at the invalid provider-level value.
    with_override = get_provider_stale_timeout("openai-codex", "gpt-5.4")
    assert with_override is None
    without_override = get_provider_stale_timeout("openai-codex", "gpt-5.5")
    assert without_override is None
 def test_anthropic_adapter_honors_timeout_kwarg():
    """build_anthropic_client(timeout=X) overrides the 900s default read timeout."""
    pytest = __import__("pytest")
@ -158,3 +210,99 @@ def test_resolved_api_call_timeout_priority(monkeypatch, tmp_path):
    # Case C: no config, no env → 1800.0 default
    monkeypatch.delenv("HERMES_API_TIMEOUT", raising=False)
    assert agent2._resolved_api_call_timeout() == 1800.0
 def test_resolved_api_call_stale_timeout_priority(monkeypatch, tmp_path):
    """AIAgent stale timeout honors config > env > default priority."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    (tmp_path / ".env").write_text("", encoding="utf-8")
    # Phase 1: config has both a provider-level and a model-level value.
    _write_config(tmp_path, """\
        providers:
          openai-codex:
            stale_timeout_seconds: 600
            models:
              gpt-5.4:
                stale_timeout_seconds: 1800
        """)
    # The env var is set too, so the asserts below prove config wins over it.
    monkeypatch.setenv("HERMES_API_CALL_STALE_TIMEOUT", "999")
    from run_agent import AIAgent
    agent = AIAgent(
        model="gpt-5.4",
        provider="openai-codex",
        api_key="sk-dummy",
        base_url="https://chatgpt.com/backend-api/codex",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
        platform="cli",
    )
    # Model-level override is used; the second tuple item False means explicit.
    assert agent._resolved_api_call_stale_timeout_base() == (1800.0, False)
    # A model with no entry under ``models`` falls back to the provider value.
    agent.model = "gpt-5.5"
    assert agent._resolved_api_call_stale_timeout_base() == (600.0, False)
    # Phase 2: empty the config so only the env var and default remain.
    _write_config(tmp_path, "")
    # NOTE(review): the reloads presumably drop module-level state from the
    # first config; the order (config, timeouts, run_agent) is kept as-is.
    import importlib
    from hermes_cli import config as cfg_mod
    importlib.reload(cfg_mod)
    from hermes_cli import timeouts as to_mod
    importlib.reload(to_mod)
    import run_agent as ra_mod
    importlib.reload(ra_mod)
    agent2 = ra_mod.AIAgent(
        model="gpt-5.4",
        provider="openai-codex",
        api_key="sk-dummy",
        base_url="https://chatgpt.com/backend-api/codex",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
        platform="cli",
    )
    # With no config value, the env var is used and still counts as explicit.
    assert agent2._resolved_api_call_stale_timeout_base() == (999.0, False)
    # With neither config nor env, the 300s default is reported as implicit.
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
    assert agent2._resolved_api_call_stale_timeout_base() == (300.0, True)
 def test_default_non_stream_stale_timeout_auto_disables_for_local_endpoints(monkeypatch, tmp_path):
    """With nothing configured, a local endpoint gets an infinite stale timeout."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    env_file = tmp_path / ".env"
    env_file.write_text("", encoding="utf-8")
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
    from run_agent import AIAgent
    agent_kwargs = {
        "model": "qwen3:32b",
        "provider": "ollama-local",
        "api_key": "sk-dummy",
        "base_url": "http://127.0.0.1:11434/v1",
        "quiet_mode": True,
        "skip_context_files": True,
        "skip_memory": True,
        "platform": "cli",
    }
    agent = AIAgent(**agent_kwargs)
    effective = agent._compute_non_stream_stale_timeout([])
    assert effective == float("inf")
 def test_explicit_non_stream_stale_timeout_is_honored_for_local_endpoints(monkeypatch, tmp_path):
    """An env value equal to the default still counts as explicit on a local endpoint."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    env_file = tmp_path / ".env"
    env_file.write_text("", encoding="utf-8")
    monkeypatch.setenv("HERMES_API_CALL_STALE_TIMEOUT", "300")
    from run_agent import AIAgent
    agent_kwargs = {
        "model": "qwen3:32b",
        "provider": "ollama-local",
        "api_key": "sk-dummy",
        "base_url": "http://127.0.0.1:11434/v1",
        "quiet_mode": True,
        "skip_context_files": True,
        "skip_memory": True,
        "platform": "cli",
    }
    agent = AIAgent(**agent_kwargs)
    effective = agent._compute_non_stream_stale_timeout([])
    assert effective == 300.0
--- a/website/docs/reference/environment-variables.md
+++ b/website/docs/reference/environment-variables.md
@ -360,6 +360,7 @@ For cloud sandbox backends, persistence is filesystem-oriented. `TERMINAL_LIFETI
 | `HERMES_HUMAN_DELAY_MAX_MS` | Custom delay range maximum (ms) |
 | `HERMES_QUIET` | Suppress non-essential output (`true`/`false`) |
 | `HERMES_API_TIMEOUT` | LLM API call timeout in seconds (default: `1800`) |
 | `HERMES_API_CALL_STALE_TIMEOUT` | Non-streaming stale-call timeout in seconds (default: `300`). Auto-disabled for local providers when left unset. Also configurable via `providers.<id>.stale_timeout_seconds` or `providers.<id>.models.<model>.stale_timeout_seconds` in `config.yaml`. |
 | `HERMES_STREAM_READ_TIMEOUT` | Streaming socket read timeout in seconds (default: `120`). Auto-increased to `HERMES_API_TIMEOUT` for local providers. Increase if local LLMs time out during long code generation. |
 | `HERMES_STREAM_STALE_TIMEOUT` | Stale stream detection timeout in seconds (default: `180`). Auto-disabled for local providers. Triggers connection kill if no chunks arrive within this window. |
 | `HERMES_EXEC_ASK` | Enable execution approval prompts in gateway mode (`true`/`false`) |
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@ -73,9 +73,13 @@ Multiple references in a single value work: `url: "${HOST}:${PORT}"`. If a refer
 For AI provider setup (OpenRouter, Anthropic, Copilot, custom endpoints, self-hosted LLMs, fallback models, etc.), see [AI Providers](/docs/integrations/providers).
-### Provider Request Timeouts
+### Provider Timeouts
-You can set `providers.<id>.request_timeout_seconds` for a provider-wide timeout, plus `providers.<id>.models.<model>.timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, rebuilds after credential rotation, and (for OpenAI-wire) the per-request timeout kwarg — so the configured value wins over the legacy `HERMES_API_TIMEOUT` env var. Leaving these unset keeps legacy defaults (`HERMES_API_TIMEOUT=1800`s, native Anthropic 900s). Not currently wired for AWS Bedrock (both `bedrock_converse` and AnthropicBedrock SDK paths use boto3 with its own timeout configuration). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example).
+You can set `providers.<id>.request_timeout_seconds` for a provider-wide request timeout, plus `providers.<id>.models.<model>.timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, rebuilds after credential rotation, and (for OpenAI-wire) the per-request timeout kwarg — so the configured value wins over the legacy `HERMES_API_TIMEOUT` env var.
 You can also set `providers.<id>.stale_timeout_seconds` for the non-streaming stale-call detector, plus `providers.<id>.models.<model>.stale_timeout_seconds` for a model-specific override. This wins over the legacy `HERMES_API_CALL_STALE_TIMEOUT` env var.
 Leaving these unset keeps the legacy defaults (`HERMES_API_TIMEOUT=1800`s, `HERMES_API_CALL_STALE_TIMEOUT=300`s, native Anthropic 900s). Not currently wired for AWS Bedrock (both `bedrock_converse` and AnthropicBedrock SDK paths use boto3 with its own timeout configuration). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example).
 ## Terminal Backend Configuration
@ -554,20 +558,23 @@ Budget pressure is enabled by default. The agent sees warnings naturally as part
 When the iteration budget is fully exhausted, the CLI shows a notification to the user: `⚠ Iteration budget reached (90/90) — response may be incomplete`. If the budget runs out during active work, the agent generates a summary of what was accomplished before stopping.
-### Streaming Timeouts
+### API Timeouts
-The LLM streaming connection has two timeout layers. Both auto-adjust for local providers (localhost, LAN IPs) — no configuration needed for most setups.
+Hermes has separate timeout layers for streaming, plus a stale detector for non-streaming calls. The stale detectors auto-adjust for local providers only when you leave them at their implicit defaults.
-| Timeout | Default | Local providers | Env var |
+| Timeout | Default | Local providers | Config / env |
-|---------|---------|----------------|---------|
+|---------|---------|----------------|--------------|
 | Socket read timeout | 120s | Auto-raised to 1800s | `HERMES_STREAM_READ_TIMEOUT` |
 | Stale stream detection | 180s | Auto-disabled | `HERMES_STREAM_STALE_TIMEOUT` |
-| API call (non-streaming) | 1800s | Unchanged | `HERMES_API_TIMEOUT` |
+| Stale non-stream detection | 300s | Auto-disabled unless explicitly configured | `providers.<id>.stale_timeout_seconds`, `providers.<id>.models.<model>.stale_timeout_seconds`, or `HERMES_API_CALL_STALE_TIMEOUT` |
 | API call (non-streaming) | 1800s | Unchanged | `providers.<id>.request_timeout_seconds`, `providers.<id>.models.<model>.timeout_seconds`, or `HERMES_API_TIMEOUT` |
 The **socket read timeout** controls how long httpx waits for the next chunk of data from the provider. Local LLMs can take minutes for prefill on large contexts before producing the first token, so Hermes raises this to 30 minutes when it detects a local endpoint. If you explicitly set `HERMES_STREAM_READ_TIMEOUT`, that value is always used regardless of endpoint detection.
 The **stale stream detection** kills connections that receive SSE keep-alive pings but no actual content. This is disabled entirely for local providers since they don't send keep-alive pings during prefill.
 The **stale non-stream detection** kills non-streaming calls that produce no response for too long. By default Hermes disables this on local endpoints to avoid false positives during long prefills. If you explicitly set `providers.<id>.stale_timeout_seconds`, `providers.<id>.models.<model>.stale_timeout_seconds`, or `HERMES_API_CALL_STALE_TIMEOUT`, that explicit value is honored even on local endpoints.
 ## Context Pressure Warnings
 Separate from iteration budget pressure, context pressure tracks how close the conversation is to the **compaction threshold** — the point where context compression fires to summarize older messages. This helps both you and the agent understand when the conversation is getting long.