mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(config): add stale timeout settings
This commit is contained in:
parent
440764e013
commit
03e3c22e86
6 changed files with 267 additions and 31 deletions
|
|
@ -66,15 +66,18 @@ model:
|
||||||
# max_tokens: 8192
|
# max_tokens: 8192
|
||||||
|
|
||||||
# Named provider overrides (optional)
|
# Named provider overrides (optional)
|
||||||
# Use this for per-provider request timeouts and per-model exceptions.
|
# Use this for per-provider request timeouts, non-stream stale timeouts,
|
||||||
|
# and per-model exceptions.
|
||||||
# Applies to the primary turn client on every api_mode (OpenAI-wire, native
|
# Applies to the primary turn client on every api_mode (OpenAI-wire, native
|
||||||
# Anthropic, and Anthropic-compatible providers), the fallback chain, and
|
# Anthropic, and Anthropic-compatible providers), the fallback chain, and
|
||||||
# client rebuilds during credential rotation. For OpenAI-wire chat
|
# client rebuilds during credential rotation. For OpenAI-wire chat
|
||||||
# completions (streaming and non-streaming) the configured value is also
|
# completions (streaming and non-streaming) the configured value is also
|
||||||
# used as the per-request ``timeout=`` kwarg so it wins over the legacy
|
# used as the per-request ``timeout=`` kwarg so it wins over the legacy
|
||||||
# HERMES_API_TIMEOUT env var (which still applies when no config is set).
|
# HERMES_API_TIMEOUT env var (which still applies when no config is set).
|
||||||
# Leaving these unset keeps the legacy defaults (HERMES_API_TIMEOUT=1800s,
|
# ``stale_timeout_seconds`` controls the non-streaming stale-call detector and
|
||||||
# native Anthropic 900s).
|
# wins over the legacy HERMES_API_CALL_STALE_TIMEOUT env var. Leaving these
|
||||||
|
# unset keeps the legacy defaults (HERMES_API_TIMEOUT=1800s,
|
||||||
|
# HERMES_API_CALL_STALE_TIMEOUT=300s, native Anthropic 900s).
|
||||||
#
|
#
|
||||||
# Not currently wired for AWS Bedrock (bedrock_converse + AnthropicBedrock
|
# Not currently wired for AWS Bedrock (bedrock_converse + AnthropicBedrock
|
||||||
# SDK paths) — those use boto3 with its own timeout configuration.
|
# SDK paths) — those use boto3 with its own timeout configuration.
|
||||||
|
|
@ -82,11 +85,16 @@ model:
|
||||||
# providers:
|
# providers:
|
||||||
# ollama-local:
|
# ollama-local:
|
||||||
# request_timeout_seconds: 300 # Longer timeout for local cold-starts
|
# request_timeout_seconds: 300 # Longer timeout for local cold-starts
|
||||||
|
# stale_timeout_seconds: 900 # Explicitly re-enable stale detection on local endpoints
|
||||||
# anthropic:
|
# anthropic:
|
||||||
# request_timeout_seconds: 30 # Fast-fail cloud requests
|
# request_timeout_seconds: 30 # Fast-fail cloud requests
|
||||||
# models:
|
# models:
|
||||||
# claude-opus-4.6:
|
# claude-opus-4.6:
|
||||||
# timeout_seconds: 600 # Longer timeout for extended-thinking Opus calls
|
# timeout_seconds: 600 # Longer timeout for extended-thinking Opus calls
|
||||||
|
# openai-codex:
|
||||||
|
# models:
|
||||||
|
# gpt-5.4:
|
||||||
|
# stale_timeout_seconds: 1800 # Longer non-stream stale timeout for slow large-context turns
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# OpenRouter Provider Routing (only applies when using OpenRouter)
|
# OpenRouter Provider Routing (only applies when using OpenRouter)
|
||||||
|
|
|
||||||
|
|
@ -31,12 +31,52 @@ def get_provider_request_timeout(
|
||||||
if not isinstance(provider_config, dict):
|
if not isinstance(provider_config, dict):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if model:
|
model_config = _get_model_config(provider_config, model)
|
||||||
models = provider_config.get("models", {})
|
if model_config is not None:
|
||||||
model_config = models.get(model, {}) if isinstance(models, dict) else {}
|
timeout = _coerce_timeout(model_config.get("timeout_seconds"))
|
||||||
if isinstance(model_config, dict):
|
if timeout is not None:
|
||||||
timeout = _coerce_timeout(model_config.get("timeout_seconds"))
|
return timeout
|
||||||
if timeout is not None:
|
|
||||||
return timeout
|
|
||||||
|
|
||||||
return _coerce_timeout(provider_config.get("request_timeout_seconds"))
|
return _coerce_timeout(provider_config.get("request_timeout_seconds"))
|
||||||
|
|
||||||
|
|
||||||
|
def get_provider_stale_timeout(
    provider_id: str, model: str | None = None
) -> float | None:
    """Return a configured non-stream stale timeout in seconds, if any.

    Lookup order:
      1. ``providers.<provider_id>.models.<model>.stale_timeout_seconds``
      2. ``providers.<provider_id>.stale_timeout_seconds``

    Args:
        provider_id: Key of the provider entry under ``providers``.
        model: Optional model name used to look for a per-model override.

    Returns:
        The first non-``None`` result of ``_coerce_timeout`` in the order
        above. ``None`` when ``provider_id`` is empty, when
        ``hermes_cli.config`` cannot be imported, when the provider entry is
        not a mapping, or when neither level produces a usable value.
    """
    if not provider_id:
        return None

    # Imported lazily: if the config module is unavailable, behave as though
    # nothing was configured instead of failing the caller.
    try:
        from hermes_cli.config import load_config
    except ImportError:
        return None

    config = load_config()
    providers = config.get("providers", {}) if isinstance(config, dict) else {}
    provider_config = (
        providers.get(provider_id, {}) if isinstance(providers, dict) else {}
    )
    if not isinstance(provider_config, dict):
        return None

    # A valid per-model value wins; an invalid or missing one falls through to
    # the provider-wide setting.
    model_config = _get_model_config(provider_config, model)
    if model_config is not None:
        timeout = _coerce_timeout(model_config.get("stale_timeout_seconds"))
        if timeout is not None:
            return timeout

    return _coerce_timeout(provider_config.get("stale_timeout_seconds"))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_model_config(
|
||||||
|
provider_config: dict[str, object], model: str | None
|
||||||
|
) -> dict[str, object] | None:
|
||||||
|
if not model:
|
||||||
|
return None
|
||||||
|
|
||||||
|
models = provider_config.get("models", {})
|
||||||
|
model_config = models.get(model, {}) if isinstance(models, dict) else {}
|
||||||
|
if isinstance(model_config, dict):
|
||||||
|
return model_config
|
||||||
|
return None
|
||||||
|
|
|
||||||
58
run_agent.py
58
run_agent.py
|
|
@ -48,7 +48,10 @@ from hermes_constants import get_hermes_home
|
||||||
# Load .env from ~/.hermes/.env first, then project root as dev fallback.
|
# Load .env from ~/.hermes/.env first, then project root as dev fallback.
|
||||||
# User-managed env files should override stale shell exports on restart.
|
# User-managed env files should override stale shell exports on restart.
|
||||||
from hermes_cli.env_loader import load_hermes_dotenv
|
from hermes_cli.env_loader import load_hermes_dotenv
|
||||||
from hermes_cli.timeouts import get_provider_request_timeout
|
from hermes_cli.timeouts import (
|
||||||
|
get_provider_request_timeout,
|
||||||
|
get_provider_stale_timeout,
|
||||||
|
)
|
||||||
|
|
||||||
_hermes_home = get_hermes_home()
|
_hermes_home = get_hermes_home()
|
||||||
_project_env = Path(__file__).parent / '.env'
|
_project_env = Path(__file__).parent / '.env'
|
||||||
|
|
@ -2158,6 +2161,44 @@ class AIAgent:
|
||||||
return cfg
|
return cfg
|
||||||
return float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
|
return float(os.getenv("HERMES_API_TIMEOUT", 1800.0))
|
||||||
|
|
||||||
|
def _resolved_api_call_stale_timeout_base(self) -> tuple[float, bool]:
    """Resolve the base non-stream stale timeout and whether it is implicit.

    Priority:
      1. ``providers.<id>.models.<model>.stale_timeout_seconds``
      2. ``providers.<id>.stale_timeout_seconds``
      3. ``HERMES_API_CALL_STALE_TIMEOUT`` env var
      4. 300.0s default

    Returns ``(timeout_seconds, uses_implicit_default)`` so the caller can
    preserve legacy behaviors that only apply when the user has *not*
    explicitly configured a stale timeout, such as auto-disabling the
    detector for local endpoints.

    An env value that is not a number, is NaN, or is not strictly positive
    is ignored and the implicit default is used, so a typo in the
    environment cannot raise ``ValueError`` in the request path.
    """
    cfg = get_provider_stale_timeout(self.provider, self.model)
    if cfg is not None:
        return cfg, False

    env_timeout = os.getenv("HERMES_API_CALL_STALE_TIMEOUT")
    if env_timeout is not None:
        try:
            parsed = float(env_timeout)
        except ValueError:
            parsed = None
        # ``parsed > 0`` is False for NaN as well as for zero and negatives;
        # ``inf`` passes and remains a valid explicit "never stale" setting.
        if parsed is not None and parsed > 0:
            return parsed, False

    return 300.0, True
|
||||||
|
|
||||||
|
def _compute_non_stream_stale_timeout(self, messages: list[dict[str, Any]]) -> float:
|
||||||
|
"""Compute the effective non-stream stale timeout for this request."""
|
||||||
|
stale_base, uses_implicit_default = self._resolved_api_call_stale_timeout_base()
|
||||||
|
base_url = getattr(self, "_base_url", None) or self.base_url or ""
|
||||||
|
if uses_implicit_default and base_url and is_local_endpoint(base_url):
|
||||||
|
return float("inf")
|
||||||
|
|
||||||
|
est_tokens = sum(len(str(v)) for v in messages) // 4
|
||||||
|
if est_tokens > 100_000:
|
||||||
|
return max(stale_base, 600.0)
|
||||||
|
if est_tokens > 50_000:
|
||||||
|
return max(stale_base, 450.0)
|
||||||
|
return stale_base
|
||||||
|
|
||||||
def _is_openrouter_url(self) -> bool:
|
def _is_openrouter_url(self) -> bool:
|
||||||
"""Return True when the base URL targets OpenRouter."""
|
"""Return True when the base URL targets OpenRouter."""
|
||||||
return "openrouter" in self._base_url_lower
|
return "openrouter" in self._base_url_lower
|
||||||
|
|
@ -5594,18 +5635,9 @@ class AIAgent:
|
||||||
# httpx timeout (default 1800s) with zero feedback. The stale
|
# httpx timeout (default 1800s) with zero feedback. The stale
|
||||||
# detector kills the connection early so the main retry loop can
|
# detector kills the connection early so the main retry loop can
|
||||||
# apply richer recovery (credential rotation, provider fallback).
|
# apply richer recovery (credential rotation, provider fallback).
|
||||||
_stale_base = float(os.getenv("HERMES_API_CALL_STALE_TIMEOUT", 300.0))
|
_stale_timeout = self._compute_non_stream_stale_timeout(
|
||||||
_base_url = getattr(self, "_base_url", None) or ""
|
api_kwargs.get("messages", [])
|
||||||
if _stale_base == 300.0 and _base_url and is_local_endpoint(_base_url):
|
)
|
||||||
_stale_timeout = float("inf")
|
|
||||||
else:
|
|
||||||
_est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
|
|
||||||
if _est_tokens > 100_000:
|
|
||||||
_stale_timeout = max(_stale_base, 600.0)
|
|
||||||
elif _est_tokens > 50_000:
|
|
||||||
_stale_timeout = max(_stale_base, 450.0)
|
|
||||||
else:
|
|
||||||
_stale_timeout = _stale_base
|
|
||||||
|
|
||||||
_call_start = time.time()
|
_call_start = time.time()
|
||||||
self._touch_activity("waiting for non-streaming API response")
|
self._touch_activity("waiting for non-streaming API response")
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,10 @@ from __future__ import annotations
|
||||||
|
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
from hermes_cli.timeouts import get_provider_request_timeout
|
from hermes_cli.timeouts import (
|
||||||
|
get_provider_request_timeout,
|
||||||
|
get_provider_stale_timeout,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _write_config(tmp_path, body: str) -> None:
|
def _write_config(tmp_path, body: str) -> None:
|
||||||
|
|
@ -40,6 +43,37 @@ def test_provider_timeout_used_when_no_model_override(monkeypatch, tmp_path):
|
||||||
assert get_provider_request_timeout("ollama-local", "qwen3:32b") == 300.0
|
assert get_provider_request_timeout("ollama-local", "qwen3:32b") == 300.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_stale_timeout_override_wins(monkeypatch, tmp_path):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
_write_config(
|
||||||
|
tmp_path,
|
||||||
|
"""\
|
||||||
|
providers:
|
||||||
|
openai-codex:
|
||||||
|
stale_timeout_seconds: 600
|
||||||
|
models:
|
||||||
|
gpt-5.4:
|
||||||
|
stale_timeout_seconds: 1800
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert get_provider_stale_timeout("openai-codex", "gpt-5.4") == 1800.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_provider_stale_timeout_used_when_no_model_override(monkeypatch, tmp_path):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
_write_config(
|
||||||
|
tmp_path,
|
||||||
|
"""\
|
||||||
|
providers:
|
||||||
|
openai-codex:
|
||||||
|
stale_timeout_seconds: 900
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert get_provider_stale_timeout("openai-codex", "gpt-5.4") == 900.0
|
||||||
|
|
||||||
|
|
||||||
def test_missing_timeout_returns_none(monkeypatch, tmp_path):
|
def test_missing_timeout_returns_none(monkeypatch, tmp_path):
|
||||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
_write_config(
|
_write_config(
|
||||||
|
|
@ -78,6 +112,24 @@ def test_invalid_timeout_values_return_none(monkeypatch, tmp_path):
|
||||||
assert get_provider_request_timeout("ollama-local") is None
|
assert get_provider_request_timeout("ollama-local") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_invalid_stale_timeout_values_return_none(monkeypatch, tmp_path):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
_write_config(
|
||||||
|
tmp_path,
|
||||||
|
"""\
|
||||||
|
providers:
|
||||||
|
openai-codex:
|
||||||
|
stale_timeout_seconds: "slow"
|
||||||
|
models:
|
||||||
|
gpt-5.4:
|
||||||
|
stale_timeout_seconds: -1
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert get_provider_stale_timeout("openai-codex", "gpt-5.4") is None
|
||||||
|
assert get_provider_stale_timeout("openai-codex", "gpt-5.5") is None
|
||||||
|
|
||||||
|
|
||||||
def test_anthropic_adapter_honors_timeout_kwarg():
|
def test_anthropic_adapter_honors_timeout_kwarg():
|
||||||
"""build_anthropic_client(timeout=X) overrides the 900s default read timeout."""
|
"""build_anthropic_client(timeout=X) overrides the 900s default read timeout."""
|
||||||
pytest = __import__("pytest")
|
pytest = __import__("pytest")
|
||||||
|
|
@ -158,3 +210,99 @@ def test_resolved_api_call_timeout_priority(monkeypatch, tmp_path):
|
||||||
# Case C: no config, no env → 1800.0 default
|
# Case C: no config, no env → 1800.0 default
|
||||||
monkeypatch.delenv("HERMES_API_TIMEOUT", raising=False)
|
monkeypatch.delenv("HERMES_API_TIMEOUT", raising=False)
|
||||||
assert agent2._resolved_api_call_timeout() == 1800.0
|
assert agent2._resolved_api_call_timeout() == 1800.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolved_api_call_stale_timeout_priority(monkeypatch, tmp_path):
|
||||||
|
"""AIAgent stale timeout honors config > env > default priority."""
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
(tmp_path / ".env").write_text("", encoding="utf-8")
|
||||||
|
|
||||||
|
_write_config(tmp_path, """\
|
||||||
|
providers:
|
||||||
|
openai-codex:
|
||||||
|
stale_timeout_seconds: 600
|
||||||
|
models:
|
||||||
|
gpt-5.4:
|
||||||
|
stale_timeout_seconds: 1800
|
||||||
|
""")
|
||||||
|
monkeypatch.setenv("HERMES_API_CALL_STALE_TIMEOUT", "999")
|
||||||
|
|
||||||
|
from run_agent import AIAgent
|
||||||
|
agent = AIAgent(
|
||||||
|
model="gpt-5.4",
|
||||||
|
provider="openai-codex",
|
||||||
|
api_key="sk-dummy",
|
||||||
|
base_url="https://chatgpt.com/backend-api/codex",
|
||||||
|
quiet_mode=True,
|
||||||
|
skip_context_files=True,
|
||||||
|
skip_memory=True,
|
||||||
|
platform="cli",
|
||||||
|
)
|
||||||
|
assert agent._resolved_api_call_stale_timeout_base() == (1800.0, False)
|
||||||
|
|
||||||
|
agent.model = "gpt-5.5"
|
||||||
|
assert agent._resolved_api_call_stale_timeout_base() == (600.0, False)
|
||||||
|
|
||||||
|
_write_config(tmp_path, "")
|
||||||
|
import importlib
|
||||||
|
from hermes_cli import config as cfg_mod
|
||||||
|
importlib.reload(cfg_mod)
|
||||||
|
from hermes_cli import timeouts as to_mod
|
||||||
|
importlib.reload(to_mod)
|
||||||
|
import run_agent as ra_mod
|
||||||
|
importlib.reload(ra_mod)
|
||||||
|
|
||||||
|
agent2 = ra_mod.AIAgent(
|
||||||
|
model="gpt-5.4",
|
||||||
|
provider="openai-codex",
|
||||||
|
api_key="sk-dummy",
|
||||||
|
base_url="https://chatgpt.com/backend-api/codex",
|
||||||
|
quiet_mode=True,
|
||||||
|
skip_context_files=True,
|
||||||
|
skip_memory=True,
|
||||||
|
platform="cli",
|
||||||
|
)
|
||||||
|
assert agent2._resolved_api_call_stale_timeout_base() == (999.0, False)
|
||||||
|
|
||||||
|
monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
|
||||||
|
assert agent2._resolved_api_call_stale_timeout_base() == (300.0, True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_non_stream_stale_timeout_auto_disables_for_local_endpoints(monkeypatch, tmp_path):
    """With no stale timeout configured, a loopback endpoint gets an infinite timeout."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    (tmp_path / ".env").write_text("", encoding="utf-8")
    # Make sure the env override is absent so the implicit default applies.
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)

    from run_agent import AIAgent

    agent = AIAgent(
        model="qwen3:32b",
        provider="ollama-local",
        api_key="sk-dummy",
        base_url="http://127.0.0.1:11434/v1",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
        platform="cli",
    )

    assert agent._compute_non_stream_stale_timeout([]) == float("inf")
|
||||||
|
|
||||||
|
|
||||||
|
def test_explicit_non_stream_stale_timeout_is_honored_for_local_endpoints(monkeypatch, tmp_path):
    """An explicit env value equal to the default still counts as explicit on a loopback endpoint."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    (tmp_path / ".env").write_text("", encoding="utf-8")
    # Same number as the built-in default, but set explicitly by the user.
    monkeypatch.setenv("HERMES_API_CALL_STALE_TIMEOUT", "300")

    from run_agent import AIAgent

    agent = AIAgent(
        model="qwen3:32b",
        provider="ollama-local",
        api_key="sk-dummy",
        base_url="http://127.0.0.1:11434/v1",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
        platform="cli",
    )

    assert agent._compute_non_stream_stale_timeout([]) == 300.0
|
||||||
|
|
|
||||||
|
|
@ -360,6 +360,7 @@ For cloud sandbox backends, persistence is filesystem-oriented. `TERMINAL_LIFETI
|
||||||
| `HERMES_HUMAN_DELAY_MAX_MS` | Custom delay range maximum (ms) |
|
| `HERMES_HUMAN_DELAY_MAX_MS` | Custom delay range maximum (ms) |
|
||||||
| `HERMES_QUIET` | Suppress non-essential output (`true`/`false`) |
|
| `HERMES_QUIET` | Suppress non-essential output (`true`/`false`) |
|
||||||
| `HERMES_API_TIMEOUT` | LLM API call timeout in seconds (default: `1800`) |
|
| `HERMES_API_TIMEOUT` | LLM API call timeout in seconds (default: `1800`) |
|
||||||
|
| `HERMES_API_CALL_STALE_TIMEOUT` | Non-streaming stale-call timeout in seconds (default: `300`). Auto-disabled for local providers when left unset. Also configurable via `providers.<id>.stale_timeout_seconds` or `providers.<id>.models.<model>.stale_timeout_seconds` in `config.yaml`. |
|
||||||
| `HERMES_STREAM_READ_TIMEOUT` | Streaming socket read timeout in seconds (default: `120`). Auto-increased to `HERMES_API_TIMEOUT` for local providers. Increase if local LLMs time out during long code generation. |
|
| `HERMES_STREAM_READ_TIMEOUT` | Streaming socket read timeout in seconds (default: `120`). Auto-increased to `HERMES_API_TIMEOUT` for local providers. Increase if local LLMs time out during long code generation. |
|
||||||
| `HERMES_STREAM_STALE_TIMEOUT` | Stale stream detection timeout in seconds (default: `180`). Auto-disabled for local providers. Triggers connection kill if no chunks arrive within this window. |
|
| `HERMES_STREAM_STALE_TIMEOUT` | Stale stream detection timeout in seconds (default: `180`). Auto-disabled for local providers. Triggers connection kill if no chunks arrive within this window. |
|
||||||
| `HERMES_EXEC_ASK` | Enable execution approval prompts in gateway mode (`true`/`false`) |
|
| `HERMES_EXEC_ASK` | Enable execution approval prompts in gateway mode (`true`/`false`) |
|
||||||
|
|
|
||||||
|
|
@ -73,9 +73,13 @@ Multiple references in a single value work: `url: "${HOST}:${PORT}"`. If a refer
|
||||||
|
|
||||||
For AI provider setup (OpenRouter, Anthropic, Copilot, custom endpoints, self-hosted LLMs, fallback models, etc.), see [AI Providers](/docs/integrations/providers).
|
For AI provider setup (OpenRouter, Anthropic, Copilot, custom endpoints, self-hosted LLMs, fallback models, etc.), see [AI Providers](/docs/integrations/providers).
|
||||||
|
|
||||||
### Provider Request Timeouts
|
### Provider Timeouts
|
||||||
|
|
||||||
You can set `providers.<id>.request_timeout_seconds` for a provider-wide timeout, plus `providers.<id>.models.<model>.timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, rebuilds after credential rotation, and (for OpenAI-wire) the per-request timeout kwarg — so the configured value wins over the legacy `HERMES_API_TIMEOUT` env var. Leaving these unset keeps legacy defaults (`HERMES_API_TIMEOUT=1800`s, native Anthropic 900s). Not currently wired for AWS Bedrock (both `bedrock_converse` and AnthropicBedrock SDK paths use boto3 with its own timeout configuration). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example).
|
You can set `providers.<id>.request_timeout_seconds` for a provider-wide request timeout, plus `providers.<id>.models.<model>.timeout_seconds` for a model-specific override. Applies to the primary turn client on every transport (OpenAI-wire, native Anthropic, Anthropic-compatible), the fallback chain, rebuilds after credential rotation, and (for OpenAI-wire) the per-request timeout kwarg — so the configured value wins over the legacy `HERMES_API_TIMEOUT` env var.
|
||||||
|
|
||||||
|
You can also set `providers.<id>.stale_timeout_seconds` for the non-streaming stale-call detector, plus `providers.<id>.models.<model>.stale_timeout_seconds` for a model-specific override. This wins over the legacy `HERMES_API_CALL_STALE_TIMEOUT` env var.
|
||||||
|
|
||||||
|
Leaving these unset keeps the legacy defaults (`HERMES_API_TIMEOUT=1800`s, `HERMES_API_CALL_STALE_TIMEOUT=300`s, native Anthropic 900s). Not currently wired for AWS Bedrock (both `bedrock_converse` and AnthropicBedrock SDK paths use boto3 with its own timeout configuration). See the commented example in [`cli-config.yaml.example`](https://github.com/NousResearch/hermes-agent/blob/main/cli-config.yaml.example).
|
||||||
|
|
||||||
## Terminal Backend Configuration
|
## Terminal Backend Configuration
|
||||||
|
|
||||||
|
|
@ -554,20 +558,23 @@ Budget pressure is enabled by default. The agent sees warnings naturally as part
|
||||||
|
|
||||||
When the iteration budget is fully exhausted, the CLI shows a notification to the user: `⚠ Iteration budget reached (90/90) — response may be incomplete`. If the budget runs out during active work, the agent generates a summary of what was accomplished before stopping.
|
When the iteration budget is fully exhausted, the CLI shows a notification to the user: `⚠ Iteration budget reached (90/90) — response may be incomplete`. If the budget runs out during active work, the agent generates a summary of what was accomplished before stopping.
|
||||||
|
|
||||||
### Streaming Timeouts
|
### API Timeouts
|
||||||
|
|
||||||
The LLM streaming connection has two timeout layers. Both auto-adjust for local providers (localhost, LAN IPs) — no configuration needed for most setups.
|
Hermes has separate timeout layers for streaming, plus a stale detector for non-streaming calls. The stale detectors auto-adjust for local providers only when you leave them at their implicit defaults.
|
||||||
|
|
||||||
| Timeout | Default | Local providers | Env var |
|
| Timeout | Default | Local providers | Config / env |
|
||||||
|---------|---------|----------------|---------|
|
|---------|---------|----------------|--------------|
|
||||||
| Socket read timeout | 120s | Auto-raised to 1800s | `HERMES_STREAM_READ_TIMEOUT` |
|
| Socket read timeout | 120s | Auto-raised to 1800s | `HERMES_STREAM_READ_TIMEOUT` |
|
||||||
| Stale stream detection | 180s | Auto-disabled | `HERMES_STREAM_STALE_TIMEOUT` |
|
| Stale stream detection | 180s | Auto-disabled | `HERMES_STREAM_STALE_TIMEOUT` |
|
||||||
| API call (non-streaming) | 1800s | Unchanged | `HERMES_API_TIMEOUT` |
|
| Stale non-stream detection | 300s | Auto-disabled when left implicit | `providers.<id>.stale_timeout_seconds` / `providers.<id>.models.<model>.stale_timeout_seconds` or `HERMES_API_CALL_STALE_TIMEOUT` |
|
||||||
|
| API call (non-streaming) | 1800s | Unchanged | `providers.<id>.request_timeout_seconds` / `providers.<id>.models.<model>.timeout_seconds` or `HERMES_API_TIMEOUT` |
|
||||||
|
|
||||||
The **socket read timeout** controls how long httpx waits for the next chunk of data from the provider. Local LLMs can take minutes for prefill on large contexts before producing the first token, so Hermes raises this to 30 minutes when it detects a local endpoint. If you explicitly set `HERMES_STREAM_READ_TIMEOUT`, that value is always used regardless of endpoint detection.
|
The **socket read timeout** controls how long httpx waits for the next chunk of data from the provider. Local LLMs can take minutes for prefill on large contexts before producing the first token, so Hermes raises this to 30 minutes when it detects a local endpoint. If you explicitly set `HERMES_STREAM_READ_TIMEOUT`, that value is always used regardless of endpoint detection.
|
||||||
|
|
||||||
The **stale stream detection** kills connections that receive SSE keep-alive pings but no actual content. This is disabled entirely for local providers since they don't send keep-alive pings during prefill.
|
The **stale stream detection** kills connections that receive SSE keep-alive pings but no actual content. This is disabled entirely for local providers since they don't send keep-alive pings during prefill.
|
||||||
|
|
||||||
|
The **stale non-stream detection** kills non-streaming calls that produce no response for too long. By default Hermes disables this on local endpoints to avoid false positives during long prefills. If you explicitly set `providers.<id>.stale_timeout_seconds`, `providers.<id>.models.<model>.stale_timeout_seconds`, or `HERMES_API_CALL_STALE_TIMEOUT`, that explicit value is honored even on local endpoints.
|
||||||
|
|
||||||
## Context Pressure Warnings
|
## Context Pressure Warnings
|
||||||
|
|
||||||
Separate from iteration budget pressure, context pressure tracks how close the conversation is to the **compaction threshold** — the point where context compression fires to summarize older messages. This helps both you and the agent understand when the conversation is getting long.
|
Separate from iteration budget pressure, context pressure tracks how close the conversation is to the **compaction threshold** — the point where context compression fires to summarize older messages. This helps both you and the agent understand when the conversation is getting long.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue