mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
feat(agent): configurable timeouts for auxiliary LLM calls via config.yaml (#3597)
Add per-task timeout settings under auxiliary.{task}.timeout in config.yaml
instead of hardcoded values. Users with slow local models (Ollama, llama.cpp)
can now increase timeouts for compression, vision, session search, etc.
Defaults:
- auxiliary.compression.timeout: 120s (was hardcoded 45s)
- auxiliary.vision.timeout: 30s (unchanged)
- all other aux tasks: 30s (value unchanged, but previously hardcoded; now configurable)
- title_generator: 30s (was hardcoded 15s)
call_llm/async_call_llm now auto-resolve timeout from config when not
explicitly passed. Callers can still override with an explicit timeout arg.
Based on PR #3406 by alanfwilliams. Converted from env vars to config.yaml
per project conventions.
Co-authored-by: alanfwilliams <alanfwilliams@users.noreply.github.com>
This commit is contained in:
parent
404a0b823e
commit
839d9d7471
4 changed files with 41 additions and 7 deletions
|
|
@ -1458,6 +1458,29 @@ def _resolve_task_provider_model(
|
||||||
return "auto", resolved_model, None, None
|
return "auto", resolved_model, None, None
|
||||||
|
|
||||||
|
|
||||||
|
_DEFAULT_AUX_TIMEOUT = 30.0
|
||||||
|
|
||||||
|
|
||||||
|
def _get_task_timeout(task: str, default: float = _DEFAULT_AUX_TIMEOUT) -> float:
|
||||||
|
"""Read timeout from auxiliary.{task}.timeout in config, falling back to *default*."""
|
||||||
|
if not task:
|
||||||
|
return default
|
||||||
|
try:
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
config = load_config()
|
||||||
|
except ImportError:
|
||||||
|
return default
|
||||||
|
aux = config.get("auxiliary", {}) if isinstance(config, dict) else {}
|
||||||
|
task_config = aux.get(task, {}) if isinstance(aux, dict) else {}
|
||||||
|
raw = task_config.get("timeout")
|
||||||
|
if raw is not None:
|
||||||
|
try:
|
||||||
|
return float(raw)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
def _build_call_kwargs(
|
def _build_call_kwargs(
|
||||||
provider: str,
|
provider: str,
|
||||||
model: str,
|
model: str,
|
||||||
|
|
@ -1515,7 +1538,7 @@ def call_llm(
|
||||||
temperature: float = None,
|
temperature: float = None,
|
||||||
max_tokens: int = None,
|
max_tokens: int = None,
|
||||||
tools: list = None,
|
tools: list = None,
|
||||||
timeout: float = 30.0,
|
timeout: float = None,
|
||||||
extra_body: dict = None,
|
extra_body: dict = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
"""Centralized synchronous LLM call.
|
"""Centralized synchronous LLM call.
|
||||||
|
|
@ -1533,7 +1556,7 @@ def call_llm(
|
||||||
temperature: Sampling temperature (None = provider default).
|
temperature: Sampling temperature (None = provider default).
|
||||||
max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
|
max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
|
||||||
tools: Tool definitions (for function calling).
|
tools: Tool definitions (for function calling).
|
||||||
timeout: Request timeout in seconds.
|
timeout: Request timeout in seconds (None = read from auxiliary.{task}.timeout config).
|
||||||
extra_body: Additional request body fields.
|
extra_body: Additional request body fields.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
@ -1598,10 +1621,12 @@ def call_llm(
|
||||||
f"No LLM provider configured for task={task} provider={resolved_provider}. "
|
f"No LLM provider configured for task={task} provider={resolved_provider}. "
|
||||||
f"Run: hermes setup")
|
f"Run: hermes setup")
|
||||||
|
|
||||||
|
effective_timeout = timeout if timeout is not None else _get_task_timeout(task)
|
||||||
|
|
||||||
kwargs = _build_call_kwargs(
|
kwargs = _build_call_kwargs(
|
||||||
resolved_provider, final_model, messages,
|
resolved_provider, final_model, messages,
|
||||||
temperature=temperature, max_tokens=max_tokens,
|
temperature=temperature, max_tokens=max_tokens,
|
||||||
tools=tools, timeout=timeout, extra_body=extra_body,
|
tools=tools, timeout=effective_timeout, extra_body=extra_body,
|
||||||
base_url=resolved_base_url)
|
base_url=resolved_base_url)
|
||||||
|
|
||||||
# Handle max_tokens vs max_completion_tokens retry
|
# Handle max_tokens vs max_completion_tokens retry
|
||||||
|
|
@ -1683,7 +1708,7 @@ async def async_call_llm(
|
||||||
temperature: float = None,
|
temperature: float = None,
|
||||||
max_tokens: int = None,
|
max_tokens: int = None,
|
||||||
tools: list = None,
|
tools: list = None,
|
||||||
timeout: float = 30.0,
|
timeout: float = None,
|
||||||
extra_body: dict = None,
|
extra_body: dict = None,
|
||||||
) -> Any:
|
) -> Any:
|
||||||
"""Centralized asynchronous LLM call.
|
"""Centralized asynchronous LLM call.
|
||||||
|
|
@ -1744,10 +1769,12 @@ async def async_call_llm(
|
||||||
f"No LLM provider configured for task={task} provider={resolved_provider}. "
|
f"No LLM provider configured for task={task} provider={resolved_provider}. "
|
||||||
f"Run: hermes setup")
|
f"Run: hermes setup")
|
||||||
|
|
||||||
|
effective_timeout = timeout if timeout is not None else _get_task_timeout(task)
|
||||||
|
|
||||||
kwargs = _build_call_kwargs(
|
kwargs = _build_call_kwargs(
|
||||||
resolved_provider, final_model, messages,
|
resolved_provider, final_model, messages,
|
||||||
temperature=temperature, max_tokens=max_tokens,
|
temperature=temperature, max_tokens=max_tokens,
|
||||||
tools=tools, timeout=timeout, extra_body=extra_body,
|
tools=tools, timeout=effective_timeout, extra_body=extra_body,
|
||||||
base_url=resolved_base_url)
|
base_url=resolved_base_url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -347,7 +347,7 @@ Write only the summary body. Do not include any preamble or prefix."""
|
||||||
"messages": [{"role": "user", "content": prompt}],
|
"messages": [{"role": "user", "content": prompt}],
|
||||||
"temperature": 0.3,
|
"temperature": 0.3,
|
||||||
"max_tokens": summary_budget * 2,
|
"max_tokens": summary_budget * 2,
|
||||||
"timeout": 45.0,
|
# timeout resolved from auxiliary.compression.timeout config by call_llm
|
||||||
}
|
}
|
||||||
if self.summary_model:
|
if self.summary_model:
|
||||||
call_kwargs["model"] = self.summary_model
|
call_kwargs["model"] = self.summary_model
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ _TITLE_PROMPT = (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def generate_title(user_message: str, assistant_response: str, timeout: float = 15.0) -> Optional[str]:
|
def generate_title(user_message: str, assistant_response: str, timeout: float = 30.0) -> Optional[str]:
|
||||||
"""Generate a session title from the first exchange.
|
"""Generate a session title from the first exchange.
|
||||||
|
|
||||||
Uses the auxiliary LLM client (cheapest/fastest available model).
|
Uses the auxiliary LLM client (cheapest/fastest available model).
|
||||||
|
|
|
||||||
|
|
@ -227,42 +227,49 @@ DEFAULT_CONFIG = {
|
||||||
"model": "",
|
"model": "",
|
||||||
"base_url": "",
|
"base_url": "",
|
||||||
"api_key": "",
|
"api_key": "",
|
||||||
|
"timeout": 30, # seconds — increase for slow local models
|
||||||
},
|
},
|
||||||
"compression": {
|
"compression": {
|
||||||
"provider": "auto",
|
"provider": "auto",
|
||||||
"model": "",
|
"model": "",
|
||||||
"base_url": "",
|
"base_url": "",
|
||||||
"api_key": "",
|
"api_key": "",
|
||||||
|
"timeout": 120, # seconds — compression summarises large contexts; increase for local models
|
||||||
},
|
},
|
||||||
"session_search": {
|
"session_search": {
|
||||||
"provider": "auto",
|
"provider": "auto",
|
||||||
"model": "",
|
"model": "",
|
||||||
"base_url": "",
|
"base_url": "",
|
||||||
"api_key": "",
|
"api_key": "",
|
||||||
|
"timeout": 30,
|
||||||
},
|
},
|
||||||
"skills_hub": {
|
"skills_hub": {
|
||||||
"provider": "auto",
|
"provider": "auto",
|
||||||
"model": "",
|
"model": "",
|
||||||
"base_url": "",
|
"base_url": "",
|
||||||
"api_key": "",
|
"api_key": "",
|
||||||
|
"timeout": 30,
|
||||||
},
|
},
|
||||||
"approval": {
|
"approval": {
|
||||||
"provider": "auto",
|
"provider": "auto",
|
||||||
"model": "", # fast/cheap model recommended (e.g. gemini-flash, haiku)
|
"model": "", # fast/cheap model recommended (e.g. gemini-flash, haiku)
|
||||||
"base_url": "",
|
"base_url": "",
|
||||||
"api_key": "",
|
"api_key": "",
|
||||||
|
"timeout": 30,
|
||||||
},
|
},
|
||||||
"mcp": {
|
"mcp": {
|
||||||
"provider": "auto",
|
"provider": "auto",
|
||||||
"model": "",
|
"model": "",
|
||||||
"base_url": "",
|
"base_url": "",
|
||||||
"api_key": "",
|
"api_key": "",
|
||||||
|
"timeout": 30,
|
||||||
},
|
},
|
||||||
"flush_memories": {
|
"flush_memories": {
|
||||||
"provider": "auto",
|
"provider": "auto",
|
||||||
"model": "",
|
"model": "",
|
||||||
"base_url": "",
|
"base_url": "",
|
||||||
"api_key": "",
|
"api_key": "",
|
||||||
|
"timeout": 30,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue