fix(codex): size and propagate timeouts for Responses-API requests; lower stale defaults

Codex / Responses-API requests had three latent timeout bugs that combined into the long silent hangs reported on #21444: 1. The non-stream stale-call detector estimated context tokens from ``api_kwargs["messages"]`` only. Codex / Responses-API payloads carry their conversational load in ``input`` (with ``instructions`` and ``tools``), so every Codex turn logged ``context=~0 tokens`` and the detector never applied its >50k / >100k tier bumps. 2. ``providers.<id>.request_timeout_seconds`` was silently dropped on the main Codex path. The chat_completions path and the auxiliary Codex adapter both forwarded it; the main path skipped it through three places (``build_api_kwargs``, ``ResponsesApiTransport.build_kwargs``, ``_preflight_codex_api_kwargs``). 3. The streaming stale detector had the same payload-shape bug for ``codex_responses`` requests, which route through the non-streaming detector (it's the path that emits the user-facing "No response from provider for 300s (non-streaming, ...)" warning that reporters keep pasting). This commit: - Adds ``estimate_request_context_tokens`` in ``chat_completion_helpers``, used by both the non-stream and stream detectors. Handles ``messages`` (Chat Completions), ``input + instructions + tools`` (Responses API), bare lists, and an unknown-dict fallback. - Forwards ``timeout`` through ``ResponsesApiTransport.build_kwargs`` and ``_preflight_codex_api_kwargs`` (with guards against zero/negative/inf/bool values), and wires ``_resolved_api_call_timeout()`` into the Codex branch of ``build_api_kwargs``. - Lowers the implicit non-stream stale defaults so fallback providers kick in faster when upstream stalls: * base 300s -> 90s * >50k 450s -> 150s * >100k 600s -> 240s These only apply when the user has *not* set ``providers.<id>.stale_timeout_seconds`` or ``HERMES_API_CALL_STALE_TIMEOUT``. Explicit config still wins. - Adds regression tests for the estimator shapes, the new defaults, the context-tier scaling, transport timeout pass-through, and preflight timeout pass-through / rejection of invalid values. Closes #21444 Supersedes #21652 #24126 #31855 Co-authored-by: Hoang V. Pham <26063003+hehehe0803@users.noreply.github.com>
2026-07-18 14:52:04 +00:00 · 2026-05-25 01:36:22 -07:00 · 2026-05-25 01:36:22 -07:00 · 2d422720b5
commit 2d422720b5
parent 76135b329d
10 changed files with 383 additions and 17 deletions
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -75,6 +75,59 @@ def _ra():
    return run_agent


+def estimate_request_context_tokens(api_payload: Any) -> int:
+    """Estimate context/load tokens from an API payload, dict or messages list.
+
+    The stale-call detectors historically assumed a Chat Completions request:
+    they pulled ``api_kwargs["messages"]`` and ran a cheap char/4 estimate.
+    Codex / Responses API requests carry the conversational payload in
+    ``input`` (with additional load in ``instructions`` and ``tools``), so the
+    legacy estimator reported ~0 tokens for every Codex turn and the
+    context-tier scaling never fired.
+
+    This helper handles both shapes:
+      - bare list -> treat as Chat Completions ``messages``
+      - dict with ``messages`` -> Chat Completions (+ ``tools`` if present)
+      - dict with ``input`` -> Responses API (+ ``instructions``/``tools``)
+      - any other dict -> fall back to summing string values
+    """
+
+    def _chars(value: Any) -> int:
+        if value is None:
+            return 0
+        if isinstance(value, str):
+            return len(value)
+        return len(str(value))
+
+    def _message_chars(messages: Any) -> int:
+        if not isinstance(messages, list):
+            return _chars(messages)
+        return sum(_chars(item) for item in messages)
+
+    if isinstance(api_payload, list):
+        return _message_chars(api_payload) // 4
+
+    if isinstance(api_payload, dict):
+        messages = api_payload.get("messages")
+        if isinstance(messages, list):
+            total_chars = _message_chars(messages)
+            if "tools" in api_payload:
+                total_chars += _chars(api_payload.get("tools"))
+            return total_chars // 4
+
+        if "input" in api_payload:
+            total_chars = (
+                _chars(api_payload.get("input"))
+                + _chars(api_payload.get("instructions"))
+                + _chars(api_payload.get("tools"))
+            )
+            return total_chars // 4
+
+        return sum(_chars(value) for value in api_payload.values()) // 4
+
+    return _chars(api_payload) // 4
+
+

 def interruptible_api_call(agent, api_kwargs: dict):
    """
@ -200,9 +253,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
    # httpx timeout (default 1800s) with zero feedback.  The stale
    # detector kills the connection early so the main retry loop can
    # apply richer recovery (credential rotation, provider fallback).
-    _stale_timeout = agent._compute_non_stream_stale_timeout(
-        api_kwargs.get("messages", [])
-    )
+    _stale_timeout = agent._compute_non_stream_stale_timeout(api_kwargs)

    _call_start = time.time()
    agent._touch_activity("waiting for non-streaming API response")
@ -226,7 +277,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
        # arrives within the configured timeout.
        _elapsed = time.time() - _call_start
        if _elapsed > _stale_timeout:
-            _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
+            _est_ctx = estimate_request_context_tokens(api_kwargs)
            logger.warning(
                "Non-streaming API call stale for %.0fs (threshold %.0fs). "
                "model=%s context=~%s tokens. Killing connection.",
@ -362,6 +413,7 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
            reasoning_config=agent.reasoning_config,
            session_id=getattr(agent, "session_id", None),
            max_tokens=agent.max_tokens,
+            timeout=agent._resolved_api_call_timeout(),
            request_overrides=agent.request_overrides,
            is_github_responses=is_github_responses,
            is_codex_backend=is_codex_backend,
@ -2019,7 +2071,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
        # when the context is large.  Without this, the stale detector kills
        # healthy connections during the model's thinking phase, producing
        # spurious RemoteProtocolError ("peer closed connection").
-        _est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
+        _est_tokens = estimate_request_context_tokens(api_kwargs)
        if _est_tokens > 100_000:
            _stream_stale_timeout = max(_stream_stale_timeout_base, 300.0)
        elif _est_tokens > 50_000:
@ -2055,7 +2107,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
        # inner retry loop can start a fresh connection.
        _stale_elapsed = time.time() - last_chunk_time["t"]
        if _stale_elapsed > _stream_stale_timeout:
-            _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
+            _est_ctx = estimate_request_context_tokens(api_kwargs)
            logger.warning(
                "Stream stale for %.0fs (threshold %.0fs) — no chunks received. "
                "model=%s context=~%s tokens. Killing connection.",
--- a/agent/codex_responses_adapter.py
+++ b/agent/codex_responses_adapter.py
@ -745,7 +745,7 @@ def _preflight_codex_api_kwargs(
        "model", "instructions", "input", "tools", "store",
        "reasoning", "include", "max_output_tokens", "temperature",
        "tool_choice", "parallel_tool_calls", "prompt_cache_key", "service_tier",
-        "extra_headers", "extra_body",
+        "extra_headers", "extra_body", "timeout",
    }
    normalized: Dict[str, Any] = {
        "model": model,
@ -771,6 +771,13 @@ def _preflight_codex_api_kwargs(
    max_output_tokens = api_kwargs.get("max_output_tokens")
    if isinstance(max_output_tokens, (int, float)) and max_output_tokens > 0:
        normalized["max_output_tokens"] = int(max_output_tokens)
+    timeout = api_kwargs.get("timeout")
+    if (
+        isinstance(timeout, (int, float))
+        and not isinstance(timeout, bool)
+        and 0 < float(timeout) < float("inf")
+    ):
+        normalized["timeout"] = float(timeout)
    temperature = api_kwargs.get("temperature")
    if isinstance(temperature, (int, float)):
        normalized["temperature"] = float(temperature)
--- a/agent/transports/codex.py
+++ b/agent/transports/codex.py
@ -50,6 +50,7 @@ class ResponsesApiTransport(ProviderTransport):
            reasoning_config: dict | None — {effort, enabled}
            session_id: str | None — used for prompt_cache_key + xAI conv header
            max_tokens: int | None — max_output_tokens
+            timeout: float | None — per-request timeout forwarded to the SDK
            request_overrides: dict | None — extra kwargs merged in
            provider: str | None — provider name for backend-specific logic
            base_url: str | None — endpoint URL
@ -143,6 +144,20 @@ class ResponsesApiTransport(ProviderTransport):
        if request_overrides:
            kwargs.update(request_overrides)

+        # Forward per-request timeout to the SDK so OpenAI/Anthropic clients
+        # honor it.  Without this, ``providers.<id>.request_timeout_seconds``
+        # is silently dropped on the main agent Codex path while the
+        # chat_completions path and auxiliary Codex adapter both forward it.
+        timeout = kwargs.get("timeout", params.get("timeout"))
+        if (
+            isinstance(timeout, (int, float))
+            and not isinstance(timeout, bool)
+            and 0 < float(timeout) < float("inf")
+        ):
+            kwargs["timeout"] = float(timeout)
+        else:
+            kwargs.pop("timeout", None)
+
        if is_codex_backend:
            prompt_cache_key = kwargs.get("prompt_cache_key")
            cache_scope_id = str(prompt_cache_key or session_id or "").strip()