From 2d422720b53207208efacfdb2e32dc92048b6edd Mon Sep 17 00:00:00 2001
From: Kasun Athaudahetti <kasunvinod@users.noreply.github.com>
Date: Mon, 25 May 2026 01:36:22 -0700
Subject: [PATCH] fix(codex): size and propagate timeouts for Responses-API
 requests; lower stale defaults

Codex / Responses-API requests had three latent timeout bugs that combined
into the long silent hangs reported on #21444:

1. The non-stream stale-call detector estimated context tokens from
   ``api_kwargs["messages"]`` only. Codex / Responses-API payloads carry
   their conversational load in ``input`` (with ``instructions`` and
   ``tools``), so every Codex turn logged ``context=~0 tokens`` and the
   detector never applied its >50k / >100k tier bumps.

2. ``providers.<id>.request_timeout_seconds`` was silently dropped on the
   main Codex path. The chat_completions path and the auxiliary Codex
   adapter both forwarded it; the main path skipped it through three
   places (``build_api_kwargs``, ``ResponsesApiTransport.build_kwargs``,
   ``_preflight_codex_api_kwargs``).

3. The streaming stale detector had the same payload-shape bug for
   ``codex_responses`` requests, which route through the non-streaming
   detector (it's the path that emits the user-facing
   "No response from provider for 300s (non-streaming, ...)" warning that
   reporters keep pasting).

This commit:

- Adds ``estimate_request_context_tokens`` in ``chat_completion_helpers``,
  used by both the non-stream and stream detectors. Handles ``messages``
  (Chat Completions), ``input + instructions + tools`` (Responses API),
  bare lists, and an unknown-dict fallback.
- Forwards ``timeout`` through ``ResponsesApiTransport.build_kwargs``
  and ``_preflight_codex_api_kwargs`` (with guards against
  zero/negative/inf/bool values), and wires
  ``_resolved_api_call_timeout()`` into the Codex branch of
  ``build_api_kwargs``.
- Lowers the implicit non-stream stale defaults so fallback providers
  kick in faster when upstream stalls:
    * base   300s -> 90s
    * >50k   450s -> 150s
    * >100k  600s -> 240s
  These only apply when the user has *not* set
  ``providers.<id>.stale_timeout_seconds`` or
  ``HERMES_API_CALL_STALE_TIMEOUT``. Explicit config still wins.
- Adds regression tests for the estimator shapes, the new defaults, the
  context-tier scaling, transport timeout pass-through, and preflight
  timeout pass-through / rejection of invalid values.

Closes #21444
Supersedes #21652 #24126 #31855

Co-authored-by: Hoang V. Pham <26063003+hehehe0803@users.noreply.github.com>
---
 agent/chat_completion_helpers.py              |  64 +++++-
 agent/codex_responses_adapter.py              |   9 +-
 agent/transports/codex.py                     |  15 ++
 run_agent.py                                  |  25 ++-
 scripts/release.py                            |   2 +
 tests/agent/test_non_stream_stale_timeout.py  | 192 ++++++++++++++++++
 .../agent/transports/test_codex_transport.py  |  61 ++++++
 tests/hermes_cli/test_timeouts.py             |   2 +-
 .../run_agent/test_openai_client_lifecycle.py |   2 +-
 .../test_run_agent_codex_responses.py         |  28 ++-
 10 files changed, 383 insertions(+), 17 deletions(-)
 create mode 100644 tests/agent/test_non_stream_stale_timeout.py
diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py
index b52bd6a1fb1..5cdb590dd5d 100644
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -75,6 +75,59 @@ def _ra():
     return run_agent
 
 
+def estimate_request_context_tokens(api_payload: Any) -> int:
+    """Estimate context/load tokens from an API payload, dict or messages list.
+
+    The stale-call detectors historically assumed a Chat Completions request:
+    they pulled ``api_kwargs["messages"]`` and ran a cheap char/4 estimate.
+    Codex / Responses API requests carry the conversational payload in
+    ``input`` (with additional load in ``instructions`` and ``tools``), so the
+    legacy estimator reported ~0 tokens for every Codex turn and the
+    context-tier scaling never fired.
+
+    This helper handles both shapes:
+      - bare list -> treat as Chat Completions ``messages``
+      - dict with ``messages`` -> Chat Completions (+ ``tools`` if present)
+      - dict with ``input`` -> Responses API (+ ``instructions``/``tools``)
+      - any other dict -> fall back to summing string values
+    """
+
+    def _chars(value: Any) -> int:
+        if value is None:
+            return 0
+        if isinstance(value, str):
+            return len(value)
+        return len(str(value))
+
+    def _message_chars(messages: Any) -> int:
+        if not isinstance(messages, list):
+            return _chars(messages)
+        return sum(_chars(item) for item in messages)
+
+    if isinstance(api_payload, list):
+        return _message_chars(api_payload) // 4
+
+    if isinstance(api_payload, dict):
+        messages = api_payload.get("messages")
+        if isinstance(messages, list):
+            total_chars = _message_chars(messages)
+            if "tools" in api_payload:
+                total_chars += _chars(api_payload.get("tools"))
+            return total_chars // 4
+
+        if "input" in api_payload:
+            total_chars = (
+                _chars(api_payload.get("input"))
+                + _chars(api_payload.get("instructions"))
+                + _chars(api_payload.get("tools"))
+            )
+            return total_chars // 4
+
+        return sum(_chars(value) for value in api_payload.values()) // 4
+
+    return _chars(api_payload) // 4
+
+
 
 def interruptible_api_call(agent, api_kwargs: dict):
     """
@@ -200,9 +253,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
     # httpx timeout (default 1800s) with zero feedback.  The stale
     # detector kills the connection early so the main retry loop can
     # apply richer recovery (credential rotation, provider fallback).
-    _stale_timeout = agent._compute_non_stream_stale_timeout(
-        api_kwargs.get("messages", [])
-    )
+    _stale_timeout = agent._compute_non_stream_stale_timeout(api_kwargs)
 
     _call_start = time.time()
     agent._touch_activity("waiting for non-streaming API response")
@@ -226,7 +277,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
         # arrives within the configured timeout.
         _elapsed = time.time() - _call_start
         if _elapsed > _stale_timeout:
-            _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
+            _est_ctx = estimate_request_context_tokens(api_kwargs)
             logger.warning(
                 "Non-streaming API call stale for %.0fs (threshold %.0fs). "
                 "model=%s context=~%s tokens. Killing connection.",
@@ -362,6 +413,7 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
             reasoning_config=agent.reasoning_config,
             session_id=getattr(agent, "session_id", None),
             max_tokens=agent.max_tokens,
+            timeout=agent._resolved_api_call_timeout(),
             request_overrides=agent.request_overrides,
             is_github_responses=is_github_responses,
             is_codex_backend=is_codex_backend,
@@ -2019,7 +2071,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
         # when the context is large.  Without this, the stale detector kills
         # healthy connections during the model's thinking phase, producing
         # spurious RemoteProtocolError ("peer closed connection").
-        _est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
+        _est_tokens = estimate_request_context_tokens(api_kwargs)
         if _est_tokens > 100_000:
             _stream_stale_timeout = max(_stream_stale_timeout_base, 300.0)
         elif _est_tokens > 50_000:
@@ -2055,7 +2107,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
         # inner retry loop can start a fresh connection.
         _stale_elapsed = time.time() - last_chunk_time["t"]
         if _stale_elapsed > _stream_stale_timeout:
-            _est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
+            _est_ctx = estimate_request_context_tokens(api_kwargs)
             logger.warning(
                 "Stream stale for %.0fs (threshold %.0fs) — no chunks received. "
                 "model=%s context=~%s tokens. Killing connection.",
diff --git a/agent/codex_responses_adapter.py b/agent/codex_responses_adapter.py
index adea34d094c..07ae5cc9506 100644
--- a/agent/codex_responses_adapter.py
+++ b/agent/codex_responses_adapter.py
@@ -745,7 +745,7 @@ def _preflight_codex_api_kwargs(
         "model", "instructions", "input", "tools", "store",
         "reasoning", "include", "max_output_tokens", "temperature",
         "tool_choice", "parallel_tool_calls", "prompt_cache_key", "service_tier",
-        "extra_headers", "extra_body",
+        "extra_headers", "extra_body", "timeout",
     }
     normalized: Dict[str, Any] = {
         "model": model,
@@ -771,6 +771,13 @@ def _preflight_codex_api_kwargs(
     max_output_tokens = api_kwargs.get("max_output_tokens")
     if isinstance(max_output_tokens, (int, float)) and max_output_tokens > 0:
         normalized["max_output_tokens"] = int(max_output_tokens)
+    timeout = api_kwargs.get("timeout")
+    if (
+        isinstance(timeout, (int, float))
+        and not isinstance(timeout, bool)
+        and 0 < float(timeout) < float("inf")
+    ):
+        normalized["timeout"] = float(timeout)
     temperature = api_kwargs.get("temperature")
     if isinstance(temperature, (int, float)):
         normalized["temperature"] = float(temperature)
diff --git a/agent/transports/codex.py b/agent/transports/codex.py
index 27264f2f38f..970692c0394 100644
--- a/agent/transports/codex.py
+++ b/agent/transports/codex.py
@@ -50,6 +50,7 @@ class ResponsesApiTransport(ProviderTransport):
             reasoning_config: dict | None — {effort, enabled}
             session_id: str | None — used for prompt_cache_key + xAI conv header
             max_tokens: int | None — max_output_tokens
+            timeout: float | None — per-request timeout forwarded to the SDK
             request_overrides: dict | None — extra kwargs merged in
             provider: str | None — provider name for backend-specific logic
             base_url: str | None — endpoint URL
@@ -143,6 +144,20 @@ class ResponsesApiTransport(ProviderTransport):
         if request_overrides:
             kwargs.update(request_overrides)
 
+        # Forward per-request timeout to the SDK so OpenAI/Anthropic clients
+        # honor it.  Without this, ``providers.<id>.request_timeout_seconds``
+        # is silently dropped on the main agent Codex path while the
+        # chat_completions path and auxiliary Codex adapter both forward it.
+        timeout = kwargs.get("timeout", params.get("timeout"))
+        if (
+            isinstance(timeout, (int, float))
+            and not isinstance(timeout, bool)
+            and 0 < float(timeout) < float("inf")
+        ):
+            kwargs["timeout"] = float(timeout)
+        else:
+            kwargs.pop("timeout", None)
+
         if is_codex_backend:
             prompt_cache_key = kwargs.get("prompt_cache_key")
             cache_scope_id = str(prompt_cache_key or session_id or "").strip()
diff --git a/run_agent.py b/run_agent.py
index 4a2f3cb3ba4..6c44c2d9a48 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -885,7 +885,11 @@ class AIAgent:
           1. ``providers.<id>.models.<model>.stale_timeout_seconds``
           2. ``providers.<id>.stale_timeout_seconds``
           3. ``HERMES_API_CALL_STALE_TIMEOUT`` env var
-          4. 300.0s default
+          4. 90.0s default (time-to-first-byte for non-streaming / Codex
+             internal-streaming requests; lowered from 300s in May 2026 so
+             fallback providers kick in faster when upstream providers
+             stall).  The detector still scales up for large contexts in
+             ``_compute_non_stream_stale_timeout``.
 
         Returns ``(timeout_seconds, uses_implicit_default)`` so the caller can
         preserve legacy behaviors that only apply when the user has *not*
@@ -900,20 +904,27 @@ class AIAgent:
         if env_timeout is not None:
             return float(env_timeout), False
 
-        return 300.0, True
+        return 90.0, True
 
-    def _compute_non_stream_stale_timeout(self, messages: list[dict[str, Any]]) -> float:
-        """Compute the effective non-stream stale timeout for this request."""
+    def _compute_non_stream_stale_timeout(self, api_payload: Any) -> float:
+        """Compute the effective non-stream stale timeout for this request.
+
+        Accepts either the full ``api_kwargs`` dict (Chat Completions or
+        Responses API) or a legacy ``messages`` list.  Context-size scaling
+        applies the same way to both shapes via
+        :func:`agent.chat_completion_helpers.estimate_request_context_tokens`.
+        """
         stale_base, uses_implicit_default = self._resolved_api_call_stale_timeout_base()
         base_url = getattr(self, "_base_url", None) or self.base_url or ""
         if uses_implicit_default and base_url and is_local_endpoint(base_url):
             return float("inf")
 
-        est_tokens = sum(len(str(v)) for v in messages) // 4
+        from agent.chat_completion_helpers import estimate_request_context_tokens
+        est_tokens = estimate_request_context_tokens(api_payload)
         if est_tokens > 100_000:
-            return max(stale_base, 600.0)
+            return max(stale_base, 240.0)
         if est_tokens > 50_000:
-            return max(stale_base, 450.0)
+            return max(stale_base, 150.0)
         return stale_base
 
     def _is_openrouter_url(self) -> bool:
diff --git a/scripts/release.py b/scripts/release.py
index f2da64997ea..82c0f2f98e3 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -1240,6 +1240,8 @@ AUTHOR_MAP = {
     "165905879+davidcampbelldc@users.noreply.github.com": "davidcampbelldc",
     "hoangv.pham0803@gmail.com": "hehehe0803",  # PR #26212 salvage (codex kanban writable root)
     "26063003+hehehe0803@users.noreply.github.com": "hehehe0803",
+    "kasunvinod@users.noreply.github.com": "kasunvinod",  # PR #24126 salvage (codex timeout propagation)
+    "15059870+kasunvinod@users.noreply.github.com": "kasunvinod",
     "38348871+vaddisrinivas@users.noreply.github.com": "vaddisrinivas",  # PR #26394 salvage (Docker messaging extra)
     # batch salvage (May 2026 LHF run, group 7)
     "198679067+02356abc@users.noreply.github.com": "02356abc",  # PR #28286 salvage (wecom CLOSING)
diff --git a/tests/agent/test_non_stream_stale_timeout.py b/tests/agent/test_non_stream_stale_timeout.py
new file mode 100644
index 00000000000..702856275f6
--- /dev/null
+++ b/tests/agent/test_non_stream_stale_timeout.py
@@ -0,0 +1,192 @@
+"""Tests for the non-stream stale-call detector context estimator.
+
+Covers:
+- ``estimate_request_context_tokens`` for Chat Completions, Responses API,
+  bare lists, and mixed-shape dicts.
+- ``AIAgent._compute_non_stream_stale_timeout`` with both legacy ``messages``
+  list and full ``api_kwargs`` dicts.
+- The May 2026 default-base change (300s -> 90s) and the lowered
+  context-tier ceilings (450/600 -> 150/240).
+"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+
+
+def _write_config(tmp_path: Path, body: str) -> None:
+    hermes_home = tmp_path
+    (hermes_home / "config.yaml").write_text(body or "{}\n", encoding="utf-8")
+
+
+def _make_agent(tmp_path: Path, **overrides):
+    from run_agent import AIAgent
+    kwargs = dict(
+        model="gpt-5.5",
+        provider="openai-codex",
+        api_key="sk-dummy",
+        base_url="https://chatgpt.com/backend-api/codex",
+        quiet_mode=True,
+        skip_context_files=True,
+        skip_memory=True,
+        platform="cli",
+    )
+    kwargs.update(overrides)
+    return AIAgent(**kwargs)
+
+
+# ── estimator ──────────────────────────────────────────────────────────────
+
+
+def test_estimator_chat_completions_messages():
+    from agent.chat_completion_helpers import estimate_request_context_tokens
+    payload = {
+        "model": "gpt-5.4",
+        "messages": [
+            {"role": "user", "content": "x" * 400},
+            {"role": "assistant", "content": "y" * 400},
+        ],
+    }
+    # 800+ chars from messages -> ~200 tokens (char/4 estimate)
+    assert estimate_request_context_tokens(payload) >= 200
+
+
+def test_estimator_responses_api_input():
+    from agent.chat_completion_helpers import estimate_request_context_tokens
+    payload = {
+        "model": "gpt-5.5",
+        "instructions": "i" * 1000,
+        "input": "x" * 4000,
+        "tools": [{"name": "t", "description": "d" * 200}],
+    }
+    # input(4000) + instructions(1000) + tools (~stringified) -> well over 1000 tokens
+    tokens = estimate_request_context_tokens(payload)
+    assert tokens >= 1200, f"Responses API estimator returned {tokens}"
+
+
+def test_estimator_responses_api_long_session_triggers_tier():
+    """A real long Codex session (large ``input``) should clear the 50k boundary."""
+    from agent.chat_completion_helpers import estimate_request_context_tokens
+    payload = {
+        "model": "gpt-5.5",
+        "input": "x" * 240_000,  # ~60k tokens (240k chars / 4)
+        "instructions": "s" * 4000,
+    }
+    assert estimate_request_context_tokens(payload) > 50_000
+
+
+def test_estimator_bare_list_back_compat():
+    from agent.chat_completion_helpers import estimate_request_context_tokens
+    messages = [
+        {"role": "user", "content": "x" * 800},
+    ]
+    assert estimate_request_context_tokens(messages) >= 200
+
+
+def test_estimator_empty_inputs():
+    from agent.chat_completion_helpers import estimate_request_context_tokens
+    assert estimate_request_context_tokens({}) == 0
+    assert estimate_request_context_tokens([]) == 0
+    assert estimate_request_context_tokens(None) == 0
+
+
+def test_estimator_unknown_dict_fallback():
+    from agent.chat_completion_helpers import estimate_request_context_tokens
+    payload = {"random_field": "z" * 400}
+    assert estimate_request_context_tokens(payload) > 50
+
+
+# ── default base + tier scaling ────────────────────────────────────────────
+
+
+def test_default_base_is_90s(monkeypatch, tmp_path):
+    """Default base stale timeout dropped from 300s to 90s (May 2026)."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / ".env").write_text("", encoding="utf-8")
+    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
+    _write_config(tmp_path, "")
+
+    agent = _make_agent(tmp_path)
+    base, implicit = agent._resolved_api_call_stale_timeout_base()
+    assert base == 90.0
+    assert implicit is True
+
+
+def test_short_codex_request_uses_base_only(monkeypatch, tmp_path):
+    """Codex payload below 50k tokens -> default 90s base."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / ".env").write_text("", encoding="utf-8")
+    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
+    _write_config(tmp_path, "")
+
+    agent = _make_agent(tmp_path)
+    payload = {"model": "gpt-5.5", "input": "hi", "instructions": ""}
+    assert agent._compute_non_stream_stale_timeout(payload) == 90.0
+
+
+def test_long_codex_request_bumps_to_50k_tier(monkeypatch, tmp_path):
+    """Codex payload > 50k tokens -> at least 150s."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / ".env").write_text("", encoding="utf-8")
+    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
+    _write_config(tmp_path, "")
+
+    agent = _make_agent(tmp_path)
+    payload = {"model": "gpt-5.5", "input": "x" * 240_000, "instructions": ""}
+    timeout = agent._compute_non_stream_stale_timeout(payload)
+    assert timeout >= 150.0
+    assert timeout < 240.0
+
+
+def test_very_long_codex_request_bumps_to_100k_tier(monkeypatch, tmp_path):
+    """Codex payload > 100k tokens -> at least 240s."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / ".env").write_text("", encoding="utf-8")
+    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
+    _write_config(tmp_path, "")
+
+    agent = _make_agent(tmp_path)
+    payload = {"model": "gpt-5.5", "input": "x" * 500_000, "instructions": ""}
+    assert agent._compute_non_stream_stale_timeout(payload) >= 240.0
+
+
+def test_chat_completions_long_messages_bumps_tier(monkeypatch, tmp_path):
+    """Chat Completions estimator still works for the legacy messages path."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / ".env").write_text("", encoding="utf-8")
+    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
+    _write_config(tmp_path, "")
+
+    agent = _make_agent(
+        tmp_path,
+        provider="openai",
+        base_url="https://api.openai.com/v1",
+        model="gpt-5.4",
+    )
+    payload = {
+        "model": "gpt-5.4",
+        "messages": [{"role": "user", "content": "x" * 240_000}],
+    }
+    assert agent._compute_non_stream_stale_timeout(payload) >= 150.0
+
+
+def test_explicit_user_config_overrides_default(monkeypatch, tmp_path):
+    """If the user explicitly sets a stale_timeout, the new defaults don't apply."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / ".env").write_text("", encoding="utf-8")
+    _write_config(tmp_path, """\
+providers:
+  openai-codex:
+    stale_timeout_seconds: 1800
+""")
+    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
+
+    import importlib
+    from hermes_cli import timeouts as to_mod
+    importlib.reload(to_mod)
+
+    agent = _make_agent(tmp_path)
+    assert agent._compute_non_stream_stale_timeout({"input": "hi"}) == 1800.0
diff --git a/tests/agent/transports/test_codex_transport.py b/tests/agent/transports/test_codex_transport.py
index a0470fa8de8..96a80827204 100644
--- a/tests/agent/transports/test_codex_transport.py
+++ b/tests/agent/transports/test_codex_transport.py
@@ -452,3 +452,64 @@ class TestCodexNormalizeResponse:
         tc = nr.tool_calls[0]
         assert tc.name == "terminal"
         assert '"command"' in tc.arguments
+
+
+
+class TestCodexTransportTimeout:
+    """Forward per-request timeout from build_kwargs to the SDK kwargs."""
+
+    def test_positive_timeout_preserved(self, transport):
+        kw = transport.build_kwargs(
+            model="gpt-5.5",
+            messages=[{"role": "user", "content": "hi"}],
+            tools=[],
+            timeout=600.0,
+        )
+        assert kw.get("timeout") == 600.0
+
+    def test_zero_timeout_dropped(self, transport):
+        kw = transport.build_kwargs(
+            model="gpt-5.5",
+            messages=[{"role": "user", "content": "hi"}],
+            tools=[],
+            timeout=0,
+        )
+        assert "timeout" not in kw
+
+    def test_none_timeout_omitted(self, transport):
+        kw = transport.build_kwargs(
+            model="gpt-5.5",
+            messages=[{"role": "user", "content": "hi"}],
+            tools=[],
+            timeout=None,
+        )
+        assert "timeout" not in kw
+
+    def test_inf_timeout_dropped(self, transport):
+        kw = transport.build_kwargs(
+            model="gpt-5.5",
+            messages=[{"role": "user", "content": "hi"}],
+            tools=[],
+            timeout=float("inf"),
+        )
+        assert "timeout" not in kw
+
+    def test_bool_timeout_dropped(self, transport):
+        """``True`` is technically int but must not survive — caller bug guard."""
+        kw = transport.build_kwargs(
+            model="gpt-5.5",
+            messages=[{"role": "user", "content": "hi"}],
+            tools=[],
+            timeout=True,
+        )
+        assert "timeout" not in kw
+
+    def test_request_overrides_can_supply_timeout(self, transport):
+        """request_overrides["timeout"] is honored when no explicit kwarg passed."""
+        kw = transport.build_kwargs(
+            model="gpt-5.5",
+            messages=[{"role": "user", "content": "hi"}],
+            tools=[],
+            request_overrides={"timeout": 450.0},
+        )
+        assert kw.get("timeout") == 450.0
diff --git a/tests/hermes_cli/test_timeouts.py b/tests/hermes_cli/test_timeouts.py
index 0f641a5c1b8..93c8cafc0a9 100644
--- a/tests/hermes_cli/test_timeouts.py
+++ b/tests/hermes_cli/test_timeouts.py
@@ -265,7 +265,7 @@ def test_resolved_api_call_stale_timeout_priority(monkeypatch, tmp_path):
     assert agent2._resolved_api_call_stale_timeout_base() == (999.0, False)
 
     monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
-    assert agent2._resolved_api_call_stale_timeout_base() == (300.0, True)
+    assert agent2._resolved_api_call_stale_timeout_base() == (90.0, True)
 
 
 def test_default_non_stream_stale_timeout_auto_disables_for_local_endpoints(monkeypatch, tmp_path):
diff --git a/tests/run_agent/test_openai_client_lifecycle.py b/tests/run_agent/test_openai_client_lifecycle.py
index 35a8ec7a084..e38c1f726e4 100644
--- a/tests/run_agent/test_openai_client_lifecycle.py
+++ b/tests/run_agent/test_openai_client_lifecycle.py
@@ -105,7 +105,7 @@ def test_stale_non_stream_close_is_single_owner(monkeypatch):
     monkeypatch.setattr(run_agent, "OpenAI", factory)
 
     agent = _build_agent()
-    agent._compute_non_stream_stale_timeout = lambda _messages: 0.01
+    agent._compute_non_stream_stale_timeout = lambda api_payload: 0.01
 
     with pytest.raises(APIConnectionError):
         agent._interruptible_api_call({"model": agent.model, "messages": []})
diff --git a/tests/run_agent/test_run_agent_codex_responses.py b/tests/run_agent/test_run_agent_codex_responses.py
index 42948e1c41e..bc575cc676f 100644
--- a/tests/run_agent/test_run_agent_codex_responses.py
+++ b/tests/run_agent/test_run_agent_codex_responses.py
@@ -306,7 +306,10 @@ def test_build_api_kwargs_codex(monkeypatch):
     assert kwargs["parallel_tool_calls"] is True
     assert isinstance(kwargs["prompt_cache_key"], str)
     assert len(kwargs["prompt_cache_key"]) > 0
-    assert "timeout" not in kwargs
+    # ``timeout`` is now wired from ``_resolved_api_call_timeout`` (default 1800s)
+    # so per-provider ``request_timeout_seconds`` actually reaches the SDK.
+    assert isinstance(kwargs.get("timeout"), float)
+    assert kwargs["timeout"] > 0
     assert "max_tokens" not in kwargs
     assert "extra_body" not in kwargs
 
@@ -1053,6 +1056,29 @@ def test_preflight_codex_api_kwargs_allows_service_tier(monkeypatch):
     assert result["service_tier"] == "priority"
 
 
+def test_preflight_codex_api_kwargs_preserves_positive_timeout(monkeypatch):
+    """Positive numeric timeouts survive preflight so the SDK honors them."""
+    agent = _build_agent(monkeypatch)
+    kwargs = _codex_request_kwargs()
+    kwargs["timeout"] = 600.0
+
+    from agent.codex_responses_adapter import _preflight_codex_api_kwargs
+    result = _preflight_codex_api_kwargs(kwargs)
+    assert result["timeout"] == 600.0
+
+
+def test_preflight_codex_api_kwargs_drops_invalid_timeout(monkeypatch):
+    """Zero, negative, inf, and booleans are all dropped — not passed to SDK."""
+    agent = _build_agent(monkeypatch)
+    from agent.codex_responses_adapter import _preflight_codex_api_kwargs
+
+    for bad in (0, -1, float("inf"), True, False, "300", None):
+        kwargs = _codex_request_kwargs()
+        kwargs["timeout"] = bad
+        result = _preflight_codex_api_kwargs(kwargs)
+        assert "timeout" not in result, f"timeout={bad!r} should be dropped"
+
+
 def test_run_conversation_codex_replay_payload_keeps_call_id(monkeypatch):
     agent = _build_agent(monkeypatch)
     responses = [_codex_tool_call_response(), _codex_message_response("done")]