fix(codex): size and propagate timeouts for Responses-API requests; lower stale defaults

Codex / Responses-API requests had three latent timeout bugs that combined
into the long silent hangs reported on #21444:

1. The non-stream stale-call detector estimated context tokens from
   ``api_kwargs["messages"]`` only. Codex / Responses-API payloads carry
   their conversational load in ``input`` (with ``instructions`` and
   ``tools``), so every Codex turn logged ``context=~0 tokens`` and the
   detector never applied its >50k / >100k tier bumps.

2. ``providers.<id>.request_timeout_seconds`` was silently dropped on the
   main Codex path. The chat_completions path and the auxiliary Codex
   adapter both forwarded it; the main path lost it in three places
   (``build_api_kwargs``, ``ResponsesApiTransport.build_kwargs``,
   ``_preflight_codex_api_kwargs``).

3. The streaming stale detector had the same payload-shape bug for
   ``codex_responses`` requests, which route through the non-streaming
   detector (it's the path that emits the user-facing
   "No response from provider for 300s (non-streaming, ...)" warning that
   reporters keep pasting).

This commit:

- Adds ``estimate_request_context_tokens`` in ``chat_completion_helpers``,
  used by both the non-stream and stream detectors. Handles ``messages``
  (Chat Completions), ``input + instructions + tools`` (Responses API),
  bare lists, and an unknown-dict fallback.
- Forwards ``timeout`` through ``ResponsesApiTransport.build_kwargs``
  and ``_preflight_codex_api_kwargs`` (with guards against
  zero/negative/inf/bool values), and wires
  ``_resolved_api_call_timeout()`` into the Codex branch of
  ``build_api_kwargs``.
- Lowers the implicit non-stream stale defaults so fallback providers
  kick in faster when upstream stalls:
    * base   300s -> 90s
    * >50k   450s -> 150s
    * >100k  600s -> 240s
  These only apply when the user has *not* set
  ``providers.<id>.stale_timeout_seconds`` or
  ``HERMES_API_CALL_STALE_TIMEOUT``. Explicit config still wins.
- Adds regression tests for the estimator shapes, the new defaults, the
  context-tier scaling, transport timeout pass-through, and preflight
  timeout pass-through / rejection of invalid values.

Closes #21444
Supersedes #21652 #24126 #31855

Co-authored-by: Hoang V. Pham <26063003+hehehe0803@users.noreply.github.com>
This commit is contained in:
Kasun Athaudahetti 2026-05-25 01:36:22 -07:00 committed by Teknium
parent 76135b329d
commit 2d422720b5
10 changed files with 383 additions and 17 deletions

View file

@ -75,6 +75,59 @@ def _ra():
return run_agent
def estimate_request_context_tokens(api_payload: Any) -> int:
    """Estimate the context/load tokens carried by an API request payload.

    The estimate is a cheap ``characters // 4`` heuristic, not a real
    tokenizer count. Non-string values are measured by the length of their
    ``str()`` form and ``None`` counts as zero characters.

    The stale-call detectors historically assumed a Chat Completions request
    and only looked at ``api_kwargs["messages"]``. Codex / Responses API
    requests carry the conversational payload in ``input`` (with additional
    load in ``instructions`` and ``tools``), so that estimate came out as ~0
    tokens for every Codex turn and the context-tier scaling never fired.

    Supported shapes, checked in this order:

    - bare list                    -> treated as Chat Completions ``messages``
    - dict whose ``messages`` is a list -> ``messages`` plus ``tools``
    - dict with an ``input`` key   -> ``input`` + ``instructions`` + ``tools``
    - any other dict               -> every value, stringified and summed
    - anything else                -> the stringified payload itself

    Args:
        api_payload: The full request kwargs dict, a legacy ``messages``
            list, or any other value (including ``None``).

    Returns:
        The estimated token count; ``0`` for empty or ``None`` payloads.
    """

    def _chars(value: Any) -> int:
        # ``None`` means "field absent" and must not count as len("None").
        if value is None:
            return 0
        return len(value) if isinstance(value, str) else len(str(value))

    def _total_chars(values: Any) -> int:
        return sum(_chars(value) for value in values)

    if isinstance(api_payload, list):
        return _total_chars(api_payload) // 4

    if not isinstance(api_payload, dict):
        return _chars(api_payload) // 4

    messages = api_payload.get("messages")
    if isinstance(messages, list):
        # Chat Completions: a missing ``tools`` key yields None -> 0 chars.
        return (_total_chars(messages) + _chars(api_payload.get("tools"))) // 4

    if "input" in api_payload:
        # Responses API: the conversation lives in ``input``.
        load_fields = ("input", "instructions", "tools")
        return _total_chars(api_payload.get(name) for name in load_fields) // 4

    # Unknown dict shape: size every value so the estimate is never silently 0.
    return _total_chars(api_payload.values()) // 4
def interruptible_api_call(agent, api_kwargs: dict):
"""
@ -200,9 +253,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
# httpx timeout (default 1800s) with zero feedback. The stale
# detector kills the connection early so the main retry loop can
# apply richer recovery (credential rotation, provider fallback).
_stale_timeout = agent._compute_non_stream_stale_timeout(
api_kwargs.get("messages", [])
)
_stale_timeout = agent._compute_non_stream_stale_timeout(api_kwargs)
_call_start = time.time()
agent._touch_activity("waiting for non-streaming API response")
@ -226,7 +277,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
# arrives within the configured timeout.
_elapsed = time.time() - _call_start
if _elapsed > _stale_timeout:
_est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
_est_ctx = estimate_request_context_tokens(api_kwargs)
logger.warning(
"Non-streaming API call stale for %.0fs (threshold %.0fs). "
"model=%s context=~%s tokens. Killing connection.",
@ -362,6 +413,7 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
reasoning_config=agent.reasoning_config,
session_id=getattr(agent, "session_id", None),
max_tokens=agent.max_tokens,
timeout=agent._resolved_api_call_timeout(),
request_overrides=agent.request_overrides,
is_github_responses=is_github_responses,
is_codex_backend=is_codex_backend,
@ -2019,7 +2071,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
# when the context is large. Without this, the stale detector kills
# healthy connections during the model's thinking phase, producing
# spurious RemoteProtocolError ("peer closed connection").
_est_tokens = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
_est_tokens = estimate_request_context_tokens(api_kwargs)
if _est_tokens > 100_000:
_stream_stale_timeout = max(_stream_stale_timeout_base, 300.0)
elif _est_tokens > 50_000:
@ -2055,7 +2107,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
# inner retry loop can start a fresh connection.
_stale_elapsed = time.time() - last_chunk_time["t"]
if _stale_elapsed > _stream_stale_timeout:
_est_ctx = sum(len(str(v)) for v in api_kwargs.get("messages", [])) // 4
_est_ctx = estimate_request_context_tokens(api_kwargs)
logger.warning(
"Stream stale for %.0fs (threshold %.0fs) — no chunks received. "
"model=%s context=~%s tokens. Killing connection.",

View file

@ -745,7 +745,7 @@ def _preflight_codex_api_kwargs(
"model", "instructions", "input", "tools", "store",
"reasoning", "include", "max_output_tokens", "temperature",
"tool_choice", "parallel_tool_calls", "prompt_cache_key", "service_tier",
"extra_headers", "extra_body",
"extra_headers", "extra_body", "timeout",
}
normalized: Dict[str, Any] = {
"model": model,
@ -771,6 +771,13 @@ def _preflight_codex_api_kwargs(
max_output_tokens = api_kwargs.get("max_output_tokens")
if isinstance(max_output_tokens, (int, float)) and max_output_tokens > 0:
normalized["max_output_tokens"] = int(max_output_tokens)
timeout = api_kwargs.get("timeout")
if (
isinstance(timeout, (int, float))
and not isinstance(timeout, bool)
and 0 < float(timeout) < float("inf")
):
normalized["timeout"] = float(timeout)
temperature = api_kwargs.get("temperature")
if isinstance(temperature, (int, float)):
normalized["temperature"] = float(temperature)

View file

@ -50,6 +50,7 @@ class ResponsesApiTransport(ProviderTransport):
reasoning_config: dict | None {effort, enabled}
session_id: str | None used for prompt_cache_key + xAI conv header
max_tokens: int | None max_output_tokens
timeout: float | None per-request timeout forwarded to the SDK
request_overrides: dict | None extra kwargs merged in
provider: str | None provider name for backend-specific logic
base_url: str | None endpoint URL
@ -143,6 +144,20 @@ class ResponsesApiTransport(ProviderTransport):
if request_overrides:
kwargs.update(request_overrides)
# Forward per-request timeout to the SDK so OpenAI/Anthropic clients
# honor it. Without this, ``providers.<id>.request_timeout_seconds``
# is silently dropped on the main agent Codex path while the
# chat_completions path and auxiliary Codex adapter both forward it.
timeout = kwargs.get("timeout", params.get("timeout"))
if (
isinstance(timeout, (int, float))
and not isinstance(timeout, bool)
and 0 < float(timeout) < float("inf")
):
kwargs["timeout"] = float(timeout)
else:
kwargs.pop("timeout", None)
if is_codex_backend:
prompt_cache_key = kwargs.get("prompt_cache_key")
cache_scope_id = str(prompt_cache_key or session_id or "").strip()

View file

@ -885,7 +885,11 @@ class AIAgent:
1. ``providers.<id>.models.<model>.stale_timeout_seconds``
2. ``providers.<id>.stale_timeout_seconds``
3. ``HERMES_API_CALL_STALE_TIMEOUT`` env var
4. 300.0s default
4. 90.0s default (time-to-first-byte for non-streaming / Codex
internal-streaming requests; lowered from 300s in May 2026 so
fallback providers kick in faster when upstream providers
stall). The detector still scales up for large contexts in
``_compute_non_stream_stale_timeout``.
Returns ``(timeout_seconds, uses_implicit_default)`` so the caller can
preserve legacy behaviors that only apply when the user has *not*
@ -900,20 +904,27 @@ class AIAgent:
if env_timeout is not None:
return float(env_timeout), False
return 300.0, True
return 90.0, True
def _compute_non_stream_stale_timeout(self, messages: list[dict[str, Any]]) -> float:
"""Compute the effective non-stream stale timeout for this request."""
def _compute_non_stream_stale_timeout(self, api_payload: Any) -> float:
"""Compute the effective non-stream stale timeout for this request.
Accepts either the full ``api_kwargs`` dict (Chat Completions or
Responses API) or a legacy ``messages`` list. Context-size scaling
applies the same way to both shapes via
:func:`agent.chat_completion_helpers.estimate_request_context_tokens`.
"""
stale_base, uses_implicit_default = self._resolved_api_call_stale_timeout_base()
base_url = getattr(self, "_base_url", None) or self.base_url or ""
if uses_implicit_default and base_url and is_local_endpoint(base_url):
return float("inf")
est_tokens = sum(len(str(v)) for v in messages) // 4
from agent.chat_completion_helpers import estimate_request_context_tokens
est_tokens = estimate_request_context_tokens(api_payload)
if est_tokens > 100_000:
return max(stale_base, 600.0)
return max(stale_base, 240.0)
if est_tokens > 50_000:
return max(stale_base, 450.0)
return max(stale_base, 150.0)
return stale_base
def _is_openrouter_url(self) -> bool:

View file

@ -1240,6 +1240,8 @@ AUTHOR_MAP = {
"165905879+davidcampbelldc@users.noreply.github.com": "davidcampbelldc",
"hoangv.pham0803@gmail.com": "hehehe0803", # PR #26212 salvage (codex kanban writable root)
"26063003+hehehe0803@users.noreply.github.com": "hehehe0803",
"kasunvinod@users.noreply.github.com": "kasunvinod", # PR #24126 salvage (codex timeout propagation)
"15059870+kasunvinod@users.noreply.github.com": "kasunvinod",
"38348871+vaddisrinivas@users.noreply.github.com": "vaddisrinivas", # PR #26394 salvage (Docker messaging extra)
# batch salvage (May 2026 LHF run, group 7)
"198679067+02356abc@users.noreply.github.com": "02356abc", # PR #28286 salvage (wecom CLOSING)

View file

@ -0,0 +1,192 @@
"""Tests for the non-stream stale-call detector context estimator.
Covers:
- ``estimate_request_context_tokens`` for Chat Completions, Responses API,
bare lists, and mixed-shape dicts.
- ``AIAgent._compute_non_stream_stale_timeout`` with both legacy ``messages``
list and full ``api_kwargs`` dicts.
- The May 2026 default-base change (300s -> 90s) and the lowered
context-tier ceilings (450/600 -> 150/240).
"""
from __future__ import annotations
import os
from pathlib import Path
import pytest
def _write_config(tmp_path: Path, body: str) -> None:
    """Write ``config.yaml`` under *tmp_path*; an empty *body* becomes ``{}``."""
    config_file = tmp_path / "config.yaml"
    contents = body if body else "{}\n"
    config_file.write_text(contents, encoding="utf-8")
def _make_agent(tmp_path: Path, **overrides):
    """Build an ``AIAgent`` with Codex defaults, letting *overrides* win.

    ``tmp_path`` is accepted for call-site symmetry with the other helpers;
    it is not read here.
    """
    from run_agent import AIAgent

    defaults = {
        "model": "gpt-5.5",
        "provider": "openai-codex",
        "api_key": "sk-dummy",
        "base_url": "https://chatgpt.com/backend-api/codex",
        "quiet_mode": True,
        "skip_context_files": True,
        "skip_memory": True,
        "platform": "cli",
    }
    return AIAgent(**{**defaults, **overrides})
# ── estimator ──────────────────────────────────────────────────────────────
def test_estimator_chat_completions_messages():
    """A Chat Completions payload is sized from its ``messages`` list."""
    from agent.chat_completion_helpers import estimate_request_context_tokens

    user_turn = {"role": "user", "content": "x" * 400}
    assistant_turn = {"role": "assistant", "content": "y" * 400}
    request = {"model": "gpt-5.4", "messages": [user_turn, assistant_turn]}

    # Two 400-char bodies give 800+ chars, i.e. ~200 tokens at 4 chars/token.
    estimated = estimate_request_context_tokens(request)
    assert estimated >= 200
def test_estimator_responses_api_input():
    """A Responses API payload counts ``input``, ``instructions`` and ``tools``."""
    from agent.chat_completion_helpers import estimate_request_context_tokens

    tool_spec = {"name": "t", "description": "d" * 200}
    request = {
        "model": "gpt-5.5",
        "instructions": "i" * 1000,
        "input": "x" * 4000,
        "tools": [tool_spec],
    }

    # 4000 (input) + 1000 (instructions) + the stringified tools list
    # is comfortably more than 1000 tokens.
    tokens = estimate_request_context_tokens(request)
    assert tokens >= 1200, f"Responses API estimator returned {tokens}"
def test_estimator_responses_api_long_session_triggers_tier():
    """A long Codex session (large ``input``) must clear the 50k-token boundary."""
    from agent.chat_completion_helpers import estimate_request_context_tokens

    long_input = "x" * 240_000  # 240k chars / 4 is roughly 60k tokens
    request = {
        "model": "gpt-5.5",
        "input": long_input,
        "instructions": "s" * 4000,
    }

    estimated = estimate_request_context_tokens(request)
    assert estimated > 50_000
def test_estimator_bare_list_back_compat():
    """A legacy bare ``messages`` list is still accepted by the estimator."""
    from agent.chat_completion_helpers import estimate_request_context_tokens

    legacy_messages = [{"role": "user", "content": "x" * 800}]

    estimated = estimate_request_context_tokens(legacy_messages)
    assert estimated >= 200
def test_estimator_empty_inputs():
    """Empty dict, empty list and ``None`` all estimate to zero tokens."""
    from agent.chat_completion_helpers import estimate_request_context_tokens

    for empty_payload in ({}, [], None):
        assert estimate_request_context_tokens(empty_payload) == 0
def test_estimator_unknown_dict_fallback():
    """A dict with neither ``messages`` nor ``input`` is sized from its values."""
    from agent.chat_completion_helpers import estimate_request_context_tokens

    unknown_shape = {"random_field": "z" * 400}

    estimated = estimate_request_context_tokens(unknown_shape)
    assert estimated > 50
# ── default base + tier scaling ────────────────────────────────────────────
def test_default_base_is_90s(monkeypatch, tmp_path):
    """With no config or env override the implicit base stale timeout is 90s."""
    # Isolate the agent from any real Hermes home / env override.
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    env_file = tmp_path / ".env"
    env_file.write_text("", encoding="utf-8")
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
    _write_config(tmp_path, "")

    agent = _make_agent(tmp_path)
    resolved = agent._resolved_api_call_stale_timeout_base()

    base_seconds, is_implicit = resolved
    assert base_seconds == 90.0
    assert is_implicit is True
def test_short_codex_request_uses_base_only(monkeypatch, tmp_path):
    """A Codex payload under 50k tokens keeps the default 90s base."""
    # Isolate the agent from any real Hermes home / env override.
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    env_file = tmp_path / ".env"
    env_file.write_text("", encoding="utf-8")
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
    _write_config(tmp_path, "")

    agent = _make_agent(tmp_path)
    small_request = {"model": "gpt-5.5", "input": "hi", "instructions": ""}

    stale_timeout = agent._compute_non_stream_stale_timeout(small_request)
    assert stale_timeout == 90.0
def test_long_codex_request_bumps_to_50k_tier(monkeypatch, tmp_path):
    """A Codex payload above 50k tokens lands in the 150s tier, below 240s."""
    # Isolate the agent from any real Hermes home / env override.
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    env_file = tmp_path / ".env"
    env_file.write_text("", encoding="utf-8")
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
    _write_config(tmp_path, "")

    agent = _make_agent(tmp_path)
    mid_request = {"model": "gpt-5.5", "input": "x" * 240_000, "instructions": ""}

    stale_timeout = agent._compute_non_stream_stale_timeout(mid_request)
    assert 150.0 <= stale_timeout < 240.0
def test_very_long_codex_request_bumps_to_100k_tier(monkeypatch, tmp_path):
    """A Codex payload above 100k tokens gets a stale timeout of at least 240s."""
    # Isolate the agent from any real Hermes home / env override.
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    env_file = tmp_path / ".env"
    env_file.write_text("", encoding="utf-8")
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
    _write_config(tmp_path, "")

    agent = _make_agent(tmp_path)
    huge_request = {"model": "gpt-5.5", "input": "x" * 500_000, "instructions": ""}

    stale_timeout = agent._compute_non_stream_stale_timeout(huge_request)
    assert stale_timeout >= 240.0
def test_chat_completions_long_messages_bumps_tier(monkeypatch, tmp_path):
    """The legacy ``messages`` shape still triggers context-tier scaling."""
    # Isolate the agent from any real Hermes home / env override.
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    env_file = tmp_path / ".env"
    env_file.write_text("", encoding="utf-8")
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
    _write_config(tmp_path, "")

    openai_overrides = {
        "provider": "openai",
        "base_url": "https://api.openai.com/v1",
        "model": "gpt-5.4",
    }
    agent = _make_agent(tmp_path, **openai_overrides)

    long_turn = {"role": "user", "content": "x" * 240_000}
    request = {"model": "gpt-5.4", "messages": [long_turn]}

    stale_timeout = agent._compute_non_stream_stale_timeout(request)
    assert stale_timeout >= 150.0
def test_explicit_user_config_overrides_default(monkeypatch, tmp_path):
    """An explicit ``stale_timeout_seconds`` wins over the implicit defaults."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    (tmp_path / ".env").write_text("", encoding="utf-8")
    # The YAML nesting is significant: the key must sit under
    # ``providers.openai-codex`` for the per-provider lookup to find it.
    _write_config(tmp_path, """\
providers:
  openai-codex:
    stale_timeout_seconds: 1800
""")
    monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)

    # NOTE(review): the reload presumably drops config state cached by
    # ``hermes_cli.timeouts`` from earlier tests — confirm against that module.
    import importlib
    from hermes_cli import timeouts as to_mod
    importlib.reload(to_mod)

    agent = _make_agent(tmp_path)
    assert agent._compute_non_stream_stale_timeout({"input": "hi"}) == 1800.0

View file

@ -452,3 +452,64 @@ class TestCodexNormalizeResponse:
tc = nr.tool_calls[0]
assert tc.name == "terminal"
assert '"command"' in tc.arguments
class TestCodexTransportTimeout:
    """``build_kwargs`` forwards a valid per-request timeout and drops bad ones."""

    @staticmethod
    def _build(transport, **extra):
        """Call ``build_kwargs`` with a minimal request plus *extra* kwargs."""
        return transport.build_kwargs(
            model="gpt-5.5",
            messages=[{"role": "user", "content": "hi"}],
            tools=[],
            **extra,
        )

    def test_positive_timeout_preserved(self, transport):
        built = self._build(transport, timeout=600.0)
        assert built.get("timeout") == 600.0

    def test_zero_timeout_dropped(self, transport):
        built = self._build(transport, timeout=0)
        assert "timeout" not in built

    def test_none_timeout_omitted(self, transport):
        built = self._build(transport, timeout=None)
        assert "timeout" not in built

    def test_inf_timeout_dropped(self, transport):
        built = self._build(transport, timeout=float("inf"))
        assert "timeout" not in built

    def test_bool_timeout_dropped(self, transport):
        """``True`` is an ``int`` subclass but must be rejected as a timeout."""
        built = self._build(transport, timeout=True)
        assert "timeout" not in built

    def test_request_overrides_can_supply_timeout(self, transport):
        """``request_overrides["timeout"]`` is used when no ``timeout`` kwarg is given."""
        built = self._build(transport, request_overrides={"timeout": 450.0})
        assert built.get("timeout") == 450.0

View file

@ -265,7 +265,7 @@ def test_resolved_api_call_stale_timeout_priority(monkeypatch, tmp_path):
assert agent2._resolved_api_call_stale_timeout_base() == (999.0, False)
monkeypatch.delenv("HERMES_API_CALL_STALE_TIMEOUT", raising=False)
assert agent2._resolved_api_call_stale_timeout_base() == (300.0, True)
assert agent2._resolved_api_call_stale_timeout_base() == (90.0, True)
def test_default_non_stream_stale_timeout_auto_disables_for_local_endpoints(monkeypatch, tmp_path):

View file

@ -105,7 +105,7 @@ def test_stale_non_stream_close_is_single_owner(monkeypatch):
monkeypatch.setattr(run_agent, "OpenAI", factory)
agent = _build_agent()
agent._compute_non_stream_stale_timeout = lambda _messages: 0.01
agent._compute_non_stream_stale_timeout = lambda api_payload: 0.01
with pytest.raises(APIConnectionError):
agent._interruptible_api_call({"model": agent.model, "messages": []})

View file

@ -306,7 +306,10 @@ def test_build_api_kwargs_codex(monkeypatch):
assert kwargs["parallel_tool_calls"] is True
assert isinstance(kwargs["prompt_cache_key"], str)
assert len(kwargs["prompt_cache_key"]) > 0
assert "timeout" not in kwargs
# ``timeout`` is now wired from ``_resolved_api_call_timeout`` (default 1800s)
# so per-provider ``request_timeout_seconds`` actually reaches the SDK.
assert isinstance(kwargs.get("timeout"), float)
assert kwargs["timeout"] > 0
assert "max_tokens" not in kwargs
assert "extra_body" not in kwargs
@ -1053,6 +1056,29 @@ def test_preflight_codex_api_kwargs_allows_service_tier(monkeypatch):
assert result["service_tier"] == "priority"
def test_preflight_codex_api_kwargs_preserves_positive_timeout(monkeypatch):
    """A positive numeric timeout passes through preflight unchanged."""
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _preflight_codex_api_kwargs

    request_kwargs = _codex_request_kwargs()
    request_kwargs["timeout"] = 600.0

    normalized = _preflight_codex_api_kwargs(request_kwargs)
    assert normalized["timeout"] == 600.0
def test_preflight_codex_api_kwargs_drops_invalid_timeout(monkeypatch):
    """Unusable timeouts are dropped by preflight instead of reaching the SDK.

    Covers zero, negative, infinite (either sign), NaN, booleans,
    non-numeric strings and ``None``.
    """
    agent = _build_agent(monkeypatch)
    from agent.codex_responses_adapter import _preflight_codex_api_kwargs

    invalid_timeouts = (
        0,
        -1,
        float("inf"),
        float("-inf"),
        float("nan"),  # fails the ``0 < t < inf`` comparison
        True,
        False,
        "300",
        None,
    )
    for bad in invalid_timeouts:
        kwargs = _codex_request_kwargs()
        kwargs["timeout"] = bad
        result = _preflight_codex_api_kwargs(kwargs)
        assert "timeout" not in result, f"timeout={bad!r} should be dropped"
def test_run_conversation_codex_replay_payload_keeps_call_id(monkeypatch):
agent = _build_agent(monkeypatch)
responses = [_codex_tool_call_response(), _codex_message_response("done")]