fix(codex): relax no-byte TTFB watchdog default from 12s to 120s

The chatgpt.com/backend-api/codex endpoint can spend tens of seconds in
backend admission / prompt prefill before emitting its first SSE event. The
12s no-byte TTFB cutoff aborted those still-valid streams, surfacing as
'Codex stream produced no bytes within 12s' through all retries (Discord
reports). The OpenAI SDK's own streaming read timeout is 600s, so 12s was
~50x more aggressive than the transport layer would have tolerated.

Default the no-byte cutoff to 120s and raise the openai-codex MAX cap default
to 120s so it no longer clamps the new default back to 20s. Disabling stays
available via HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0; the 25k-token auto-disable,
_STRICT override, and post-first-event idle watchdog are unchanged.

Co-authored-by: Gille <4317663+helix4u@users.noreply.github.com>
This commit is contained in:
teknium1 2026-05-29 01:24:53 -07:00 committed by Teknium
parent 6bebab4761
commit 73d73f1f0d
2 changed files with 50 additions and 2 deletions

View file

@ -284,8 +284,15 @@ def interruptible_api_call(agent, api_kwargs: dict):
else:
_codex_idle_timeout_default = 12.0
# No-byte TTFB cutoff. The OpenAI SDK's own streaming read timeout is far
# longer (openai 2.x DEFAULT_TIMEOUT.read = 600s), so a tight 12s default
# killed subscription-backed Codex requests mid-prefill before the backend
# had a chance to emit its first SSE event. Default to 120s — long enough to
# clear normal backend admission / prompt prefill, short enough to still
# reconnect promptly when the socket is genuinely wedged. Set
# HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0 to disable this watchdog entirely.
_ttfb_enabled = _codex_watchdog_enabled
_ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 12.0)
_ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 120.0)
if _ttfb_timeout <= 0:
_ttfb_enabled = False
elif _openai_codex_backend:
@ -307,7 +314,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
_ttfb_disable_above,
)
else:
_ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 20.0)
_ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 120.0)
if _ttfb_cap > 0 and _ttfb_timeout > _ttfb_cap:
logger.info(
"Capping openai-codex no-byte TTFB timeout from %.0fs to %.0fs "

View file

@ -102,6 +102,47 @@ def test_ttfb_kills_when_no_stream_event(tmp_path, monkeypatch):
stop["flag"] = True
def test_ttfb_default_tolerates_slow_first_event(tmp_path, monkeypatch):
    """A slow-but-alive Codex stream survives the default no-byte TTFB cutoff.

    Both TTFB environment overrides are removed so the built-in defaults
    apply. The stubbed stream blocks for about two seconds before reporting
    its first event and returning; the call must hand back that response and
    must never record a ``codex_ttfb_kill`` abort/close reason.

    NOTE(review): a 2s delay is also shorter than the former 12s default, so
    this test appears to pass under either default -- confirm whether a
    delay above 12s (e.g. via a patched clock) is needed to pin the new
    120s value.
    """
    from agent import chat_completion_helpers as h

    agent = _make_codex_agent(tmp_path, monkeypatch)

    # Exercise the defaults: make sure neither TTFB knob is set in the env.
    for env_name in (
        "HERMES_CODEX_TTFB_TIMEOUT_SECONDS",
        "HERMES_CODEX_TTFB_MAX_SECONDS",
    ):
        monkeypatch.delenv(env_name, raising=False)

    closes: list = []
    dummy_client = SimpleNamespace()

    def record_reason(c, reason=None):
        # Capture every abort/close reason so the test can inspect them later.
        closes.append(reason)

    monkeypatch.setattr(
        agent, "_create_request_openai_client", lambda **k: dummy_client
    )
    for hook_name in (
        "_abort_request_openai_client",
        "_close_request_openai_client",
    ):
        monkeypatch.setattr(agent, hook_name, record_reason)

    sentinel = SimpleNamespace(ok=True)

    def fake_slow_first_event(api_kwargs, client=None, on_first_delta=None):
        # Simulate slow backend admission: stay silent for ~2s, then stamp
        # the agent's last-event timestamp (presumably what the no-byte
        # detector inspects -- confirm) and hand back the canned response.
        time.sleep(2.0)
        agent._codex_stream_last_event_ts = time.time()
        return sentinel

    monkeypatch.setattr(agent, "_run_codex_stream", fake_slow_first_event)

    resp = h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"})

    assert resp is sentinel
    assert "codex_ttfb_kill" not in closes
def test_ttfb_includes_silent_hang_hint_for_gpt_5_5(tmp_path, monkeypatch):
"""The no-first-byte watchdog should surface the same actionable hint as the
stale-call timeout path when the model matches the silent-hang heuristic."""