fix(codex): relax no-byte TTFB watchdog default from 12s to 120s

The chatgpt.com/backend-api/codex endpoint can spend tens of seconds in backend admission / prompt prefill before emitting its first SSE event. The 12s no-byte TTFB cutoff aborted those still-valid streams, which surfaced as 'Codex stream produced no bytes within 12s' on every retry (per Discord reports). The OpenAI SDK's own streaming read timeout is 600s, so the 12s cutoff was ~50x tighter than the transport layer itself would have tolerated. Default the no-byte cutoff to 120s, and raise the openai-codex MAX cap default from 20s to 120s so the cap no longer clamps the new default back down to 20s. Disabling stays available via HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0; the 25k-token auto-disable, the _STRICT override, and the post-first-event idle watchdog are unchanged.

Co-authored-by: Gille <4317663+helix4u@users.noreply.github.com>
2026-07-26 17:38:36 +00:00 · 2026-05-29 01:24:53 -07:00 · 2026-05-29 01:24:53 -07:00 · 73d73f1f0d
commit 73d73f1f0d
parent 6bebab4761
2 changed files with 50 additions and 2 deletions
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -284,8 +284,15 @@ def interruptible_api_call(agent, api_kwargs: dict):
    else:
        _codex_idle_timeout_default = 12.0

+    # No-byte TTFB cutoff. The OpenAI SDK's own streaming read timeout is far
+    # longer (openai 2.x DEFAULT_TIMEOUT.read = 600s), so a tight 12s default
+    # killed subscription-backed Codex requests mid-prefill before the backend
+    # had a chance to emit its first SSE event. Default to 120s — long enough to
+    # clear normal backend admission / prompt prefill, short enough to still
+    # reconnect promptly when the socket is genuinely wedged. Set
+    # HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0 to disable this watchdog entirely.
    _ttfb_enabled = _codex_watchdog_enabled
-    _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 12.0)
+    _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 120.0)
    if _ttfb_timeout <= 0:
        _ttfb_enabled = False
    elif _openai_codex_backend:
@ -307,7 +314,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
                _ttfb_disable_above,
            )
        else:
-            _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 20.0)
+            _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 120.0)
            if _ttfb_cap > 0 and _ttfb_timeout > _ttfb_cap:
                logger.info(
                    "Capping openai-codex no-byte TTFB timeout from %.0fs to %.0fs "
--- a/tests/agent/test_codex_ttfb_watchdog.py
+++ b/tests/agent/test_codex_ttfb_watchdog.py
@ -102,6 +102,47 @@ def test_ttfb_kills_when_no_stream_event(tmp_path, monkeypatch):
        stop["flag"] = True


+def test_ttfb_default_tolerates_slow_first_event(tmp_path, monkeypatch):
+    """With no env var set, the no-byte TTFB default is generous (120s), so a
+    request whose first stream event is merely slow (~2s of backend admission /
+    prefill) is NOT killed. This models the subscription-backed Codex case; note
+    the 2s delay is also under the old 12s default, so it smoke-tests the path."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    # Default behavior: no explicit TTFB override.
+    monkeypatch.delenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", raising=False)
+    monkeypatch.delenv("HERMES_CODEX_TTFB_MAX_SECONDS", raising=False)
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent, "_abort_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+    monkeypatch.setattr(
+        agent, "_close_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+
+    sentinel = SimpleNamespace(ok=True)
+
+    def fake_slow_first_event(api_kwargs, client=None, on_first_delta=None):
+        # Backend is alive but slow to admit: first event lands after ~2s,
+        # well under the 120s default cutoff. Mark the first byte so the
+        # no-byte detector sees activity, then return the response.
+        time.sleep(2.0)
+        agent._codex_stream_last_event_ts = time.time()
+        return sentinel
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_slow_first_event)
+
+    resp = h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"})
+    assert resp is sentinel
+    assert "codex_ttfb_kill" not in closes
+
+
 def test_ttfb_includes_silent_hang_hint_for_gpt_5_5(tmp_path, monkeypatch):
    """The no-first-byte watchdog should surface the same actionable hint as the
    stale-call timeout path when the model matches the silent-hang heuristic."""