fix(codex): add time-to-first-byte watchdog for stalled Codex streams

The chatgpt.com/backend-api/codex endpoint has an intermittent failure mode where it accepts the connection but never emits a single stream event — the socket just hangs. Direct sequential probing reproduces it (0 events, no HTTP status), and a fresh reconnect then succeeds in ~2s. Today the only guard is the wall-clock stale timeout in interruptible_api_call, so a dead-on-arrival connection is held for the full stale window (90-900s depending on context / config) before the retry loop can reconnect — minutes of wasted wall time per stall, at a rate of ~20% of calls during affected windows. Add a TTFB watchdog scoped to the codex_responses path: - codex_runtime.run_codex_stream stamps agent._codex_stream_last_event_ts on *every* stream event (not just output-text deltas), so reasoning-only and tool-call-only turns are not mistaken for a stall. - interruptible_api_call resets that marker before the worker starts and, while it is still None, kills the connection once elapsed exceeds the TTFB cutoff (default 45s, tunable via HERMES_CODEX_TTFB_TIMEOUT_SECONDS, 0 disables). The raised TimeoutError flows through the existing retry path unchanged. Once any event has arrived the stream is healthy and only the existing wall-clock stale timeout applies, so legitimate long generations are never interrupted. Gated to codex_responses; the chat_completions non-stream, anthropic and bedrock branches have no first-event signal and are untouched. Adds tests/agent/test_codex_ttfb_watchdog.py covering the stall kill, the events-flowing pass-through, and the env-disable path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-29 06:31:32 +00:00 · 2026-05-25 09:09:22 +00:00 · 2026-05-25 09:09:22 +00:00 · 8601c4d44c
commit 8601c4d44c
parent a989a79c0c
3 changed files with 248 additions and 1 deletions
--- a/tests/agent/test_codex_ttfb_watchdog.py
+++ b/tests/agent/test_codex_ttfb_watchdog.py
@ -0,0 +1,175 @@
+"""Regression tests for the Codex time-to-first-byte (TTFB) watchdog.
+
+The chatgpt.com/backend-api/codex endpoint has an intermittent failure mode
+where it accepts the connection but never emits a single stream event. The
+watchdog in ``interruptible_api_call`` kills such a connection at a short TTFB
+cutoff (instead of waiting out the much longer wall-clock stale timeout) so the
+retry loop can reconnect promptly. Once any stream event arrives, the stream is
+considered healthy and only the wall-clock stale timeout applies — long
+generations must never be interrupted by the TTFB cutoff.
+
+The "bytes flowing" signal is ``agent._codex_stream_last_event_ts``, set on
+*any* event by ``codex_runtime.run_codex_stream`` — so reasoning-only or
+tool-call-only turns (which emit no output-text deltas) are not mistaken for a
+stall.
+"""
+
+from __future__ import annotations
+
+import sys
+import time
+import types
+from types import SimpleNamespace
+
+import pytest
+
+# Stub optional heavy imports so run_agent imports cleanly in isolation.
+sys.modules.setdefault("fire", types.SimpleNamespace(Fire=lambda *a, **k: None))
+sys.modules.setdefault("firecrawl", types.SimpleNamespace(Firecrawl=object))
+sys.modules.setdefault("fal_client", types.SimpleNamespace())
+
+
+def _make_codex_agent(tmp_path, monkeypatch):
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    (tmp_path / ".env").write_text("", encoding="utf-8")
+    (tmp_path / "config.yaml").write_text("{}\n", encoding="utf-8")
+    from run_agent import AIAgent
+
+    agent = AIAgent(
+        model="gpt-5.5",
+        provider="openai-codex",
+        api_key="sk-dummy",
+        base_url="https://chatgpt.com/backend-api/codex",
+        quiet_mode=True,
+        skip_context_files=True,
+        skip_memory=True,
+        platform="cli",
+    )
+    # The watchdog is gated on the codex_responses api_mode; assert/force it so
+    # the test is robust to detection-logic changes elsewhere.
+    agent.api_mode = "codex_responses"
+    monkeypatch.setattr(agent, "_emit_status", lambda *a, **k: None)
+    # Keep the wall-clock stale timeout high so any early kill is unambiguously
+    # the TTFB path, not the stale-call path.
+    monkeypatch.setattr(
+        agent, "_compute_non_stream_stale_timeout", lambda *a, **k: 60.0
+    )
+    return agent
+
+
+def test_ttfb_kills_when_no_stream_event(tmp_path, monkeypatch):
+    """Backend accepts the connection but emits no event -> killed at the TTFB
+    cutoff, well before the 60s wall-clock stale timeout, with a retryable
+    TimeoutError and a ``codex_ttfb_kill`` close reason."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    monkeypatch.setenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", "1")
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent, "_abort_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+    monkeypatch.setattr(
+        agent, "_close_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+
+    stop = {"flag": False}
+
+    def fake_hang(api_kwargs, client=None, on_first_delta=None):
+        # Never set _codex_stream_last_event_ts: simulate zero events arriving.
+        deadline = time.time() + 30
+        while time.time() < deadline and not stop["flag"] and not agent._interrupt_requested:
+            time.sleep(0.02)
+        raise RuntimeError("connection closed")
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_hang)
+
+    t0 = time.time()
+    try:
+        with pytest.raises(TimeoutError) as excinfo:
+            h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"})
+        elapsed = time.time() - t0
+        assert "TTFB" in str(excinfo.value)
+        assert "codex_ttfb_kill" in closes
+        # ~1s cutoff + 2s join grace; must be far under the 60s stale timeout.
+        assert elapsed < 15, f"TTFB watchdog took {elapsed:.1f}s"
+    finally:
+        stop["flag"] = True
+
+
+def test_ttfb_does_not_kill_when_events_flow(tmp_path, monkeypatch):
+    """Once a stream event has arrived, a generation that runs past the TTFB
+    cutoff is NOT killed by the watchdog — it completes normally."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    monkeypatch.setenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", "1")
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent, "_abort_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+    monkeypatch.setattr(
+        agent, "_close_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+
+    sentinel = SimpleNamespace(ok=True)
+
+    def fake_stream(api_kwargs, client=None, on_first_delta=None):
+        # Bytes flowing: mark stream activity right away, then keep generating
+        # past the 1s TTFB cutoff before returning a real response.
+        agent._codex_stream_last_event_ts = time.time()
+        if on_first_delta:
+            on_first_delta()
+        time.sleep(2.0)
+        return sentinel
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_stream)
+
+    resp = h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"})
+    assert resp is sentinel
+    assert "codex_ttfb_kill" not in closes
+
+
+def test_ttfb_disabled_via_env_zero(tmp_path, monkeypatch):
+    """Setting HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0 disables the TTFB watchdog;
+    a no-event stall then falls through to the (here, 60s) stale timeout, so a
+    short hang is NOT killed by TTFB."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    monkeypatch.setenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", "0")
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent, "_abort_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+    monkeypatch.setattr(
+        agent, "_close_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+
+    sentinel = SimpleNamespace(ok=True)
+
+    def fake_stream(api_kwargs, client=None, on_first_delta=None):
+        # No event marker, but only briefly — well under the 60s stale timeout.
+        time.sleep(2.0)
+        return sentinel
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_stream)
+
+    resp = h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"})
+    assert resp is sentinel
+    assert "codex_ttfb_kill" not in closes