fix(agent): tolerate large codex stream prefill

2026-05-29 06:31:32 +00:00 · 2026-05-28 02:42:18 +09:00 · 2026-05-28 02:42:18 +09:00 · 283bb810e7
commit 283bb810e7
parent 486d632cc2
2 changed files with 294 additions and 18 deletions
--- a/tests/agent/test_codex_ttfb_watchdog.py
+++ b/tests/agent/test_codex_ttfb_watchdog.py
@@ -4,9 +4,9 @@ The chatgpt.com/backend-api/codex endpoint has an intermittent failure mode
 where it accepts the connection but never emits a single stream event. The
 watchdog in ``interruptible_api_call`` kills such a connection at a short TTFB
 cutoff (instead of waiting out the much longer wall-clock stale timeout) so the
-retry loop can reconnect promptly. Once any stream event arrives, the stream is
-considered healthy and only the wall-clock stale timeout applies — long
-generations must never be interrupted by the TTFB cutoff.
+retry loop can reconnect promptly. Once any stream event arrives, the TTFB
+watchdog is satisfied and a separate idle watchdog handles streams that stop
+emitting SSE events.

 The "bytes flowing" signal is ``agent._codex_stream_last_event_ts``, set on
 *any* event by ``codex_runtime.run_codex_stream`` — so reasoning-only or
@@ -148,6 +148,49 @@ def test_ttfb_includes_silent_hang_hint_for_gpt_5_5(tmp_path, monkeypatch):
        stop["flag"] = True


+def test_ttfb_high_env_is_capped_for_openai_codex(tmp_path, monkeypatch):
+    """A stale local env value like 90s must not make openai-codex wait 90s
+    before reconnecting when the backend emits no SSE frames."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    monkeypatch.setenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", "90")
+    monkeypatch.setenv("HERMES_CODEX_TTFB_MAX_SECONDS", "1")
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent, "_abort_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+    monkeypatch.setattr(
+        agent, "_close_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+
+    stop = {"flag": False}
+
+    def fake_hang(api_kwargs, client=None, on_first_delta=None):
+        deadline = time.time() + 30
+        while time.time() < deadline and not stop["flag"] and not agent._interrupt_requested:
+            time.sleep(0.02)
+        raise RuntimeError("connection closed")
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_hang)
+
+    t0 = time.time()
+    try:
+        with pytest.raises(TimeoutError) as excinfo:
+            h.interruptible_api_call(agent, {"model": "gpt-5.4", "input": "hi"})
+        elapsed = time.time() - t0
+        assert "TTFB threshold: 1s" in str(excinfo.value)
+        assert "codex_ttfb_kill" in closes
+        assert elapsed < 15, f"TTFB watchdog ignored cap and took {elapsed:.1f}s"
+    finally:
+        stop["flag"] = True
+
+
 def test_ttfb_does_not_kill_when_events_flow(tmp_path, monkeypatch):
    """Once a stream event has arrived, a generation that runs past the TTFB
    cutoff is NOT killed by the watchdog — it completes normally."""
@@ -186,6 +229,51 @@ def test_ttfb_does_not_kill_when_events_flow(tmp_path, monkeypatch):
    assert "codex_ttfb_kill" not in closes


+def test_event_idle_kills_after_first_event_then_silence(tmp_path, monkeypatch):
+    """If Codex emits an opening SSE event and then goes silent, kill it via
+    the stream-idle watchdog instead of waiting for the long non-stream stale
+    timeout."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    monkeypatch.setenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", "10")
+    monkeypatch.setenv("HERMES_CODEX_EVENT_STALE_TIMEOUT_SECONDS", "1")
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent,
+        "_abort_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+    monkeypatch.setattr(
+        agent,
+        "_close_request_openai_client",
+        lambda c, reason=None: closes.append(reason),
+    )
+
+    stop = {"flag": False}
+
+    def fake_stream(api_kwargs, client=None, on_first_delta=None):
+        agent._codex_stream_last_event_ts = time.time()
+        deadline = time.time() + 30
+        while time.time() < deadline and not stop["flag"] and not agent._interrupt_requested:
+            time.sleep(0.02)
+        raise RuntimeError("connection closed")
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_stream)
+
+    try:
+        with pytest.raises(TimeoutError) as excinfo:
+            h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"})
+        assert "after first byte" in str(excinfo.value)
+        assert "codex_stream_idle_kill" in closes
+        assert "codex_ttfb_kill" not in closes
+    finally:
+        stop["flag"] = True
+
+
 def test_ttfb_disabled_via_env_zero(tmp_path, monkeypatch):
    """Setting HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0 disables the TTFB watchdog;
    a no-event stall then falls through to the (here, 60s) stale timeout, so a
@@ -219,3 +307,77 @@ def test_ttfb_disabled_via_env_zero(tmp_path, monkeypatch):
    resp = h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"})
    assert resp is sentinel
    assert "codex_ttfb_kill" not in closes
+
+
+def test_large_codex_request_waits_instead_of_ttfb_reconnect(tmp_path, monkeypatch):
+    """Large Codex inputs can legitimately take longer than the small-request
+    first-byte cutoff before the first SSE frame. Preserve the full input and
+    wait instead of killing/retrying at TTFB."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    monkeypatch.setenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", "1")
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent, "_abort_request_openai_client", lambda c, reason=None: closes.append(reason)
+    )
+    monkeypatch.setattr(
+        agent, "_close_request_openai_client", lambda c, reason=None: closes.append(reason)
+    )
+
+    sentinel = SimpleNamespace(ok=True)
+
+    def fake_stream(api_kwargs, client=None, on_first_delta=None):
+        # No event marker for 2s: this would trip the 1s TTFB watchdog on a
+        # small request, but should be allowed for a large request.
+        time.sleep(2.0)
+        return sentinel
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_stream)
+
+    large_input = "x" * 120_000  # ~30k estimated tokens, above large-request gate.
+    resp = h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": large_input})
+    assert resp is sentinel
+    assert "codex_ttfb_kill" not in closes
+
+
+def test_large_codex_request_strict_ttfb_env_still_reconnects(tmp_path, monkeypatch):
+    """Operators can force the old early-reconnect behavior for large inputs
+    with HERMES_CODEX_TTFB_STRICT=1."""
+    from agent import chat_completion_helpers as h
+
+    agent = _make_codex_agent(tmp_path, monkeypatch)
+    monkeypatch.setenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", "1")
+    monkeypatch.setenv("HERMES_CODEX_TTFB_STRICT", "1")
+
+    closes: list = []
+    dummy_client = SimpleNamespace()
+    monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client)
+    monkeypatch.setattr(
+        agent, "_abort_request_openai_client", lambda c, reason=None: closes.append(reason)
+    )
+    monkeypatch.setattr(
+        agent, "_close_request_openai_client", lambda c, reason=None: closes.append(reason)
+    )
+
+    stop = {"flag": False}
+
+    def fake_hang(api_kwargs, client=None, on_first_delta=None):
+        deadline = time.time() + 30
+        while time.time() < deadline and not stop["flag"] and not agent._interrupt_requested:
+            time.sleep(0.02)
+        raise RuntimeError("connection closed")
+
+    monkeypatch.setattr(agent, "_run_codex_stream", fake_hang)
+
+    large_input = "x" * 120_000
+    try:
+        with pytest.raises(TimeoutError) as excinfo:
+            h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": large_input})
+        assert "TTFB threshold: 1s" in str(excinfo.value)
+        assert "codex_ttfb_kill" in closes
+    finally:
+        stop["flag"] = True