diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py index 09b5b730fd2..cc882f7e55c 100644 --- a/agent/chat_completion_helpers.py +++ b/agent/chat_completion_helpers.py @@ -284,8 +284,15 @@ def interruptible_api_call(agent, api_kwargs: dict): else: _codex_idle_timeout_default = 12.0 + # No-byte TTFB cutoff. The OpenAI SDK's own streaming read timeout is far + # longer (openai 2.x DEFAULT_TIMEOUT.read = 600s), so a tight 12s default + # killed subscription-backed Codex requests mid-prefill before the backend + # had a chance to emit its first SSE event. Default to 120s — long enough to + # clear normal backend admission / prompt prefill, short enough to still + # reconnect promptly when the socket is genuinely wedged. Set + # HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0 to disable this watchdog entirely. _ttfb_enabled = _codex_watchdog_enabled - _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 12.0) + _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 120.0) if _ttfb_timeout <= 0: _ttfb_enabled = False elif _openai_codex_backend: @@ -307,7 +314,7 @@ def interruptible_api_call(agent, api_kwargs: dict): _ttfb_disable_above, ) else: - _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 20.0) + _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 120.0) if _ttfb_cap > 0 and _ttfb_timeout > _ttfb_cap: logger.info( "Capping openai-codex no-byte TTFB timeout from %.0fs to %.0fs " diff --git a/tests/agent/test_codex_ttfb_watchdog.py b/tests/agent/test_codex_ttfb_watchdog.py index 02f3e750c7c..d989d69d1e3 100644 --- a/tests/agent/test_codex_ttfb_watchdog.py +++ b/tests/agent/test_codex_ttfb_watchdog.py @@ -102,6 +102,47 @@ def test_ttfb_kills_when_no_stream_event(tmp_path, monkeypatch): stop["flag"] = True +def test_ttfb_default_tolerates_slow_first_event(tmp_path, monkeypatch): + """With no env var set, the no-byte TTFB default is generous (120s), so a + request whose first stream event is merely slow (~2s of backend 
admission / + prefill) is NOT killed. Scaled-down stand-in for the subscription-backed + Codex case the old 12s default aborted mid-prefill (a smoke check: ~2s is under 12s too).""" + from agent import chat_completion_helpers as h + + agent = _make_codex_agent(tmp_path, monkeypatch) + # Default behavior: no explicit TTFB override. + monkeypatch.delenv("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", raising=False) + monkeypatch.delenv("HERMES_CODEX_TTFB_MAX_SECONDS", raising=False) + + closes: list = [] + dummy_client = SimpleNamespace() + monkeypatch.setattr(agent, "_create_request_openai_client", lambda **k: dummy_client) + monkeypatch.setattr( + agent, "_abort_request_openai_client", + lambda c, reason=None: closes.append(reason), + ) + monkeypatch.setattr( + agent, "_close_request_openai_client", + lambda c, reason=None: closes.append(reason), + ) + + sentinel = SimpleNamespace(ok=True) + + def fake_slow_first_event(api_kwargs, client=None, on_first_delta=None): + # Backend is alive but slow to admit: first event lands after ~2s, + # well under the 120s default cutoff. Mark the first byte so the + # no-byte detector sees activity, then return the response. + time.sleep(2.0) + agent._codex_stream_last_event_ts = time.time() + return sentinel + + monkeypatch.setattr(agent, "_run_codex_stream", fake_slow_first_event) + + resp = h.interruptible_api_call(agent, {"model": "gpt-5.5", "input": "hi"}) + assert resp is sentinel + assert "codex_ttfb_kill" not in closes + + def test_ttfb_includes_silent_hang_hint_for_gpt_5_5(tmp_path, monkeypatch): """The no-first-byte watchdog should surface the same actionable hint as the stale-call timeout path when the model matches the silent-hang heuristic."""