diff --git a/gateway/run.py b/gateway/run.py index 597bef8edd0..c8b7a34ea42 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -203,6 +203,78 @@ def _is_fresh_gateway_interruption( return current - timestamp <= window +# Assistant-message fields that must survive transcript replay so multi-turn +# reasoning context, prefix-cache hits, and provider-specific echo +# requirements all behave the same on the gateway as they do in the CLI. +# +# ``reasoning`` and ``reasoning_details`` were the original three preserved +# by PR #2974 (schema v6). ``reasoning_content``, ``codex_reasoning_items``, +# ``codex_message_items``, and ``finish_reason`` were added to the DB later +# but the gateway's replay whitelist was never expanded to match — so any +# pure-text assistant turn (no ``tool_calls``) silently dropped them on +# replay, regressing the CLI-vs-gateway behavioural parity. +# +# Why each field matters on replay: +# * ``reasoning`` / ``reasoning_content``: provider-facing thinking text. +# ``_copy_reasoning_content_for_api`` promotes ``reasoning`` → +# ``reasoning_content`` at send time, but only when the strings happen to +# match. Carrying the original ``reasoning_content`` verbatim avoids +# reconstruction loss for providers that return them as distinct fields +# (DeepSeek/Kimi/Moonshot thinking modes). +# * ``reasoning_details``: opaque structured array (signature, +# encrypted_content) used by OpenRouter/Anthropic to maintain reasoning +# continuity across turns. +# * ``codex_reasoning_items``: encrypted reasoning blobs for the OpenAI +# Codex Responses API. +# * ``codex_message_items``: exact assistant message items with ``phase``. +# OpenAI docs: "preserve and resend phase on all assistant messages — +# dropping it can degrade performance." Required for prefix cache hits. +# * ``finish_reason``: informational; cheap to keep so transcripts replay +# identically across CLI and gateway. +_ASSISTANT_REPLAY_FIELDS: tuple[str, ...] = ( + "reasoning", + "reasoning_content", + "reasoning_details", + "codex_reasoning_items", + "codex_message_items", + "finish_reason", +) + + +def _build_replay_entry(role: str, content: Any, msg: Dict[str, Any]) -> Dict[str, Any]: + """Build a replay entry for a non-tool-calling message, preserving the + assistant fields the agent's API builders rely on for multi-turn fidelity. + + Lifted out of the inline ``run_sync`` closure so the field whitelist can + be unit-tested in isolation. Mirrors the ``_ASSISTANT_REPLAY_FIELDS`` + contract above. + + Empty values: most fields are dropped when falsy (matching the original + PR #2974 behaviour) since an empty list/string for those carries no + information. The exception is ``reasoning_content``: DeepSeek/Kimi + thinking-mode replay treats an empty string as a meaningful sentinel + that ``_copy_reasoning_content_for_api`` upgrades to a single space. + Dropping it here would make the gateway send no ``reasoning_content`` at + all on the next turn, which can cause HTTP 400 from strict thinking + providers. + """ + entry: Dict[str, Any] = {"role": role, "content": content} + if role == "assistant": + for _rkey in _ASSISTANT_REPLAY_FIELDS: + if _rkey not in msg: + continue + _rval = msg.get(_rkey) + if _rkey == "reasoning_content": + # Preserve empty-string sentinel for thinking-mode replay. + if _rval is None: + continue + else: + if not _rval: + continue + entry[_rkey] = _rval + return entry + + def _last_transcript_timestamp(history: Optional[List[Dict[str, Any]]]) -> Any: """Return the ``timestamp`` of the last usable transcript row, if any. @@ -14412,17 +14484,12 @@ class GatewayRunner: if msg.get("mirror"): mirror_src = msg.get("mirror_source", "another session") content = f"[Delivered from {mirror_src}] {content}" - entry = {"role": role, "content": content} - # Preserve reasoning fields on assistant messages so - # multi-turn reasoning context survives session reload. - # The agent's _build_api_kwargs converts these to the - # provider-specific format (reasoning_content, etc.). - if role == "assistant": - for _rkey in ("reasoning", "reasoning_details", - "codex_reasoning_items"): - _rval = msg.get(_rkey) - if _rval: - entry[_rkey] = _rval + # Preserve assistant reasoning + Codex replay fields so + # multi-turn reasoning context, prefix-cache hits, and + # provider-specific echo requirements survive session + # reload. See ``_ASSISTANT_REPLAY_FIELDS`` for the full + # whitelist and rationale. + entry = _build_replay_entry(role, content, msg) agent_history.append(entry) # Collect MEDIA paths already in history so we can exclude them diff --git a/tests/gateway/test_replay_entry_fields.py b/tests/gateway/test_replay_entry_fields.py new file mode 100644 index 00000000000..4858cf62522 --- /dev/null +++ b/tests/gateway/test_replay_entry_fields.py @@ -0,0 +1,254 @@ +"""Tests for ``gateway.run._build_replay_entry``. + +The gateway rebuilds ``agent_history`` from the persisted transcript on every +turn (unlike the CLI, which keeps the live in-memory message list). When a +pure-text assistant turn (no ``tool_calls``) is replayed, the simple-text +branch in ``run_sync`` used to whitelist only three reasoning fields: +``reasoning``, ``reasoning_details``, ``codex_reasoning_items``. + +That whitelist predated three fields the DB now persists: +``reasoning_content``, ``codex_message_items``, and ``finish_reason``. The +unrecovered drop of ``codex_message_items`` in particular kills prefix-cache +hits for OpenAI Codex Responses API users — OpenAI's docs require the +``phase`` field be replayed on every assistant message. + +These tests pin the expanded whitelist so it doesn't regress. +""" +from __future__ import annotations + +import pytest + +from gateway.run import _ASSISTANT_REPLAY_FIELDS, _build_replay_entry + + +class TestBuildReplayEntry: + def test_user_message_has_only_role_and_content(self): + entry = _build_replay_entry( + "user", + "hello", + {"role": "user", "content": "hello", "reasoning": "leak", "extra": "drop"}, + ) + assert entry == {"role": "user", "content": "hello"} + + def test_tool_message_has_only_role_and_content(self): + # Tool messages aren't routed through this helper in production + # (they take the rich-passthrough branch), but the helper itself + # must not leak reasoning fields onto non-assistant roles even if + # someone calls it incorrectly. + entry = _build_replay_entry( + "tool", + "result", + {"role": "tool", "content": "result", "reasoning": "leak"}, + ) + assert entry == {"role": "tool", "content": "result"} + + def test_assistant_minimal_has_only_role_and_content(self): + entry = _build_replay_entry( + "assistant", + "ok", + {"role": "assistant", "content": "ok"}, + ) + assert entry == {"role": "assistant", "content": "ok"} + + def test_assistant_preserves_reasoning(self): + msg = { + "role": "assistant", + "content": "answer", + "reasoning": "I think therefore I am.", + } + entry = _build_replay_entry("assistant", "answer", msg) + assert entry["reasoning"] == "I think therefore I am." + + def test_assistant_preserves_reasoning_content(self): + """reasoning_content was silently dropped before this fix. + + Required for DeepSeek/Kimi/Moonshot thinking-mode echo so the + provider receives back what it sent. + """ + msg = { + "role": "assistant", + "content": "answer", + "reasoning_content": "structured CoT", + } + entry = _build_replay_entry("assistant", "answer", msg) + assert entry["reasoning_content"] == "structured CoT" + + def test_assistant_preserves_reasoning_details(self): + details = [ + { + "type": "reasoning.summary", + "format": "text", + "summary": "thought hard", + }, + { + "type": "reasoning.encrypted", + "data": "opaque_blob", + "signature": "sig123", + }, + ] + msg = { + "role": "assistant", + "content": "answer", + "reasoning_details": details, + } + entry = _build_replay_entry("assistant", "answer", msg) + assert entry["reasoning_details"] == details + + def test_assistant_preserves_codex_reasoning_items(self): + items = [{"type": "reasoning", "encrypted_content": "blob"}] + msg = { + "role": "assistant", + "content": "answer", + "codex_reasoning_items": items, + } + entry = _build_replay_entry("assistant", "answer", msg) + assert entry["codex_reasoning_items"] == items + + def test_assistant_preserves_codex_message_items(self): + """codex_message_items was silently dropped before this fix. + + OpenAI docs: 'preserve and resend phase on all assistant messages + — dropping it can degrade performance.' Required for prefix + cache hits on the Codex Responses API. + """ + items = [ + { + "type": "message", + "role": "assistant", + "id": "msg_123", + "phase": "final_answer", + "content": [{"type": "output_text", "text": "Done"}], + } + ] + msg = { + "role": "assistant", + "content": "Done", + "codex_message_items": items, + } + entry = _build_replay_entry("assistant", "Done", msg) + assert entry["codex_message_items"] == items + + def test_assistant_preserves_finish_reason(self): + """finish_reason was silently dropped before this fix. + + Cheap to keep; lets transcripts replay byte-identically across + CLI and gateway. + """ + msg = { + "role": "assistant", + "content": "answer", + "finish_reason": "stop", + } + entry = _build_replay_entry("assistant", "answer", msg) + assert entry["finish_reason"] == "stop" + + def test_assistant_drops_falsy_reasoning(self): + """Empty/None reasoning fields stay dropped (matching PR #2974 + behaviour) — empty strings/lists for these fields carry no info.""" + msg = { + "role": "assistant", + "content": "answer", + "reasoning": "", + "reasoning_details": [], + "codex_reasoning_items": [], + "codex_message_items": [], + "finish_reason": "", + } + entry = _build_replay_entry("assistant", "answer", msg) + assert entry == {"role": "assistant", "content": "answer"} + + def test_assistant_preserves_empty_reasoning_content(self): + """Empty reasoning_content is a meaningful sentinel. + + DeepSeek V4 Pro thinking mode rejects bare missing reasoning_content + with HTTP 400. ``_copy_reasoning_content_for_api`` upgrades the + empty string to a single space at API-send time, but only if the + empty string actually reached it. Dropping it here would 400 the + next turn for affected providers. + """ + msg = { + "role": "assistant", + "content": "answer", + "reasoning_content": "", + } + entry = _build_replay_entry("assistant", "answer", msg) + assert "reasoning_content" in entry + assert entry["reasoning_content"] == "" + + def test_assistant_drops_none_reasoning_content(self): + """None reasoning_content is just an absent field; drop it.""" + msg = { + "role": "assistant", + "content": "answer", + "reasoning_content": None, + } + entry = _build_replay_entry("assistant", "answer", msg) + assert "reasoning_content" not in entry + + def test_assistant_preserves_all_six_fields_together(self): + details = [{"type": "reasoning.summary", "summary": "s"}] + codex_items = [{"type": "reasoning", "encrypted_content": "b"}] + msg_items = [ + { + "type": "message", + "role": "assistant", + "phase": "final_answer", + "content": [{"type": "output_text", "text": "x"}], + } + ] + msg = { + "role": "assistant", + "content": "answer", + "reasoning": "thinking", + "reasoning_content": "structured", + "reasoning_details": details, + "codex_reasoning_items": codex_items, + "codex_message_items": msg_items, + "finish_reason": "stop", + } + entry = _build_replay_entry("assistant", "answer", msg) + assert entry["reasoning"] == "thinking" + assert entry["reasoning_content"] == "structured" + assert entry["reasoning_details"] == details + assert entry["codex_reasoning_items"] == codex_items + assert entry["codex_message_items"] == msg_items + assert entry["finish_reason"] == "stop" + + def test_assistant_does_not_invent_keys(self): + """The helper only copies over fields that are explicitly present.""" + msg = {"role": "assistant", "content": "answer", "reasoning": "r"} + entry = _build_replay_entry("assistant", "answer", msg) + # reasoning_details/etc. weren't in msg, so they shouldn't be in entry + for absent in ( + "reasoning_content", + "reasoning_details", + "codex_reasoning_items", + "codex_message_items", + "finish_reason", + ): + assert absent not in entry + + def test_replay_fields_constant_is_stable(self): + """Pin the whitelist explicitly so accidental renames are caught.""" + assert _ASSISTANT_REPLAY_FIELDS == ( + "reasoning", + "reasoning_content", + "reasoning_details", + "codex_reasoning_items", + "codex_message_items", + "finish_reason", + ) + + def test_unrelated_keys_are_ignored(self): + """Random keys on the message must not leak into the replay entry.""" + msg = { + "role": "assistant", + "content": "answer", + "timestamp": 12345.6, + "internal_marker": "should not flow", + "tool_call_id": "should not be set on simple-text branch", + } + entry = _build_replay_entry("assistant", "answer", msg) + assert "timestamp" not in entry + assert "internal_marker" not in entry + assert "tool_call_id" not in entry