fix(gateway): preserve reasoning_content, codex_message_items, finish_reason on transcript replay (#22839)

PR #2974 whitelisted three reasoning fields (reasoning, reasoning_details, codex_reasoning_items) for the gateway's simple-text replay branch. Three more fields were added to the DB later but the whitelist was never updated: - reasoning_content: provider-facing thinking text. _copy_reasoning_content_for_api promotes 'reasoning' -> 'reasoning_content' at send time only when the strings happen to match. Carrying the original verbatim avoids loss for providers that return them as distinct fields (DeepSeek/Kimi/ Moonshot thinking modes), and preserves the empty-string sentinel that DeepSeek V4 Pro requires for thinking-mode replay. - codex_message_items: exact assistant message items with 'phase'. OpenAI docs: 'preserve and resend phase on all assistant messages — dropping it can degrade performance.' Required for prefix cache hits. No recovery path exists — once dropped, gone. - finish_reason: informational; cheap to keep so transcripts replay identically across CLI and gateway. The CLI is unaffected because cli.py keeps the live in-memory message list across turns (cli.py:10046 'self.conversation_history = result["messages"]'). The gateway rebuilds agent_history from the SQLite transcript on every turn, so any field stripped during replay is silently lost. Refactors the inline whitelist into a module-level _build_replay_entry() helper so the contract can be unit-tested. 16 new tests pin the field set and falsy-value handling. Verified end-to-end: DB stores all 8 fields, replay now preserves all 8 (was preserving only 5 for assistant text turns).
2026-05-18 04:41:56 +00:00 · 2026-05-09 14:47:33 -07:00 · 2026-05-09 14:47:33 -07:00 · 70bfd429e5
commit 70bfd429e5
parent c7f0aab949
2 changed files with 332 additions and 11 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -203,6 +203,78 @@ def _is_fresh_gateway_interruption(
    return current - timestamp <= window


+# Assistant-message fields that must survive transcript replay so multi-turn
+# reasoning context, prefix-cache hits, and provider-specific echo
+# requirements all behave the same on the gateway as they do in the CLI.
+#
+# ``reasoning`` and ``reasoning_details`` were the original three preserved
+# by PR #2974 (schema v6).  ``reasoning_content``, ``codex_reasoning_items``,
+# ``codex_message_items``, and ``finish_reason`` were added to the DB later
+# but the gateway's replay whitelist was never expanded to match — so any
+# pure-text assistant turn (no ``tool_calls``) silently dropped them on
+# replay, regressing the CLI-vs-gateway behavioural parity.
+#
+# Why each field matters on replay:
+#   * ``reasoning`` / ``reasoning_content``: provider-facing thinking text.
+#     ``_copy_reasoning_content_for_api`` promotes ``reasoning`` →
+#     ``reasoning_content`` at send time, but only when the strings happen to
+#     match.  Carrying the original ``reasoning_content`` verbatim avoids
+#     reconstruction loss for providers that return them as distinct fields
+#     (DeepSeek/Kimi/Moonshot thinking modes).
+#   * ``reasoning_details``: opaque structured array (signature,
+#     encrypted_content) used by OpenRouter/Anthropic to maintain reasoning
+#     continuity across turns.
+#   * ``codex_reasoning_items``: encrypted reasoning blobs for the OpenAI
+#     Codex Responses API.
+#   * ``codex_message_items``: exact assistant message items with ``phase``.
+#     OpenAI docs: "preserve and resend phase on all assistant messages —
+#     dropping it can degrade performance."  Required for prefix cache hits.
+#   * ``finish_reason``: informational; cheap to keep so transcripts replay
+#     identically across CLI and gateway.
+_ASSISTANT_REPLAY_FIELDS: tuple[str, ...] = (
+    "reasoning",
+    "reasoning_content",
+    "reasoning_details",
+    "codex_reasoning_items",
+    "codex_message_items",
+    "finish_reason",
+)
+
+
+def _build_replay_entry(role: str, content: Any, msg: Dict[str, Any]) -> Dict[str, Any]:
+    """Build a replay entry for a non-tool-calling message, preserving the
+    assistant fields the agent's API builders rely on for multi-turn fidelity.
+
+    Lifted out of the inline ``run_sync`` closure so the field whitelist can
+    be unit-tested in isolation.  Mirrors the ``_ASSISTANT_REPLAY_FIELDS``
+    contract above.
+
+    Empty values: most fields are dropped when falsy (matching the original
+    PR #2974 behaviour) since an empty list/string for those carries no
+    information.  The exception is ``reasoning_content``: DeepSeek/Kimi
+    thinking-mode replay treats an empty string as a meaningful sentinel
+    that ``_copy_reasoning_content_for_api`` upgrades to a single space.
+    Dropping it here would make the gateway send no ``reasoning_content`` at
+    all on the next turn, which can cause HTTP 400 from strict thinking
+    providers.
+    """
+    entry: Dict[str, Any] = {"role": role, "content": content}
+    if role == "assistant":
+        for _rkey in _ASSISTANT_REPLAY_FIELDS:
+            if _rkey not in msg:
+                continue
+            _rval = msg.get(_rkey)
+            if _rkey == "reasoning_content":
+                # Preserve empty-string sentinel for thinking-mode replay.
+                if _rval is None:
+                    continue
+            else:
+                if not _rval:
+                    continue
+            entry[_rkey] = _rval
+    return entry
+
+
 def _last_transcript_timestamp(history: Optional[List[Dict[str, Any]]]) -> Any:
    """Return the ``timestamp`` of the last usable transcript row, if any.

@ -14412,17 +14484,12 @@ class GatewayRunner:
                        if msg.get("mirror"):
                            mirror_src = msg.get("mirror_source", "another session")
                            content = f"[Delivered from {mirror_src}] {content}"
-                        entry = {"role": role, "content": content}
-                        # Preserve reasoning fields on assistant messages so
-                        # multi-turn reasoning context survives session reload.
-                        # The agent's _build_api_kwargs converts these to the
-                        # provider-specific format (reasoning_content, etc.).
-                        if role == "assistant":
-                            for _rkey in ("reasoning", "reasoning_details",
-                                          "codex_reasoning_items"):
-                                _rval = msg.get(_rkey)
-                                if _rval:
-                                    entry[_rkey] = _rval
+                        # Preserve assistant reasoning + Codex replay fields so
+                        # multi-turn reasoning context, prefix-cache hits, and
+                        # provider-specific echo requirements survive session
+                        # reload.  See ``_ASSISTANT_REPLAY_FIELDS`` for the full
+                        # whitelist and rationale.
+                        entry = _build_replay_entry(role, content, msg)
                        agent_history.append(entry)
            
            # Collect MEDIA paths already in history so we can exclude them
--- a/tests/gateway/test_replay_entry_fields.py
+++ b/tests/gateway/test_replay_entry_fields.py
@ -0,0 +1,254 @@
+"""Tests for ``gateway.run._build_replay_entry``.
+
+The gateway rebuilds ``agent_history`` from the persisted transcript on every
+turn (unlike the CLI, which keeps the live in-memory message list).  When a
+pure-text assistant turn (no ``tool_calls``) is replayed, the simple-text
+branch in ``run_sync`` used to whitelist only three reasoning fields:
+``reasoning``, ``reasoning_details``, ``codex_reasoning_items``.
+
+That whitelist predated three fields the DB now persists:
+``reasoning_content``, ``codex_message_items``, and ``finish_reason``.  The
+unrecovered drop of ``codex_message_items`` in particular kills prefix-cache
+hits for OpenAI Codex Responses API users — OpenAI's docs require the
+``phase`` field be replayed on every assistant message.
+
+These tests pin the expanded whitelist so it doesn't regress.
+"""
+from __future__ import annotations
+
+import pytest
+
+from gateway.run import _ASSISTANT_REPLAY_FIELDS, _build_replay_entry
+
+
+class TestBuildReplayEntry:
+    def test_user_message_has_only_role_and_content(self):
+        entry = _build_replay_entry(
+            "user",
+            "hello",
+            {"role": "user", "content": "hello", "reasoning": "leak", "extra": "drop"},
+        )
+        assert entry == {"role": "user", "content": "hello"}
+
+    def test_tool_message_has_only_role_and_content(self):
+        # Tool messages aren't routed through this helper in production
+        # (they take the rich-passthrough branch), but the helper itself
+        # must not leak reasoning fields onto non-assistant roles even if
+        # someone calls it incorrectly.
+        entry = _build_replay_entry(
+            "tool",
+            "result",
+            {"role": "tool", "content": "result", "reasoning": "leak"},
+        )
+        assert entry == {"role": "tool", "content": "result"}
+
+    def test_assistant_minimal_has_only_role_and_content(self):
+        entry = _build_replay_entry(
+            "assistant",
+            "ok",
+            {"role": "assistant", "content": "ok"},
+        )
+        assert entry == {"role": "assistant", "content": "ok"}
+
+    def test_assistant_preserves_reasoning(self):
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning": "I think therefore I am.",
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert entry["reasoning"] == "I think therefore I am."
+
+    def test_assistant_preserves_reasoning_content(self):
+        """reasoning_content was silently dropped before this fix.
+
+        Required for DeepSeek/Kimi/Moonshot thinking-mode echo so the
+        provider receives back what it sent.
+        """
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_content": "structured CoT",
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert entry["reasoning_content"] == "structured CoT"
+
+    def test_assistant_preserves_reasoning_details(self):
+        details = [
+            {
+                "type": "reasoning.summary",
+                "format": "text",
+                "summary": "thought hard",
+            },
+            {
+                "type": "reasoning.encrypted",
+                "data": "opaque_blob",
+                "signature": "sig123",
+            },
+        ]
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_details": details,
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert entry["reasoning_details"] == details
+
+    def test_assistant_preserves_codex_reasoning_items(self):
+        items = [{"type": "reasoning", "encrypted_content": "blob"}]
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "codex_reasoning_items": items,
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert entry["codex_reasoning_items"] == items
+
+    def test_assistant_preserves_codex_message_items(self):
+        """codex_message_items was silently dropped before this fix.
+
+        OpenAI docs: 'preserve and resend phase on all assistant messages
+        — dropping it can degrade performance.'  Required for prefix
+        cache hits on the Codex Responses API.
+        """
+        items = [
+            {
+                "type": "message",
+                "role": "assistant",
+                "id": "msg_123",
+                "phase": "final_answer",
+                "content": [{"type": "output_text", "text": "Done"}],
+            }
+        ]
+        msg = {
+            "role": "assistant",
+            "content": "Done",
+            "codex_message_items": items,
+        }
+        entry = _build_replay_entry("assistant", "Done", msg)
+        assert entry["codex_message_items"] == items
+
+    def test_assistant_preserves_finish_reason(self):
+        """finish_reason was silently dropped before this fix.
+
+        Cheap to keep; lets transcripts replay byte-identically across
+        CLI and gateway.
+        """
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "finish_reason": "stop",
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert entry["finish_reason"] == "stop"
+
+    def test_assistant_drops_falsy_reasoning(self):
+        """Empty/None reasoning fields stay dropped (matching PR #2974
+        behaviour) — empty strings/lists for these fields carry no info."""
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning": "",
+            "reasoning_details": [],
+            "codex_reasoning_items": [],
+            "codex_message_items": [],
+            "finish_reason": "",
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert entry == {"role": "assistant", "content": "answer"}
+
+    def test_assistant_preserves_empty_reasoning_content(self):
+        """Empty reasoning_content is a meaningful sentinel.
+
+        DeepSeek V4 Pro thinking mode rejects bare missing reasoning_content
+        with HTTP 400.  ``_copy_reasoning_content_for_api`` upgrades the
+        empty string to a single space at API-send time, but only if the
+        empty string actually reached it.  Dropping it here would 400 the
+        next turn for affected providers.
+        """
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_content": "",
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert "reasoning_content" in entry
+        assert entry["reasoning_content"] == ""
+
+    def test_assistant_drops_none_reasoning_content(self):
+        """None reasoning_content is just an absent field; drop it."""
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning_content": None,
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert "reasoning_content" not in entry
+
+    def test_assistant_preserves_all_six_fields_together(self):
+        details = [{"type": "reasoning.summary", "summary": "s"}]
+        codex_items = [{"type": "reasoning", "encrypted_content": "b"}]
+        msg_items = [
+            {
+                "type": "message",
+                "role": "assistant",
+                "phase": "final_answer",
+                "content": [{"type": "output_text", "text": "x"}],
+            }
+        ]
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "reasoning": "thinking",
+            "reasoning_content": "structured",
+            "reasoning_details": details,
+            "codex_reasoning_items": codex_items,
+            "codex_message_items": msg_items,
+            "finish_reason": "stop",
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert entry["reasoning"] == "thinking"
+        assert entry["reasoning_content"] == "structured"
+        assert entry["reasoning_details"] == details
+        assert entry["codex_reasoning_items"] == codex_items
+        assert entry["codex_message_items"] == msg_items
+        assert entry["finish_reason"] == "stop"
+
+    def test_assistant_does_not_invent_keys(self):
+        """The helper only copies over fields that are explicitly present."""
+        msg = {"role": "assistant", "content": "answer", "reasoning": "r"}
+        entry = _build_replay_entry("assistant", "answer", msg)
+        # reasoning_details/etc. weren't in msg, so they shouldn't be in entry
+        for absent in (
+            "reasoning_content",
+            "reasoning_details",
+            "codex_reasoning_items",
+            "codex_message_items",
+            "finish_reason",
+        ):
+            assert absent not in entry
+
+    def test_replay_fields_constant_is_stable(self):
+        """Pin the whitelist explicitly so accidental renames are caught."""
+        assert _ASSISTANT_REPLAY_FIELDS == (
+            "reasoning",
+            "reasoning_content",
+            "reasoning_details",
+            "codex_reasoning_items",
+            "codex_message_items",
+            "finish_reason",
+        )
+
+    def test_unrelated_keys_are_ignored(self):
+        """Random keys on the message must not leak into the replay entry."""
+        msg = {
+            "role": "assistant",
+            "content": "answer",
+            "timestamp": 12345.6,
+            "internal_marker": "should not flow",
+            "tool_call_id": "should not be set on simple-text branch",
+        }
+        entry = _build_replay_entry("assistant", "answer", msg)
+        assert "timestamp" not in entry
+        assert "internal_marker" not in entry
+        assert "tool_call_id" not in entry