fix(gateway): preserve reasoning_content, codex_message_items, finish_reason on transcript replay (#22839)

PR #2974 whitelisted three reasoning fields (reasoning, reasoning_details,
codex_reasoning_items) for the gateway's simple-text replay branch. Three
more fields were added to the DB later but the whitelist was never updated:

  - reasoning_content: provider-facing thinking text. _copy_reasoning_content_for_api
    promotes 'reasoning' -> 'reasoning_content' at send time only when the
    strings happen to match. Carrying the original verbatim avoids loss
    for providers that return them as distinct fields (DeepSeek/Kimi/
    Moonshot thinking modes), and preserves the empty-string sentinel
    that DeepSeek V4 Pro requires for thinking-mode replay.
  - codex_message_items: exact assistant message items with 'phase'.
    OpenAI docs: 'preserve and resend phase on all assistant messages —
    dropping it can degrade performance.' Required for prefix cache hits.
    No recovery path exists — once dropped, gone.
  - finish_reason: informational; cheap to keep so transcripts replay
    identically across CLI and gateway.

The CLI is unaffected because cli.py keeps the live in-memory message list
across turns (cli.py:10046 'self.conversation_history = result["messages"]').
The gateway rebuilds agent_history from the SQLite transcript on every turn,
so any field stripped during replay is silently lost.

Refactors the inline whitelist into a module-level _build_replay_entry()
helper so the contract can be unit-tested. 16 new tests pin the field set
and falsy-value handling.

Verified end-to-end: DB stores all 8 fields, replay now preserves all 8
(was preserving only 5 for assistant text turns).
This commit is contained in:
Teknium 2026-05-09 14:47:33 -07:00 committed by GitHub
parent c7f0aab949
commit 70bfd429e5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 332 additions and 11 deletions

View file

@ -203,6 +203,78 @@ def _is_fresh_gateway_interruption(
return current - timestamp <= window
# Assistant-message fields that must survive transcript replay so multi-turn
# reasoning context, prefix-cache hits, and provider-specific echo
# requirements all behave the same on the gateway as they do in the CLI.
#
# ``reasoning`` and ``reasoning_details`` were the original three preserved
# by PR #2974 (schema v6). ``reasoning_content``, ``codex_reasoning_items``,
# ``codex_message_items``, and ``finish_reason`` were added to the DB later
# but the gateway's replay whitelist was never expanded to match — so any
# pure-text assistant turn (no ``tool_calls``) silently dropped them on
# replay, regressing the CLI-vs-gateway behavioural parity.
#
# Why each field matters on replay:
# * ``reasoning`` / ``reasoning_content``: provider-facing thinking text.
# ``_copy_reasoning_content_for_api`` promotes ``reasoning`` →
# ``reasoning_content`` at send time, but only when the strings happen to
# match. Carrying the original ``reasoning_content`` verbatim avoids
# reconstruction loss for providers that return them as distinct fields
# (DeepSeek/Kimi/Moonshot thinking modes).
# * ``reasoning_details``: opaque structured array (signature,
# encrypted_content) used by OpenRouter/Anthropic to maintain reasoning
# continuity across turns.
# * ``codex_reasoning_items``: encrypted reasoning blobs for the OpenAI
# Codex Responses API.
# * ``codex_message_items``: exact assistant message items with ``phase``.
# OpenAI docs: "preserve and resend phase on all assistant messages —
# dropping it can degrade performance." Required for prefix cache hits.
# * ``finish_reason``: informational; cheap to keep so transcripts replay
# identically across CLI and gateway.
_ASSISTANT_REPLAY_FIELDS: tuple[str, ...] = (
"reasoning",
"reasoning_content",
"reasoning_details",
"codex_reasoning_items",
"codex_message_items",
"finish_reason",
)
def _build_replay_entry(role: str, content: Any, msg: Dict[str, Any]) -> Dict[str, Any]:
"""Build a replay entry for a non-tool-calling message, preserving the
assistant fields the agent's API builders rely on for multi-turn fidelity.
Lifted out of the inline ``run_sync`` closure so the field whitelist can
be unit-tested in isolation. Mirrors the ``_ASSISTANT_REPLAY_FIELDS``
contract above.
Empty values: most fields are dropped when falsy (matching the original
PR #2974 behaviour) since an empty list/string for those carries no
information. The exception is ``reasoning_content``: DeepSeek/Kimi
thinking-mode replay treats an empty string as a meaningful sentinel
that ``_copy_reasoning_content_for_api`` upgrades to a single space.
Dropping it here would make the gateway send no ``reasoning_content`` at
all on the next turn, which can cause HTTP 400 from strict thinking
providers.
"""
entry: Dict[str, Any] = {"role": role, "content": content}
if role == "assistant":
for _rkey in _ASSISTANT_REPLAY_FIELDS:
if _rkey not in msg:
continue
_rval = msg.get(_rkey)
if _rkey == "reasoning_content":
# Preserve empty-string sentinel for thinking-mode replay.
if _rval is None:
continue
else:
if not _rval:
continue
entry[_rkey] = _rval
return entry
def _last_transcript_timestamp(history: Optional[List[Dict[str, Any]]]) -> Any:
"""Return the ``timestamp`` of the last usable transcript row, if any.
@ -14412,17 +14484,12 @@ class GatewayRunner:
if msg.get("mirror"):
mirror_src = msg.get("mirror_source", "another session")
content = f"[Delivered from {mirror_src}] {content}"
entry = {"role": role, "content": content}
# Preserve reasoning fields on assistant messages so
# multi-turn reasoning context survives session reload.
# The agent's _build_api_kwargs converts these to the
# provider-specific format (reasoning_content, etc.).
if role == "assistant":
for _rkey in ("reasoning", "reasoning_details",
"codex_reasoning_items"):
_rval = msg.get(_rkey)
if _rval:
entry[_rkey] = _rval
# Preserve assistant reasoning + Codex replay fields so
# multi-turn reasoning context, prefix-cache hits, and
# provider-specific echo requirements survive session
# reload. See ``_ASSISTANT_REPLAY_FIELDS`` for the full
# whitelist and rationale.
entry = _build_replay_entry(role, content, msg)
agent_history.append(entry)
# Collect MEDIA paths already in history so we can exclude them

View file

@ -0,0 +1,254 @@
"""Tests for ``gateway.run._build_replay_entry``.
The gateway rebuilds ``agent_history`` from the persisted transcript on every
turn (unlike the CLI, which keeps the live in-memory message list). When a
pure-text assistant turn (no ``tool_calls``) is replayed, the simple-text
branch in ``run_sync`` used to whitelist only three reasoning fields:
``reasoning``, ``reasoning_details``, ``codex_reasoning_items``.
That whitelist predated three fields the DB now persists:
``reasoning_content``, ``codex_message_items``, and ``finish_reason``. The
unrecovered drop of ``codex_message_items`` in particular kills prefix-cache
hits for OpenAI Codex Responses API users OpenAI's docs require the
``phase`` field be replayed on every assistant message.
These tests pin the expanded whitelist so it doesn't regress.
"""
from __future__ import annotations
import pytest
from gateway.run import _ASSISTANT_REPLAY_FIELDS, _build_replay_entry
class TestBuildReplayEntry:
def test_user_message_has_only_role_and_content(self):
entry = _build_replay_entry(
"user",
"hello",
{"role": "user", "content": "hello", "reasoning": "leak", "extra": "drop"},
)
assert entry == {"role": "user", "content": "hello"}
def test_tool_message_has_only_role_and_content(self):
# Tool messages aren't routed through this helper in production
# (they take the rich-passthrough branch), but the helper itself
# must not leak reasoning fields onto non-assistant roles even if
# someone calls it incorrectly.
entry = _build_replay_entry(
"tool",
"result",
{"role": "tool", "content": "result", "reasoning": "leak"},
)
assert entry == {"role": "tool", "content": "result"}
def test_assistant_minimal_has_only_role_and_content(self):
entry = _build_replay_entry(
"assistant",
"ok",
{"role": "assistant", "content": "ok"},
)
assert entry == {"role": "assistant", "content": "ok"}
def test_assistant_preserves_reasoning(self):
msg = {
"role": "assistant",
"content": "answer",
"reasoning": "I think therefore I am.",
}
entry = _build_replay_entry("assistant", "answer", msg)
assert entry["reasoning"] == "I think therefore I am."
def test_assistant_preserves_reasoning_content(self):
"""reasoning_content was silently dropped before this fix.
Required for DeepSeek/Kimi/Moonshot thinking-mode echo so the
provider receives back what it sent.
"""
msg = {
"role": "assistant",
"content": "answer",
"reasoning_content": "structured CoT",
}
entry = _build_replay_entry("assistant", "answer", msg)
assert entry["reasoning_content"] == "structured CoT"
def test_assistant_preserves_reasoning_details(self):
details = [
{
"type": "reasoning.summary",
"format": "text",
"summary": "thought hard",
},
{
"type": "reasoning.encrypted",
"data": "opaque_blob",
"signature": "sig123",
},
]
msg = {
"role": "assistant",
"content": "answer",
"reasoning_details": details,
}
entry = _build_replay_entry("assistant", "answer", msg)
assert entry["reasoning_details"] == details
def test_assistant_preserves_codex_reasoning_items(self):
items = [{"type": "reasoning", "encrypted_content": "blob"}]
msg = {
"role": "assistant",
"content": "answer",
"codex_reasoning_items": items,
}
entry = _build_replay_entry("assistant", "answer", msg)
assert entry["codex_reasoning_items"] == items
def test_assistant_preserves_codex_message_items(self):
"""codex_message_items was silently dropped before this fix.
OpenAI docs: 'preserve and resend phase on all assistant messages
dropping it can degrade performance.' Required for prefix
cache hits on the Codex Responses API.
"""
items = [
{
"type": "message",
"role": "assistant",
"id": "msg_123",
"phase": "final_answer",
"content": [{"type": "output_text", "text": "Done"}],
}
]
msg = {
"role": "assistant",
"content": "Done",
"codex_message_items": items,
}
entry = _build_replay_entry("assistant", "Done", msg)
assert entry["codex_message_items"] == items
def test_assistant_preserves_finish_reason(self):
"""finish_reason was silently dropped before this fix.
Cheap to keep; lets transcripts replay byte-identically across
CLI and gateway.
"""
msg = {
"role": "assistant",
"content": "answer",
"finish_reason": "stop",
}
entry = _build_replay_entry("assistant", "answer", msg)
assert entry["finish_reason"] == "stop"
def test_assistant_drops_falsy_reasoning(self):
"""Empty/None reasoning fields stay dropped (matching PR #2974
behaviour) empty strings/lists for these fields carry no info."""
msg = {
"role": "assistant",
"content": "answer",
"reasoning": "",
"reasoning_details": [],
"codex_reasoning_items": [],
"codex_message_items": [],
"finish_reason": "",
}
entry = _build_replay_entry("assistant", "answer", msg)
assert entry == {"role": "assistant", "content": "answer"}
def test_assistant_preserves_empty_reasoning_content(self):
"""Empty reasoning_content is a meaningful sentinel.
DeepSeek V4 Pro thinking mode rejects bare missing reasoning_content
with HTTP 400. ``_copy_reasoning_content_for_api`` upgrades the
empty string to a single space at API-send time, but only if the
empty string actually reached it. Dropping it here would 400 the
next turn for affected providers.
"""
msg = {
"role": "assistant",
"content": "answer",
"reasoning_content": "",
}
entry = _build_replay_entry("assistant", "answer", msg)
assert "reasoning_content" in entry
assert entry["reasoning_content"] == ""
def test_assistant_drops_none_reasoning_content(self):
"""None reasoning_content is just an absent field; drop it."""
msg = {
"role": "assistant",
"content": "answer",
"reasoning_content": None,
}
entry = _build_replay_entry("assistant", "answer", msg)
assert "reasoning_content" not in entry
def test_assistant_preserves_all_six_fields_together(self):
details = [{"type": "reasoning.summary", "summary": "s"}]
codex_items = [{"type": "reasoning", "encrypted_content": "b"}]
msg_items = [
{
"type": "message",
"role": "assistant",
"phase": "final_answer",
"content": [{"type": "output_text", "text": "x"}],
}
]
msg = {
"role": "assistant",
"content": "answer",
"reasoning": "thinking",
"reasoning_content": "structured",
"reasoning_details": details,
"codex_reasoning_items": codex_items,
"codex_message_items": msg_items,
"finish_reason": "stop",
}
entry = _build_replay_entry("assistant", "answer", msg)
assert entry["reasoning"] == "thinking"
assert entry["reasoning_content"] == "structured"
assert entry["reasoning_details"] == details
assert entry["codex_reasoning_items"] == codex_items
assert entry["codex_message_items"] == msg_items
assert entry["finish_reason"] == "stop"
def test_assistant_does_not_invent_keys(self):
"""The helper only copies over fields that are explicitly present."""
msg = {"role": "assistant", "content": "answer", "reasoning": "r"}
entry = _build_replay_entry("assistant", "answer", msg)
# reasoning_details/etc. weren't in msg, so they shouldn't be in entry
for absent in (
"reasoning_content",
"reasoning_details",
"codex_reasoning_items",
"codex_message_items",
"finish_reason",
):
assert absent not in entry
def test_replay_fields_constant_is_stable(self):
"""Pin the whitelist explicitly so accidental renames are caught."""
assert _ASSISTANT_REPLAY_FIELDS == (
"reasoning",
"reasoning_content",
"reasoning_details",
"codex_reasoning_items",
"codex_message_items",
"finish_reason",
)
def test_unrelated_keys_are_ignored(self):
"""Random keys on the message must not leak into the replay entry."""
msg = {
"role": "assistant",
"content": "answer",
"timestamp": 12345.6,
"internal_marker": "should not flow",
"tool_call_id": "should not be set on simple-text branch",
}
entry = _build_replay_entry("assistant", "answer", msg)
assert "timestamp" not in entry
assert "internal_marker" not in entry
assert "tool_call_id" not in entry