From b9b9ee3e6c04df01a8c13b634318fe7d60e2602a Mon Sep 17 00:00:00 2001
From: lsdsjy <luwinyang@deepseek.com>
Date: Tue, 28 Apr 2026 14:17:11 +0800
Subject: [PATCH] fix(deepseek): preserve v4 reasoning_content on replay

---
 agent/transports/chat_completions.py          |   6 +-
 run_agent.py                                  |  44 ++--
 .../agent/transports/test_chat_completions.py |  35 +++
 .../test_deepseek_reasoning_content_echo.py   | 101 +++++++-
 .../test_deepseek_v4_thinking_live.py         | 245 ++++++++++++++++++
 5 files changed, 398 insertions(+), 33 deletions(-)
 create mode 100644 tests/run_agent/test_deepseek_v4_thinking_live.py

diff --git a/agent/transports/chat_completions.py b/agent/transports/chat_completions.py
index 0da01fedf9..9a115e4547 100644
--- a/agent/transports/chat_completions.py
+++ b/agent/transports/chat_completions.py
@@ -477,9 +477,13 @@ class ChatCompletionsTransport(ProviderTransport):
         # so keep them apart in provider_data rather than merging.
         reasoning = getattr(msg, "reasoning", None)
         reasoning_content = getattr(msg, "reasoning_content", None)
+        if reasoning_content is None and hasattr(msg, "model_extra"):
+            model_extra = getattr(msg, "model_extra", None) or {}
+            if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
+                reasoning_content = model_extra["reasoning_content"]
 
         provider_data: Dict[str, Any] = {}
-        if reasoning_content:
+        if reasoning_content is not None:
             provider_data["reasoning_content"] = reasoning_content
         rd = getattr(msg, "reasoning_details", None)
         if rd:
diff --git a/run_agent.py b/run_agent.py
index c9801b6795..44abe31dc4 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -8501,6 +8501,7 @@ class AIAgent:
         Handles reasoning extraction, reasoning_details, and optional tool_calls
         so both the tool-call path and the final-response path share one builder.
         """
+        assistant_tool_calls = getattr(assistant_message, "tool_calls", None)
         reasoning_text = self._extract_reasoning(assistant_message)
         _from_structured = bool(reasoning_text)
 
@@ -8560,16 +8561,19 @@ class AIAgent:
             "finish_reason": finish_reason,
         }
 
-        if hasattr(assistant_message, "reasoning_content"):
-            raw_reasoning_content = getattr(assistant_message, "reasoning_content", None)
-            if raw_reasoning_content is not None:
-                msg["reasoning_content"] = _sanitize_surrogates(raw_reasoning_content)
-            elif msg.get("tool_calls") and self._needs_deepseek_tool_reasoning():
-                # DeepSeek thinking mode requires reasoning_content on every
-                # assistant tool-call message. Without it, replaying the
-                # persisted message causes HTTP 400. Include empty string
-                # as a defensive compatibility fallback (refs #15250).
-                msg["reasoning_content"] = ""
+        raw_reasoning_content = getattr(assistant_message, "reasoning_content", None)
+        if raw_reasoning_content is None and hasattr(assistant_message, "model_extra"):
+            model_extra = getattr(assistant_message, "model_extra", None) or {}
+            if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
+                raw_reasoning_content = model_extra["reasoning_content"]
+        if raw_reasoning_content is not None:
+            msg["reasoning_content"] = _sanitize_surrogates(raw_reasoning_content)
+        elif assistant_tool_calls and self._needs_deepseek_tool_reasoning():
+            # DeepSeek thinking mode requires reasoning_content on every
+            # assistant tool-call message. Without it, replaying the
+            # persisted message causes HTTP 400. Include empty string
+            # only when no structured reasoning text was captured.
+            msg["reasoning_content"] = reasoning_text or ""
 
         # Additive fallback (refs #16844, #16884). Streaming-only providers
         # (glm, MiniMax, gpt-5.x via aigw, Anthropic via openai-compat shims)
@@ -8626,9 +8630,9 @@ class AIAgent:
         if codex_message_items:
             msg["codex_message_items"] = codex_message_items
 
-        if assistant_message.tool_calls:
+        if assistant_tool_calls:
             tool_calls = []
-            for tool_call in assistant_message.tool_calls:
+            for tool_call in assistant_tool_calls:
                 raw_id = getattr(tool_call, "id", None)
                 call_id = getattr(tool_call, "call_id", None)
                 if not isinstance(call_id, str) or not call_id.strip():
@@ -8728,11 +8732,11 @@ class AIAgent:
         # if the source turn has tool_calls AND a 'reasoning' field but no
         # 'reasoning_content' key, the 'reasoning' text was written by a
         # prior provider (e.g. MiniMax) — DeepSeek's own _build_assistant_message
-        # always pins reasoning_content="" at creation time for tool-call turns,
-        # so the shape (reasoning set, reasoning_content absent, tool_calls
-        # present) is unreachable from same-provider DeepSeek history. Inject
-        # "" to satisfy the API without leaking another provider's chain of
-        # thought to DeepSeek/Kimi.
+        # pins reasoning_content at creation time for tool-call turns, so the
+        # shape (reasoning set, reasoning_content absent, tool_calls present)
+        # is unreachable from same-provider DeepSeek history after this fix.
+        # Inject "" to satisfy the API without leaking another provider's
+        # chain of thought to DeepSeek/Kimi.
         normalized_reasoning = source_msg.get("reasoning")
         if (
             needs_thinking_pad
@@ -8745,9 +8749,9 @@ class AIAgent:
 
         # 3. Healthy session: promote 'reasoning' field to 'reasoning_content'
         # for providers that use the internal 'reasoning' key.
-        # This must happen BEFORE the DeepSeek/Kimi tool-call check so that
-        # genuine reasoning content is not overwritten by the empty-string
-        # fallback (#15812 regression in PR #15478).
+        # This must happen before the unconditional empty-string fallback so
+        # genuine reasoning content is not overwritten (#15812 regression in
+        # PR #15478).
         if isinstance(normalized_reasoning, str) and normalized_reasoning:
             api_msg["reasoning_content"] = normalized_reasoning
             return
diff --git a/tests/agent/transports/test_chat_completions.py b/tests/agent/transports/test_chat_completions.py
index 66aa7e9058..b8fdced8aa 100644
--- a/tests/agent/transports/test_chat_completions.py
+++ b/tests/agent/transports/test_chat_completions.py
@@ -620,6 +620,41 @@ class TestChatCompletionsNormalize:
         assert nr.reasoning == "summary text"
         assert nr.provider_data == {"reasoning_content": "detailed scratchpad"}
 
+    def test_empty_reasoning_content_preserved(self, transport):
+        """DeepSeek can require an explicit empty reasoning_content replay field."""
+        r = SimpleNamespace(
+            choices=[SimpleNamespace(
+                message=SimpleNamespace(
+                    content=None,
+                    tool_calls=None,
+                    reasoning=None,
+                    reasoning_content="",
+                ),
+                finish_reason="stop",
+            )],
+            usage=None,
+        )
+        nr = transport.normalize_response(r)
+        assert nr.provider_data == {"reasoning_content": ""}
+        assert nr.reasoning_content == ""
+
+    def test_reasoning_content_preserved_from_model_extra(self, transport):
+        """OpenAI SDK can expose provider-specific DeepSeek fields via model_extra."""
+        r = SimpleNamespace(
+            choices=[SimpleNamespace(
+                message=SimpleNamespace(
+                    content=None,
+                    tool_calls=None,
+                    reasoning=None,
+                    model_extra={"reasoning_content": "model-extra scratchpad"},
+                ),
+                finish_reason="stop",
+            )],
+            usage=None,
+        )
+        nr = transport.normalize_response(r)
+        assert nr.provider_data == {"reasoning_content": "model-extra scratchpad"}
+
 
 class TestChatCompletionsCacheStats:
 
diff --git a/tests/run_agent/test_deepseek_reasoning_content_echo.py b/tests/run_agent/test_deepseek_reasoning_content_echo.py
index a3f1cf8bb1..edae5efce4 100644
--- a/tests/run_agent/test_deepseek_reasoning_content_echo.py
+++ b/tests/run_agent/test_deepseek_reasoning_content_echo.py
@@ -23,6 +23,8 @@ Refs #15250 / #15353.
 
 from __future__ import annotations
 
+from types import SimpleNamespace
+
 import pytest
 
 from run_agent import AIAgent
@@ -33,6 +35,10 @@ def _make_agent(provider: str = "", model: str = "", base_url: str = "") -> AIAg
     agent.provider = provider
     agent.model = model
     agent.base_url = base_url
+    agent.verbose_logging = False
+    agent.reasoning_callback = None
+    agent.stream_delta_callback = None
+    agent._stream_callback = None
     return agent
 
 
@@ -109,16 +115,7 @@ class TestCopyReasoningContentForApi:
         assert api_msg["reasoning_content"] == "<think>real chain of thought</think>"
 
     def test_deepseek_reasoning_field_promoted(self) -> None:
-        """When only 'reasoning' is set (no tool_calls), it gets promoted to reasoning_content.
-
-        On DeepSeek/Kimi, tool-call turns with 'reasoning' but no
-        'reasoning_content' are treated as cross-provider poisoned history
-        (#15748) and padded with "" instead of promoted. Same-provider
-        DeepSeek tool-call turns always have reasoning_content pinned at
-        creation time by _build_assistant_message, so the (reasoning-set,
-        reasoning_content-absent, tool_calls-present) shape is unreachable
-        from same-provider history.
-        """
+        """When only 'reasoning' is set, it gets promoted to reasoning_content."""
         agent = _make_agent(provider="deepseek", model="deepseek-v4-flash")
         source = {
             "role": "assistant",
@@ -135,8 +132,8 @@ class TestCopyReasoningContentForApi:
 
         If the source turn has tool_calls AND a 'reasoning' field but NO
         'reasoning_content' key, it's from a prior provider (the DeepSeek
-        build path always pins reasoning_content="" at creation). Inject
-        "" instead of forwarding the prior provider's chain of thought.
+        build path pins reasoning_content at creation). Inject "" instead
+        of forwarding the prior provider's chain of thought.
         """
         agent = _make_agent(provider="deepseek", model="deepseek-v4-flash")
         source = {
@@ -228,6 +225,86 @@ class TestCopyReasoningContentForApi:
         assert "reasoning_content" not in api_msg
 
 
+class TestBuildAssistantMessageDeepSeekReasoningContent:
+    """_build_assistant_message pins replay-safe DeepSeek tool-call state."""
+
+    def test_deepseek_tool_call_reasoning_is_backfilled_into_reasoning_content(self) -> None:
+        agent = _make_agent(provider="deepseek", model="deepseek-v4-flash")
+        assistant_message = SimpleNamespace(
+            content=None,
+            reasoning="DeepSeek tool-call reasoning",
+            reasoning_content=None,
+            reasoning_details=None,
+            codex_reasoning_items=None,
+            codex_message_items=None,
+            tool_calls=[
+                SimpleNamespace(
+                    id="call_1",
+                    call_id=None,
+                    response_item_id=None,
+                    type="function",
+                    function=SimpleNamespace(name="terminal", arguments="{}"),
+                )
+            ],
+        )
+
+        msg = agent._build_assistant_message(assistant_message, "tool_calls")
+
+        assert msg["reasoning_content"] == "DeepSeek tool-call reasoning"
+        assert msg["tool_calls"][0]["id"] == "call_1"
+
+    def test_deepseek_model_extra_reasoning_content_is_preserved(self) -> None:
+        """OpenAI SDK stores unknown provider fields in model_extra."""
+        agent = _make_agent(provider="deepseek", model="deepseek-v4-flash")
+        assistant_message = SimpleNamespace(
+            content=None,
+            reasoning=None,
+            reasoning_content=None,
+            model_extra={"reasoning_content": "DeepSeek model_extra reasoning"},
+            reasoning_details=None,
+            codex_reasoning_items=None,
+            codex_message_items=None,
+            tool_calls=[
+                SimpleNamespace(
+                    id="call_1",
+                    call_id=None,
+                    response_item_id=None,
+                    type="function",
+                    function=SimpleNamespace(name="terminal", arguments="{}"),
+                )
+            ],
+        )
+
+        msg = agent._build_assistant_message(assistant_message, "tool_calls")
+
+        assert msg["reasoning_content"] == "DeepSeek model_extra reasoning"
+
+    def test_deepseek_tool_call_without_raw_reasoning_content_gets_empty_string(self) -> None:
+        agent = _make_agent(provider="deepseek", model="deepseek-v4-flash")
+        assistant_message = SimpleNamespace(
+            content=None,
+            reasoning=None,
+            reasoning_content=None,
+            reasoning_details=None,
+            codex_reasoning_items=None,
+            codex_message_items=None,
+            tool_calls=[
+                SimpleNamespace(
+                    id="call_1",
+                    call_id=None,
+                    response_item_id=None,
+                    type="function",
+                    function=SimpleNamespace(name="terminal", arguments="{}"),
+                )
+            ],
+        )
+
+        msg = agent._build_assistant_message(assistant_message, "tool_calls")
+
+        assert msg["reasoning_content"] == ""
+        assert msg["tool_calls"][0]["id"] == "call_1"
+
+
 class TestNeedsKimiToolReasoning:
     """The extracted _needs_kimi_tool_reasoning() helper keeps Kimi behavior intact."""
 
diff --git a/tests/run_agent/test_deepseek_v4_thinking_live.py b/tests/run_agent/test_deepseek_v4_thinking_live.py
new file mode 100644
index 0000000000..b938c27413
--- /dev/null
+++ b/tests/run_agent/test_deepseek_v4_thinking_live.py
@@ -0,0 +1,245 @@
+"""Live DeepSeek V4 thinking-mode tool-call replay smoke test.
+
+Opt-in only:
+    HERMES_LIVE_TESTS=1 pytest tests/run_agent/test_deepseek_v4_thinking_live.py -q
+
+Requires DEEPSEEK_API_KEY in the process environment. The key is captured at
+module import time because tests/conftest.py intentionally removes credential
+environment variables before each test body runs.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from typing import Any
+
+import pytest
+
+
+LIVE = os.environ.get("HERMES_LIVE_TESTS") == "1"
+DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
+LIVE_MODELS = ("deepseek-v4-flash", "deepseek-v4-pro")
+LIVE_BASE_URL = "https://api.deepseek.com"
+
+pytestmark = [
+    pytest.mark.skipif(not LIVE, reason="live-only: set HERMES_LIVE_TESTS=1"),
+    pytest.mark.skipif(not DEEPSEEK_KEY, reason="DEEPSEEK_API_KEY not configured"),
+]
+
+TOOL_NAME = "lookup_ticket_status"
+TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": TOOL_NAME,
+            "description": "Return the status for a test ticket id.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "ticket_id": {
+                        "type": "string",
+                        "description": "The ticket id to look up.",
+                    },
+                },
+                "required": ["ticket_id"],
+                "additionalProperties": False,
+            },
+        },
+    }
+]
+
+
+def _thinking_kwargs() -> dict:
+    return {
+        "reasoning_effort": "high",
+        "extra_body": {"thinking": {"type": "enabled"}},
+    }
+
+
+def _jsonable(value: Any) -> Any:
+    if hasattr(value, "model_dump"):
+        return value.model_dump(mode="json")
+    if isinstance(value, dict):
+        return {k: _jsonable(v) for k, v in value.items()}
+    if isinstance(value, list):
+        return [_jsonable(v) for v in value]
+    return value
+
+
+def _print_trace(label: str, value: Any) -> None:
+    sys.__stdout__.write(f"\n--- {label} ---\n")
+    sys.__stdout__.write(
+        json.dumps(_jsonable(value), ensure_ascii=False, indent=2, sort_keys=True)
+    )
+    sys.__stdout__.write("\n")
+    sys.__stdout__.flush()
+
+
+def _message_snapshot(message) -> dict:
+    return {
+        "content": getattr(message, "content", None),
+        "reasoning": getattr(message, "reasoning", None),
+        "reasoning_content": _raw_reasoning_content(message),
+        "model_extra": getattr(message, "model_extra", None),
+        "tool_calls": _jsonable(getattr(message, "tool_calls", None)),
+    }
+
+
+def _make_live_client():
+    from openai import OpenAI
+
+    return OpenAI(api_key=DEEPSEEK_KEY, base_url=LIVE_BASE_URL)
+
+
+def _make_agent_for_message_building(model: str):
+    from run_agent import AIAgent
+
+    agent = object.__new__(AIAgent)
+    agent.provider = "deepseek"
+    agent.model = model
+    agent.base_url = LIVE_BASE_URL
+    agent.verbose_logging = False
+    agent.reasoning_callback = None
+    agent.stream_delta_callback = None
+    agent._stream_callback = None
+    return agent
+
+
+def _raw_reasoning_content(message):
+    direct = getattr(message, "reasoning_content", None)
+    if direct is not None:
+        return direct
+    model_extra = getattr(message, "model_extra", None) or {}
+    if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
+        return model_extra["reasoning_content"]
+    return None
+
+
+@pytest.mark.parametrize("live_model", LIVE_MODELS)
+def test_deepseek_v4_thinking_tool_call_replay_round_trip(live_model: str):
+    """Hit DeepSeek twice and replay the assistant tool-call turn.
+
+    The first request forces a tool call with thinking enabled. The second
+    request replays that assistant message with content, reasoning_content,
+    and tool_calls, then appends the tool result. DeepSeek accepting the
+    second request is the live guardrail for the V4 thinking replay contract.
+    """
+
+    client = _make_live_client()
+    agent = _make_agent_for_message_building(live_model)
+
+    first_request = {
+        "model": live_model,
+        "messages": [
+            {
+                "role": "user",
+                "content": (
+                    "You must use the provided lookup_ticket_status tool "
+                    "exactly once with ticket_id 'DS-4242'. Do not answer "
+                    "directly."
+                ),
+            }
+        ],
+        "tools": TOOLS,
+        "max_tokens": 1024,
+        "timeout": 90,
+        **_thinking_kwargs(),
+    }
+    _print_trace(f"{live_model} first request", first_request)
+    first = client.chat.completions.create(**first_request)
+    _print_trace(f"{live_model} first raw response", first)
+
+    first_choice = first.choices[0]
+    first_message = first_choice.message
+    _print_trace(
+        f"{live_model} first assistant message",
+        {
+            "finish_reason": first_choice.finish_reason,
+            **_message_snapshot(first_message),
+        },
+    )
+    assert first_message.tool_calls, "DeepSeek did not return a tool call"
+    first_tool_call = first_message.tool_calls[0]
+    assert first_tool_call.function.name == TOOL_NAME
+    assert isinstance(json.loads(first_tool_call.function.arguments or "{}"), dict)
+
+    raw_reasoning_content = _raw_reasoning_content(first_message)
+    assert raw_reasoning_content is not None, (
+        "DeepSeek did not return reasoning_content; the thinking payload may "
+        "not have been honored"
+    )
+
+    stored_assistant = agent._build_assistant_message(
+        first_message,
+        first_choice.finish_reason or "tool_calls",
+    )
+    _print_trace(f"{live_model} stored assistant message", stored_assistant)
+    assert stored_assistant["reasoning_content"] == raw_reasoning_content
+
+    replay_assistant = {
+        "role": "assistant",
+        "content": stored_assistant.get("content") or "",
+        "tool_calls": stored_assistant["tool_calls"],
+    }
+    agent._copy_reasoning_content_for_api(stored_assistant, replay_assistant)
+    _print_trace(f"{live_model} replay assistant message", replay_assistant)
+
+    tool_call_id = stored_assistant["tool_calls"][0]["id"]
+    messages = [
+        {
+            "role": "user",
+            "content": (
+                "You must use the provided lookup_ticket_status tool "
+                "exactly once with ticket_id 'DS-4242'. Do not answer "
+                "directly."
+            ),
+        },
+        replay_assistant,
+        {
+            "role": "tool",
+            "tool_call_id": tool_call_id,
+            "content": json.dumps(
+                {"ticket_id": "DS-4242", "status": "green", "source": "live-test"},
+                separators=(",", ":"),
+            ),
+        },
+    ]
+
+    from agent.transports.chat_completions import ChatCompletionsTransport
+
+    api_messages = ChatCompletionsTransport().convert_messages(messages)
+    _print_trace(
+        f"{live_model} second request messages after transport conversion",
+        api_messages,
+    )
+    assert api_messages[1]["reasoning_content"] == raw_reasoning_content
+    assert "call_id" not in api_messages[1]["tool_calls"][0]
+    assert "response_item_id" not in api_messages[1]["tool_calls"][0]
+
+    second_request = {
+        "model": live_model,
+        "messages": api_messages,
+        "max_tokens": 1024,
+        "timeout": 90,
+        **_thinking_kwargs(),
+    }
+    _print_trace(f"{live_model} second request", second_request)
+    second = client.chat.completions.create(**second_request)
+    _print_trace(f"{live_model} second raw response", second)
+    _print_trace(
+        f"{live_model} second assistant message",
+        {
+            "finish_reason": second.choices[0].finish_reason,
+            **_message_snapshot(second.choices[0].message),
+        },
+    )
+
+    second_message = second.choices[0].message
+    final_content = second_message.content or ""
+    final_reasoning = _raw_reasoning_content(second_message) or ""
+    assert second.choices[0].finish_reason == "stop"
+    assert final_content.strip() or final_reasoning.strip(), (
+        "DeepSeek returned neither visible content nor reasoning_content"
+    )