From b9b9ee3e6c04df01a8c13b634318fe7d60e2602a Mon Sep 17 00:00:00 2001 From: lsdsjy Date: Tue, 28 Apr 2026 14:17:11 +0800 Subject: [PATCH] fix(deepseek): preserve v4 reasoning_content on replay --- agent/transports/chat_completions.py | 6 +- run_agent.py | 44 ++-- .../agent/transports/test_chat_completions.py | 35 +++ .../test_deepseek_reasoning_content_echo.py | 101 +++++++- .../test_deepseek_v4_thinking_live.py | 245 ++++++++++++++++++ 5 files changed, 398 insertions(+), 33 deletions(-) create mode 100644 tests/run_agent/test_deepseek_v4_thinking_live.py diff --git a/agent/transports/chat_completions.py b/agent/transports/chat_completions.py index 0da01fedf9..9a115e4547 100644 --- a/agent/transports/chat_completions.py +++ b/agent/transports/chat_completions.py @@ -477,9 +477,13 @@ class ChatCompletionsTransport(ProviderTransport): # so keep them apart in provider_data rather than merging. reasoning = getattr(msg, "reasoning", None) reasoning_content = getattr(msg, "reasoning_content", None) + if reasoning_content is None and hasattr(msg, "model_extra"): + model_extra = getattr(msg, "model_extra", None) or {} + if isinstance(model_extra, dict) and "reasoning_content" in model_extra: + reasoning_content = model_extra["reasoning_content"] provider_data: Dict[str, Any] = {} - if reasoning_content: + if reasoning_content is not None: provider_data["reasoning_content"] = reasoning_content rd = getattr(msg, "reasoning_details", None) if rd: diff --git a/run_agent.py b/run_agent.py index c9801b6795..44abe31dc4 100644 --- a/run_agent.py +++ b/run_agent.py @@ -8501,6 +8501,7 @@ class AIAgent: Handles reasoning extraction, reasoning_details, and optional tool_calls so both the tool-call path and the final-response path share one builder. """ + assistant_tool_calls = getattr(assistant_message, "tool_calls", None) reasoning_text = self._extract_reasoning(assistant_message) _from_structured = bool(reasoning_text) @@ -8560,16 +8561,19 @@ class AIAgent: "finish_reason": finish_reason, } - if hasattr(assistant_message, "reasoning_content"): - raw_reasoning_content = getattr(assistant_message, "reasoning_content", None) - if raw_reasoning_content is not None: - msg["reasoning_content"] = _sanitize_surrogates(raw_reasoning_content) - elif msg.get("tool_calls") and self._needs_deepseek_tool_reasoning(): - # DeepSeek thinking mode requires reasoning_content on every - # assistant tool-call message. Without it, replaying the - # persisted message causes HTTP 400. Include empty string - # as a defensive compatibility fallback (refs #15250). - msg["reasoning_content"] = "" + raw_reasoning_content = getattr(assistant_message, "reasoning_content", None) + if raw_reasoning_content is None and hasattr(assistant_message, "model_extra"): + model_extra = getattr(assistant_message, "model_extra", None) or {} + if isinstance(model_extra, dict) and "reasoning_content" in model_extra: + raw_reasoning_content = model_extra["reasoning_content"] + if raw_reasoning_content is not None: + msg["reasoning_content"] = _sanitize_surrogates(raw_reasoning_content) + elif assistant_tool_calls and self._needs_deepseek_tool_reasoning(): + # DeepSeek thinking mode requires reasoning_content on every + # assistant tool-call message. Without it, replaying the + # persisted message causes HTTP 400. Include empty string + # only when no structured reasoning text was captured. + msg["reasoning_content"] = reasoning_text or "" # Additive fallback (refs #16844, #16884). Streaming-only providers # (glm, MiniMax, gpt-5.x via aigw, Anthropic via openai-compat shims) @@ -8626,9 +8630,9 @@ class AIAgent: if codex_message_items: msg["codex_message_items"] = codex_message_items - if assistant_message.tool_calls: + if assistant_tool_calls: tool_calls = [] - for tool_call in assistant_message.tool_calls: + for tool_call in assistant_tool_calls: raw_id = getattr(tool_call, "id", None) call_id = getattr(tool_call, "call_id", None) if not isinstance(call_id, str) or not call_id.strip(): @@ -8728,11 +8732,11 @@ class AIAgent: # if the source turn has tool_calls AND a 'reasoning' field but no # 'reasoning_content' key, the 'reasoning' text was written by a # prior provider (e.g. MiniMax) — DeepSeek's own _build_assistant_message - # always pins reasoning_content="" at creation time for tool-call turns, - # so the shape (reasoning set, reasoning_content absent, tool_calls - # present) is unreachable from same-provider DeepSeek history. Inject - # "" to satisfy the API without leaking another provider's chain of - # thought to DeepSeek/Kimi. + # pins reasoning_content at creation time for tool-call turns, so the + # shape (reasoning set, reasoning_content absent, tool_calls present) + # is unreachable from same-provider DeepSeek history after this fix. + # Inject "" to satisfy the API without leaking another provider's + # chain of thought to DeepSeek/Kimi. normalized_reasoning = source_msg.get("reasoning") if ( needs_thinking_pad @@ -8745,9 +8749,9 @@ class AIAgent: # 3. Healthy session: promote 'reasoning' field to 'reasoning_content' # for providers that use the internal 'reasoning' key. - # This must happen BEFORE the DeepSeek/Kimi tool-call check so that - # genuine reasoning content is not overwritten by the empty-string - # fallback (#15812 regression in PR #15478). + # This must happen before the unconditional empty-string fallback so + # genuine reasoning content is not overwritten (#15812 regression in + # PR #15478). if isinstance(normalized_reasoning, str) and normalized_reasoning: api_msg["reasoning_content"] = normalized_reasoning return diff --git a/tests/agent/transports/test_chat_completions.py b/tests/agent/transports/test_chat_completions.py index 66aa7e9058..b8fdced8aa 100644 --- a/tests/agent/transports/test_chat_completions.py +++ b/tests/agent/transports/test_chat_completions.py @@ -620,6 +620,41 @@ class TestChatCompletionsNormalize: assert nr.reasoning == "summary text" assert nr.provider_data == {"reasoning_content": "detailed scratchpad"} + def test_empty_reasoning_content_preserved(self, transport): + """DeepSeek can require an explicit empty reasoning_content replay field.""" + r = SimpleNamespace( + choices=[SimpleNamespace( + message=SimpleNamespace( + content=None, + tool_calls=None, + reasoning=None, + reasoning_content="", + ), + finish_reason="stop", + )], + usage=None, + ) + nr = transport.normalize_response(r) + assert nr.provider_data == {"reasoning_content": ""} + assert nr.reasoning_content == "" + + def test_reasoning_content_preserved_from_model_extra(self, transport): + """OpenAI SDK can expose provider-specific DeepSeek fields via model_extra.""" + r = SimpleNamespace( + choices=[SimpleNamespace( + message=SimpleNamespace( + content=None, + tool_calls=None, + reasoning=None, + model_extra={"reasoning_content": "model-extra scratchpad"}, + ), + finish_reason="stop", + )], + usage=None, + ) + nr = transport.normalize_response(r) + assert nr.provider_data == {"reasoning_content": "model-extra scratchpad"} + class TestChatCompletionsCacheStats: diff --git a/tests/run_agent/test_deepseek_reasoning_content_echo.py b/tests/run_agent/test_deepseek_reasoning_content_echo.py index a3f1cf8bb1..edae5efce4 100644 --- a/tests/run_agent/test_deepseek_reasoning_content_echo.py +++ b/tests/run_agent/test_deepseek_reasoning_content_echo.py @@ -23,6 +23,8 @@ Refs #15250 / #15353. from __future__ import annotations +from types import SimpleNamespace + import pytest from run_agent import AIAgent @@ -33,6 +35,10 @@ def _make_agent(provider: str = "", model: str = "", base_url: str = "") -> AIAg agent.provider = provider agent.model = model agent.base_url = base_url + agent.verbose_logging = False + agent.reasoning_callback = None + agent.stream_delta_callback = None + agent._stream_callback = None return agent @@ -109,16 +115,7 @@ class TestCopyReasoningContentForApi: assert api_msg["reasoning_content"] == "real chain of thought" def test_deepseek_reasoning_field_promoted(self) -> None: - """When only 'reasoning' is set (no tool_calls), it gets promoted to reasoning_content. - - On DeepSeek/Kimi, tool-call turns with 'reasoning' but no - 'reasoning_content' are treated as cross-provider poisoned history - (#15748) and padded with "" instead of promoted. Same-provider - DeepSeek tool-call turns always have reasoning_content pinned at - creation time by _build_assistant_message, so the (reasoning-set, - reasoning_content-absent, tool_calls-present) shape is unreachable - from same-provider history. - """ + """When only 'reasoning' is set, it gets promoted to reasoning_content.""" agent = _make_agent(provider="deepseek", model="deepseek-v4-flash") source = { "role": "assistant", @@ -135,8 +132,8 @@ class TestCopyReasoningContentForApi: If the source turn has tool_calls AND a 'reasoning' field but NO 'reasoning_content' key, it's from a prior provider (the DeepSeek - build path always pins reasoning_content="" at creation). Inject - "" instead of forwarding the prior provider's chain of thought. + build path pins reasoning_content at creation). Inject "" instead + of forwarding the prior provider's chain of thought. """ agent = _make_agent(provider="deepseek", model="deepseek-v4-flash") source = { @@ -228,6 +225,86 @@ class TestCopyReasoningContentForApi: assert "reasoning_content" not in api_msg +class TestBuildAssistantMessageDeepSeekReasoningContent: + """_build_assistant_message pins replay-safe DeepSeek tool-call state.""" + + def test_deepseek_tool_call_reasoning_is_backfilled_into_reasoning_content(self) -> None: + agent = _make_agent(provider="deepseek", model="deepseek-v4-flash") + assistant_message = SimpleNamespace( + content=None, + reasoning="DeepSeek tool-call reasoning", + reasoning_content=None, + reasoning_details=None, + codex_reasoning_items=None, + codex_message_items=None, + tool_calls=[ + SimpleNamespace( + id="call_1", + call_id=None, + response_item_id=None, + type="function", + function=SimpleNamespace(name="terminal", arguments="{}"), + ) + ], + ) + + msg = agent._build_assistant_message(assistant_message, "tool_calls") + + assert msg["reasoning_content"] == "DeepSeek tool-call reasoning" + assert msg["tool_calls"][0]["id"] == "call_1" + + def test_deepseek_model_extra_reasoning_content_is_preserved(self) -> None: + """OpenAI SDK stores unknown provider fields in model_extra.""" + agent = _make_agent(provider="deepseek", model="deepseek-v4-flash") + assistant_message = SimpleNamespace( + content=None, + reasoning=None, + reasoning_content=None, + model_extra={"reasoning_content": "DeepSeek model_extra reasoning"}, + reasoning_details=None, + codex_reasoning_items=None, + codex_message_items=None, + tool_calls=[ + SimpleNamespace( + id="call_1", + call_id=None, + response_item_id=None, + type="function", + function=SimpleNamespace(name="terminal", arguments="{}"), + ) + ], + ) + + msg = agent._build_assistant_message(assistant_message, "tool_calls") + + assert msg["reasoning_content"] == "DeepSeek model_extra reasoning" + + def test_deepseek_tool_call_without_raw_reasoning_content_gets_empty_string(self) -> None: + agent = _make_agent(provider="deepseek", model="deepseek-v4-flash") + assistant_message = SimpleNamespace( + content=None, + reasoning=None, + reasoning_content=None, + reasoning_details=None, + codex_reasoning_items=None, + codex_message_items=None, + tool_calls=[ + SimpleNamespace( + id="call_1", + call_id=None, + response_item_id=None, + type="function", + function=SimpleNamespace(name="terminal", arguments="{}"), + ) + ], + ) + + msg = agent._build_assistant_message(assistant_message, "tool_calls") + + assert msg["reasoning_content"] == "" + assert msg["tool_calls"][0]["id"] == "call_1" + + class TestNeedsKimiToolReasoning: """The extracted _needs_kimi_tool_reasoning() helper keeps Kimi behavior intact.""" diff --git a/tests/run_agent/test_deepseek_v4_thinking_live.py b/tests/run_agent/test_deepseek_v4_thinking_live.py new file mode 100644 index 0000000000..b938c27413 --- /dev/null +++ b/tests/run_agent/test_deepseek_v4_thinking_live.py @@ -0,0 +1,245 @@ +"""Live DeepSeek V4 thinking-mode tool-call replay smoke test. + +Opt-in only: + HERMES_LIVE_TESTS=1 pytest tests/run_agent/test_deepseek_v4_thinking_live.py -q + +Requires DEEPSEEK_API_KEY in the process environment. The key is captured at +module import time because tests/conftest.py intentionally removes credential +environment variables before each test body runs. +""" + +from __future__ import annotations + +import json +import os +import sys +from typing import Any + +import pytest + + +LIVE = os.environ.get("HERMES_LIVE_TESTS") == "1" +DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY", "") +LIVE_MODELS = ("deepseek-v4-flash", "deepseek-v4-pro") +LIVE_BASE_URL = "https://api.deepseek.com" + +pytestmark = [ + pytest.mark.skipif(not LIVE, reason="live-only: set HERMES_LIVE_TESTS=1"), + pytest.mark.skipif(not DEEPSEEK_KEY, reason="DEEPSEEK_API_KEY not configured"), +] + +TOOL_NAME = "lookup_ticket_status" +TOOLS = [ + { + "type": "function", + "function": { + "name": TOOL_NAME, + "description": "Return the status for a test ticket id.", + "parameters": { + "type": "object", + "properties": { + "ticket_id": { + "type": "string", + "description": "The ticket id to look up.", + }, + }, + "required": ["ticket_id"], + "additionalProperties": False, + }, + }, + } +] + + +def _thinking_kwargs() -> dict: + return { + "reasoning_effort": "high", + "extra_body": {"thinking": {"type": "enabled"}}, + } + + +def _jsonable(value: Any) -> Any: + if hasattr(value, "model_dump"): + return value.model_dump(mode="json") + if isinstance(value, dict): + return {k: _jsonable(v) for k, v in value.items()} + if isinstance(value, list): + return [_jsonable(v) for v in value] + return value + + +def _print_trace(label: str, value: Any) -> None: + sys.__stdout__.write(f"\n--- {label} ---\n") + sys.__stdout__.write( + json.dumps(_jsonable(value), ensure_ascii=False, indent=2, sort_keys=True) + ) + sys.__stdout__.write("\n") + sys.__stdout__.flush() + + +def _message_snapshot(message) -> dict: + return { + "content": getattr(message, "content", None), + "reasoning": getattr(message, "reasoning", None), + "reasoning_content": _raw_reasoning_content(message), + "model_extra": getattr(message, "model_extra", None), + "tool_calls": _jsonable(getattr(message, "tool_calls", None)), + } + + +def _make_live_client(): + from openai import OpenAI + + return OpenAI(api_key=DEEPSEEK_KEY, base_url=LIVE_BASE_URL) + + +def _make_agent_for_message_building(model: str): + from run_agent import AIAgent + + agent = object.__new__(AIAgent) + agent.provider = "deepseek" + agent.model = model + agent.base_url = LIVE_BASE_URL + agent.verbose_logging = False + agent.reasoning_callback = None + agent.stream_delta_callback = None + agent._stream_callback = None + return agent + + +def _raw_reasoning_content(message): + direct = getattr(message, "reasoning_content", None) + if direct is not None: + return direct + model_extra = getattr(message, "model_extra", None) or {} + if isinstance(model_extra, dict) and "reasoning_content" in model_extra: + return model_extra["reasoning_content"] + return None + + +@pytest.mark.parametrize("live_model", LIVE_MODELS) +def test_deepseek_v4_thinking_tool_call_replay_round_trip(live_model: str): + """Hit DeepSeek twice and replay the assistant tool-call turn. + + The first request forces a tool call with thinking enabled. The second + request replays that assistant message with content, reasoning_content, + and tool_calls, then appends the tool result. DeepSeek accepting the + second request is the live guardrail for the V4 thinking replay contract. + """ + + client = _make_live_client() + agent = _make_agent_for_message_building(live_model) + + first_request = { + "model": live_model, + "messages": [ + { + "role": "user", + "content": ( + "You must use the provided lookup_ticket_status tool " + "exactly once with ticket_id 'DS-4242'. Do not answer " + "directly." + ), + } + ], + "tools": TOOLS, + "max_tokens": 1024, + "timeout": 90, + **_thinking_kwargs(), + } + _print_trace(f"{live_model} first request", first_request) + first = client.chat.completions.create(**first_request) + _print_trace(f"{live_model} first raw response", first) + + first_choice = first.choices[0] + first_message = first_choice.message + _print_trace( + f"{live_model} first assistant message", + { + "finish_reason": first_choice.finish_reason, + **_message_snapshot(first_message), + }, + ) + assert first_message.tool_calls, "DeepSeek did not return a tool call" + first_tool_call = first_message.tool_calls[0] + assert first_tool_call.function.name == TOOL_NAME + assert isinstance(json.loads(first_tool_call.function.arguments or "{}"), dict) + + raw_reasoning_content = _raw_reasoning_content(first_message) + assert raw_reasoning_content is not None, ( + "DeepSeek did not return reasoning_content; the thinking payload may " + "not have been honored" + ) + + stored_assistant = agent._build_assistant_message( + first_message, + first_choice.finish_reason or "tool_calls", + ) + _print_trace(f"{live_model} stored assistant message", stored_assistant) + assert stored_assistant["reasoning_content"] == raw_reasoning_content + + replay_assistant = { + "role": "assistant", + "content": stored_assistant.get("content") or "", + "tool_calls": stored_assistant["tool_calls"], + } + agent._copy_reasoning_content_for_api(stored_assistant, replay_assistant) + _print_trace(f"{live_model} replay assistant message", replay_assistant) + + tool_call_id = stored_assistant["tool_calls"][0]["id"] + messages = [ + { + "role": "user", + "content": ( + "You must use the provided lookup_ticket_status tool " + "exactly once with ticket_id 'DS-4242'. Do not answer " + "directly." + ), + }, + replay_assistant, + { + "role": "tool", + "tool_call_id": tool_call_id, + "content": json.dumps( + {"ticket_id": "DS-4242", "status": "green", "source": "live-test"}, + separators=(",", ":"), + ), + }, + ] + + from agent.transports.chat_completions import ChatCompletionsTransport + + api_messages = ChatCompletionsTransport().convert_messages(messages) + _print_trace( + f"{live_model} second request messages after transport conversion", + api_messages, + ) + assert api_messages[1]["reasoning_content"] == raw_reasoning_content + assert "call_id" not in api_messages[1]["tool_calls"][0] + assert "response_item_id" not in api_messages[1]["tool_calls"][0] + + second_request = { + "model": live_model, + "messages": api_messages, + "max_tokens": 1024, + "timeout": 90, + **_thinking_kwargs(), + } + _print_trace(f"{live_model} second request", second_request) + second = client.chat.completions.create(**second_request) + _print_trace(f"{live_model} second raw response", second) + _print_trace( + f"{live_model} second assistant message", + { + "finish_reason": second.choices[0].finish_reason, + **_message_snapshot(second.choices[0].message), + }, + ) + + second_message = second.choices[0].message + final_content = second_message.content or "" + final_reasoning = _raw_reasoning_content(second_message) or "" + assert second.choices[0].finish_reason == "stop" + assert final_content.strip() or final_reasoning.strip(), ( + "DeepSeek returned neither visible content nor reasoning_content" + )