mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-03 02:11:48 +00:00
fix(deepseek): preserve v4 reasoning_content on replay
This commit is contained in:
parent
8fbc9d7d78
commit
b9b9ee3e6c
5 changed files with 398 additions and 33 deletions
245
tests/run_agent/test_deepseek_v4_thinking_live.py
Normal file
245
tests/run_agent/test_deepseek_v4_thinking_live.py
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
"""Live DeepSeek V4 thinking-mode tool-call replay smoke test.
|
||||
|
||||
Opt-in only:
|
||||
HERMES_LIVE_TESTS=1 pytest tests/run_agent/test_deepseek_v4_thinking_live.py -q
|
||||
|
||||
Requires DEEPSEEK_API_KEY in the process environment. The key is captured at
|
||||
module import time because tests/conftest.py intentionally removes credential
|
||||
environment variables before each test body runs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Live-test gating. Both env vars are read at module import time because
# tests/conftest.py strips credential environment variables before each
# test body runs (see the module docstring).
LIVE = os.environ.get("HERMES_LIVE_TESTS") == "1"
DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
# Both V4 variants are exercised via parametrize on the test below.
LIVE_MODELS = ("deepseek-v4-flash", "deepseek-v4-pro")
LIVE_BASE_URL = "https://api.deepseek.com"

# Skip the whole module unless live tests are opted in AND a key is present.
pytestmark = [
    pytest.mark.skipif(not LIVE, reason="live-only: set HERMES_LIVE_TESTS=1"),
    pytest.mark.skipif(not DEEPSEEK_KEY, reason="DEEPSEEK_API_KEY not configured"),
]
|
||||
|
||||
# The single tool the model is forced to call. The schema follows the
# OpenAI chat-completions "function" tool format; the first request's
# prompt instructs the model to call it exactly once with ticket 'DS-4242'.
TOOL_NAME = "lookup_ticket_status"
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": TOOL_NAME,
            "description": "Return the status for a test ticket id.",
            "parameters": {
                "type": "object",
                "properties": {
                    "ticket_id": {
                        "type": "string",
                        "description": "The ticket id to look up.",
                    },
                },
                "required": ["ticket_id"],
                "additionalProperties": False,
            },
        },
    }
]
|
||||
|
||||
|
||||
def _thinking_kwargs() -> dict:
|
||||
return {
|
||||
"reasoning_effort": "high",
|
||||
"extra_body": {"thinking": {"type": "enabled"}},
|
||||
}
|
||||
|
||||
|
||||
def _jsonable(value: Any) -> Any:
|
||||
if hasattr(value, "model_dump"):
|
||||
return value.model_dump(mode="json")
|
||||
if isinstance(value, dict):
|
||||
return {k: _jsonable(v) for k, v in value.items()}
|
||||
if isinstance(value, list):
|
||||
return [_jsonable(v) for v in value]
|
||||
return value
|
||||
|
||||
|
||||
def _print_trace(label: str, value: Any) -> None:
    """Dump *value* as pretty JSON to the real stdout, bypassing capture."""
    out = sys.__stdout__
    rendered = json.dumps(
        _jsonable(value), ensure_ascii=False, indent=2, sort_keys=True
    )
    out.write(f"\n--- {label} ---\n")
    out.write(rendered)
    out.write("\n")
    out.flush()
|
||||
|
||||
|
||||
def _message_snapshot(message) -> dict:
    """Collect the assistant-message fields relevant to the replay trace."""

    def grab(name: str):
        return getattr(message, name, None)

    return {
        "content": grab("content"),
        "reasoning": grab("reasoning"),
        "reasoning_content": _raw_reasoning_content(message),
        "model_extra": grab("model_extra"),
        "tool_calls": _jsonable(grab("tool_calls")),
    }
|
||||
|
||||
|
||||
def _make_live_client():
    """Build an OpenAI-compatible client pointed at the live DeepSeek API."""
    from openai import OpenAI

    client = OpenAI(api_key=DEEPSEEK_KEY, base_url=LIVE_BASE_URL)
    return client
|
||||
|
||||
|
||||
def _make_agent_for_message_building(model: str):
    """Create a bare AIAgent configured only for message building."""
    from run_agent import AIAgent

    # object.__new__ bypasses AIAgent.__init__ and any side effects it has;
    # we assign just the attributes the message-building paths read.
    agent = object.__new__(AIAgent)
    attrs = {
        "provider": "deepseek",
        "model": model,
        "base_url": LIVE_BASE_URL,
        "verbose_logging": False,
        "reasoning_callback": None,
        "stream_delta_callback": None,
        "_stream_callback": None,
    }
    for name, value in attrs.items():
        setattr(agent, name, value)
    return agent
|
||||
|
||||
|
||||
def _raw_reasoning_content(message):
|
||||
direct = getattr(message, "reasoning_content", None)
|
||||
if direct is not None:
|
||||
return direct
|
||||
model_extra = getattr(message, "model_extra", None) or {}
|
||||
if isinstance(model_extra, dict) and "reasoning_content" in model_extra:
|
||||
return model_extra["reasoning_content"]
|
||||
return None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("live_model", LIVE_MODELS)
def test_deepseek_v4_thinking_tool_call_replay_round_trip(live_model: str):
    """Hit DeepSeek twice and replay the assistant tool-call turn.

    The first request forces a tool call with thinking enabled. The second
    request replays that assistant message with content, reasoning_content,
    and tool_calls, then appends the tool result. DeepSeek accepting the
    second request is the live guardrail for the V4 thinking replay contract.
    """

    client = _make_live_client()
    agent = _make_agent_for_message_building(live_model)

    # The same forcing prompt is used in both requests; keeping a single
    # copy guarantees the replayed conversation cannot drift from the
    # original first turn (the literal was previously duplicated).
    user_prompt = (
        "You must use the provided lookup_ticket_status tool "
        "exactly once with ticket_id 'DS-4242'. Do not answer "
        "directly."
    )

    first_request = {
        "model": live_model,
        "messages": [{"role": "user", "content": user_prompt}],
        "tools": TOOLS,
        "max_tokens": 1024,
        "timeout": 90,
        **_thinking_kwargs(),
    }
    _print_trace(f"{live_model} first request", first_request)
    first = client.chat.completions.create(**first_request)
    _print_trace(f"{live_model} first raw response", first)

    first_choice = first.choices[0]
    first_message = first_choice.message
    _print_trace(
        f"{live_model} first assistant message",
        {
            "finish_reason": first_choice.finish_reason,
            **_message_snapshot(first_message),
        },
    )
    # The model must have obeyed the tool-forcing prompt.
    assert first_message.tool_calls, "DeepSeek did not return a tool call"
    first_tool_call = first_message.tool_calls[0]
    assert first_tool_call.function.name == TOOL_NAME
    assert isinstance(json.loads(first_tool_call.function.arguments or "{}"), dict)

    raw_reasoning_content = _raw_reasoning_content(first_message)
    assert raw_reasoning_content is not None, (
        "DeepSeek did not return reasoning_content; the thinking payload may "
        "not have been honored"
    )

    # Build the stored form of the assistant turn exactly as the agent would.
    stored_assistant = agent._build_assistant_message(
        first_message,
        first_choice.finish_reason or "tool_calls",
    )
    _print_trace(f"{live_model} stored assistant message", stored_assistant)
    assert stored_assistant["reasoning_content"] == raw_reasoning_content

    replay_assistant = {
        "role": "assistant",
        "content": stored_assistant.get("content") or "",
        "tool_calls": stored_assistant["tool_calls"],
    }
    agent._copy_reasoning_content_for_api(stored_assistant, replay_assistant)
    _print_trace(f"{live_model} replay assistant message", replay_assistant)

    tool_call_id = stored_assistant["tool_calls"][0]["id"]
    messages = [
        {"role": "user", "content": user_prompt},
        replay_assistant,
        {
            "role": "tool",
            "tool_call_id": tool_call_id,
            "content": json.dumps(
                {"ticket_id": "DS-4242", "status": "green", "source": "live-test"},
                separators=(",", ":"),
            ),
        },
    ]

    from agent.transports.chat_completions import ChatCompletionsTransport

    api_messages = ChatCompletionsTransport().convert_messages(messages)
    _print_trace(
        f"{live_model} second request messages after transport conversion",
        api_messages,
    )
    # reasoning_content must survive transport conversion, and internal
    # Responses-style ids must not leak into the wire payload.
    assert api_messages[1]["reasoning_content"] == raw_reasoning_content
    assert "call_id" not in api_messages[1]["tool_calls"][0]
    assert "response_item_id" not in api_messages[1]["tool_calls"][0]

    second_request = {
        "model": live_model,
        "messages": api_messages,
        "max_tokens": 1024,
        "timeout": 90,
        **_thinking_kwargs(),
    }
    _print_trace(f"{live_model} second request", second_request)
    second = client.chat.completions.create(**second_request)
    _print_trace(f"{live_model} second raw response", second)
    _print_trace(
        f"{live_model} second assistant message",
        {
            "finish_reason": second.choices[0].finish_reason,
            **_message_snapshot(second.choices[0].message),
        },
    )

    # DeepSeek accepting the replayed turn and finishing normally is the
    # actual contract under test.
    second_message = second.choices[0].message
    final_content = second_message.content or ""
    final_reasoning = _raw_reasoning_content(second_message) or ""
    assert second.choices[0].finish_reason == "stop"
    assert final_content.strip() or final_reasoning.strip(), (
        "DeepSeek returned neither visible content nor reasoning_content"
    )
|
||||
Loading…
Add table
Add a link
Reference in a new issue