"""Live DeepSeek V4 thinking-mode tool-call replay smoke test. Opt-in only: HERMES_LIVE_TESTS=1 pytest tests/run_agent/test_deepseek_v4_thinking_live.py -q Requires DEEPSEEK_API_KEY in the process environment. The key is captured at module import time because tests/conftest.py intentionally removes credential environment variables before each test body runs. """ from __future__ import annotations import json import os import sys from typing import Any import pytest LIVE = os.environ.get("HERMES_LIVE_TESTS") == "1" DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY", "") LIVE_MODELS = ("deepseek-v4-flash", "deepseek-v4-pro") LIVE_BASE_URL = "https://api.deepseek.com" pytestmark = [ pytest.mark.skipif(not LIVE, reason="live-only: set HERMES_LIVE_TESTS=1"), pytest.mark.skipif(not DEEPSEEK_KEY, reason="DEEPSEEK_API_KEY not configured"), ] TOOL_NAME = "lookup_ticket_status" TOOLS = [ { "type": "function", "function": { "name": TOOL_NAME, "description": "Return the status for a test ticket id.", "parameters": { "type": "object", "properties": { "ticket_id": { "type": "string", "description": "The ticket id to look up.", }, }, "required": ["ticket_id"], "additionalProperties": False, }, }, } ] def _thinking_kwargs() -> dict: return { "reasoning_effort": "high", "extra_body": {"thinking": {"type": "enabled"}}, } def _jsonable(value: Any) -> Any: if hasattr(value, "model_dump"): return value.model_dump(mode="json") if isinstance(value, dict): return {k: _jsonable(v) for k, v in value.items()} if isinstance(value, list): return [_jsonable(v) for v in value] return value def _print_trace(label: str, value: Any) -> None: sys.__stdout__.write(f"\n--- {label} ---\n") sys.__stdout__.write( json.dumps(_jsonable(value), ensure_ascii=False, indent=2, sort_keys=True) ) sys.__stdout__.write("\n") sys.__stdout__.flush() def _message_snapshot(message) -> dict: return { "content": getattr(message, "content", None), "reasoning": getattr(message, "reasoning", None), "reasoning_content": _raw_reasoning_content(message), "model_extra": getattr(message, "model_extra", None), "tool_calls": _jsonable(getattr(message, "tool_calls", None)), } def _make_live_client(): from openai import OpenAI return OpenAI(api_key=DEEPSEEK_KEY, base_url=LIVE_BASE_URL) def _make_agent_for_message_building(model: str): from run_agent import AIAgent agent = object.__new__(AIAgent) agent.provider = "deepseek" agent.model = model agent.base_url = LIVE_BASE_URL agent.verbose_logging = False agent.reasoning_callback = None agent.stream_delta_callback = None agent._stream_callback = None return agent def _raw_reasoning_content(message): direct = getattr(message, "reasoning_content", None) if direct is not None: return direct model_extra = getattr(message, "model_extra", None) or {} if isinstance(model_extra, dict) and "reasoning_content" in model_extra: return model_extra["reasoning_content"] return None @pytest.mark.parametrize("live_model", LIVE_MODELS) def test_deepseek_v4_thinking_tool_call_replay_round_trip(live_model: str): """Hit DeepSeek twice and replay the assistant tool-call turn. The first request forces a tool call with thinking enabled. The second request replays that assistant message with content, reasoning_content, and tool_calls, then appends the tool result. DeepSeek accepting the second request is the live guardrail for the V4 thinking replay contract. """ client = _make_live_client() agent = _make_agent_for_message_building(live_model) first_request = { "model": live_model, "messages": [ { "role": "user", "content": ( "You must use the provided lookup_ticket_status tool " "exactly once with ticket_id 'DS-4242'. Do not answer " "directly." ), } ], "tools": TOOLS, "max_tokens": 1024, "timeout": 90, **_thinking_kwargs(), } _print_trace(f"{live_model} first request", first_request) first = client.chat.completions.create(**first_request) _print_trace(f"{live_model} first raw response", first) first_choice = first.choices[0] first_message = first_choice.message _print_trace( f"{live_model} first assistant message", { "finish_reason": first_choice.finish_reason, **_message_snapshot(first_message), }, ) assert first_message.tool_calls, "DeepSeek did not return a tool call" first_tool_call = first_message.tool_calls[0] assert first_tool_call.function.name == TOOL_NAME assert isinstance(json.loads(first_tool_call.function.arguments or "{}"), dict) raw_reasoning_content = _raw_reasoning_content(first_message) assert raw_reasoning_content is not None, ( "DeepSeek did not return reasoning_content; the thinking payload may " "not have been honored" ) stored_assistant = agent._build_assistant_message( first_message, first_choice.finish_reason or "tool_calls", ) _print_trace(f"{live_model} stored assistant message", stored_assistant) assert stored_assistant["reasoning_content"] == raw_reasoning_content replay_assistant = { "role": "assistant", "content": stored_assistant.get("content") or "", "tool_calls": stored_assistant["tool_calls"], } agent._copy_reasoning_content_for_api(stored_assistant, replay_assistant) _print_trace(f"{live_model} replay assistant message", replay_assistant) tool_call_id = stored_assistant["tool_calls"][0]["id"] messages = [ { "role": "user", "content": ( "You must use the provided lookup_ticket_status tool " "exactly once with ticket_id 'DS-4242'. Do not answer " "directly." ), }, replay_assistant, { "role": "tool", "tool_call_id": tool_call_id, "content": json.dumps( {"ticket_id": "DS-4242", "status": "green", "source": "live-test"}, separators=(",", ":"), ), }, ] from agent.transports.chat_completions import ChatCompletionsTransport api_messages = ChatCompletionsTransport().convert_messages(messages) _print_trace( f"{live_model} second request messages after transport conversion", api_messages, ) assert api_messages[1]["reasoning_content"] == raw_reasoning_content assert "call_id" not in api_messages[1]["tool_calls"][0] assert "response_item_id" not in api_messages[1]["tool_calls"][0] second_request = { "model": live_model, "messages": api_messages, "max_tokens": 1024, "timeout": 90, **_thinking_kwargs(), } _print_trace(f"{live_model} second request", second_request) second = client.chat.completions.create(**second_request) _print_trace(f"{live_model} second raw response", second) _print_trace( f"{live_model} second assistant message", { "finish_reason": second.choices[0].finish_reason, **_message_snapshot(second.choices[0].message), }, ) second_message = second.choices[0].message final_content = second_message.content or "" final_reasoning = _raw_reasoning_content(second_message) or "" assert second.choices[0].finish_reason == "stop" assert final_content.strip() or final_reasoning.strip(), ( "DeepSeek returned neither visible content nor reasoning_content" )