From 68754719165265e3dc152f10cabf9f46a4b45122 Mon Sep 17 00:00:00 2001
From: lhysdl <lhysdl@gmail.com>
Date: Sun, 26 Apr 2026 16:46:09 +0800
Subject: [PATCH] fix(tts): update MiniMax API endpoint to v1/text_to_speech

MiniMax deprecated the old v1/t2a_v2 endpoint (api.minimax.io) and
moved to v1/text_to_speech (api.minimax.chat). The new API:

- Uses a flat payload: {model, text, voice_id} instead of nested
  voice_setting / audio_setting objects
- Returns raw audio bytes (Content-Type: audio/mpeg) instead of
  JSON with hex-encoded audio
- Uses model 'speech-01' instead of 'speech-2.8-hd'
- Uses default voice_id 'female-shaonv' (for Chinese TTS) instead of
  'English_Graceful_Lady'

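The implementation inspects the response Content-Type to handle both
old and new API responses, so the JSON / hex-encoded responses returned
to users who manually configured the legacy base_url are still parsed.
Note that this backward compatibility covers response parsing only: the
request is always sent with the new flat payload, and the speed, vol,
pitch and audio_setting (format / sample rate / bitrate) options are no
longer included in it.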
---
 tests/tools/test_tts_speed.py | 39 ++++++++++++-----------
 tools/tts_tool.py             | 60 ++++++++++++++++-------------------
 2 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/tests/tools/test_tts_speed.py b/tests/tools/test_tts_speed.py
index 7622a7f622..8a3866aaa8 100644
--- a/tests/tools/test_tts_speed.py
+++ b/tests/tools/test_tts_speed.py
@@ -110,7 +110,7 @@ class TestOpenaiTtsSpeed:
 
 
 # ---------------------------------------------------------------------------
-# MiniMax TTS speed (global fallback wired)
+# MiniMax TTS (new API: raw audio, no speed/voice_setting)
 # ---------------------------------------------------------------------------
 
 class TestMinimaxTtsSpeed:
@@ -118,28 +118,29 @@ class TestMinimaxTtsSpeed:
         monkeypatch.setenv("MINIMAX_API_KEY", "test-key")
         mock_response = MagicMock()
         mock_response.status_code = 200
-        mock_response.json.return_value = {
-            "data": {"audio": "deadbeef"},
-            "base_resp": {"status_code": 0, "status_msg": "success"},
-            "extra_info": {"audio_size": 8},
-        }
+        mock_response.headers = {"Content-Type": "audio/mpeg"}
+        mock_response.content = b"\x00\x01\x02\x03"
 
         # requests is imported locally inside _generate_minimax_tts
         with patch("requests.post", return_value=mock_response) as mock_post:
             from tools.tts_tool import _generate_minimax_tts
-            _generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), tts_config)
-        return mock_post
+            output = _generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), tts_config)
+        return mock_post, output
 
-    def test_global_speed_fallback(self, tmp_path, monkeypatch):
-        """Global tts.speed used when minimax.speed not set."""
-        mock_post = self._run({"speed": 1.5}, tmp_path, monkeypatch)
+    def test_simple_payload(self, tmp_path, monkeypatch):
+        """New API uses flat payload with model, text, voice_id."""
+        mock_post, _ = self._run({}, tmp_path, monkeypatch)
         payload = mock_post.call_args[1]["json"]
-        assert payload["voice_setting"]["speed"] == 1.5
+        assert "model" in payload
+        assert "text" in payload
+        assert "voice_id" in payload
+        assert "voice_setting" not in payload
+        assert "audio_setting" not in payload
+        assert "stream" not in payload
 
-    def test_provider_speed_overrides_global(self, tmp_path, monkeypatch):
-        """tts.minimax.speed takes precedence over tts.speed."""
-        mock_post = self._run(
-            {"speed": 1.5, "minimax": {"speed": 2.0}}, tmp_path, monkeypatch
-        )
-        payload = mock_post.call_args[1]["json"]
-        assert payload["voice_setting"]["speed"] == 2.0
+    def test_writes_raw_audio(self, tmp_path, monkeypatch):
+        """New API returns raw bytes written directly to file."""
+        _, output = self._run({}, tmp_path, monkeypatch)
+        assert output == str(tmp_path / "out.mp3")
+        with open(output, "rb") as f:
+            assert f.read() == b"\x00\x01\x02\x03"
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index 7473b32a1d..8b82e1665b 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -136,9 +136,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper"
 DEFAULT_PIPER_VOICE = "en_US-lessac-medium"  # balanced size/quality
 DEFAULT_OPENAI_VOICE = "alloy"
 DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
-DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
-DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady"
-DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2"
+DEFAULT_MINIMAX_MODEL = "speech-01"
+DEFAULT_MINIMAX_VOICE_ID = "female-shaonv"
+DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.chat/v1/text_to_speech"
 DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
 DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8"  # Paul - Neutral
 DEFAULT_XAI_VOICE_ID = "eve"
@@ -925,10 +925,11 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -
 # ===========================================================================
 def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
     """
-    Generate audio using MiniMax TTS API.
+    Generate audio using MiniMax TTS API (v1/text_to_speech).
 
-    MiniMax returns hex-encoded audio data. Supports streaming (SSE) and
-    non-streaming modes. This implementation uses non-streaming for simplicity.
+    The current API (api.minimax.chat/v1/text_to_speech) uses a simple payload
+    and returns raw audio bytes directly (Content-Type: audio/mpeg), unlike
+    the deprecated v1/t2a_v2 endpoint which returned JSON with hex-encoded audio.
 
     Args:
         text: Text to convert (max 10,000 characters).
@@ -947,35 +948,12 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
     mm_config = tts_config.get("minimax", {})
     model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
     voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
-    speed = mm_config.get("speed", tts_config.get("speed", 1))
-    vol = mm_config.get("vol", 1)
-    pitch = mm_config.get("pitch", 0)
     base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
 
-    # Determine audio format from output extension
-    if output_path.endswith(".wav"):
-        audio_format = "wav"
-    elif output_path.endswith(".flac"):
-        audio_format = "flac"
-    else:
-        audio_format = "mp3"
-
     payload = {
         "model": model,
         "text": text,
-        "stream": False,
-        "voice_setting": {
-            "voice_id": voice_id,
-            "speed": speed,
-            "vol": vol,
-            "pitch": pitch,
-        },
-        "audio_setting": {
-            "sample_rate": 32000,
-            "bitrate": 128000,
-            "format": audio_format,
-            "channel": 1,
-        },
+        "voice_id": voice_id,
     }
 
     headers = {
@@ -984,9 +962,25 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
     }
 
     response = requests.post(base_url, json=payload, headers=headers, timeout=60)
-    response.raise_for_status()
 
-    result = response.json()
+    content_type = response.headers.get("Content-Type", "")
+
+    if "audio/" in content_type:
+        # New API: returns raw audio directly
+        with open(output_path, "wb") as f:
+            f.write(response.content)
+        return output_path
+
+    # Legacy / fallback: try parsing as JSON with hex-encoded audio
+    try:
+        result = response.json()
+    except Exception:
+        response.raise_for_status()
+        raise RuntimeError(
+            f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
+            f"({len(response.content)} bytes)"
+        )
+
     base_resp = result.get("base_resp", {})
     status_code = base_resp.get("status_code", -1)
 
@@ -998,7 +992,7 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
     if not hex_audio:
         raise RuntimeError("MiniMax TTS returned empty audio data")
 
-    # MiniMax returns hex-encoded audio (not base64)
+    # Legacy: hex-encoded audio
     audio_bytes = bytes.fromhex(hex_audio)
 
     with open(output_path, "wb") as f: