fix(tts): update MiniMax default model to speech-02 and correct API endpoint

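The MiniMax TTS defaults were outdated:

- DEFAULT_MINIMAX_MODEL was 'speech-01', but MiniMax now uses 'speech-02'.
- DEFAULT_MINIMAX_BASE_URL was 'https://api.minimax.chat/v1/text_to_speech', which no longer works; the correct endpoint is 'https://api.minimaxi.com/v1/t2a_v2'.

Users who configured tts.provider: minimax were getting model-not-supported errors because the hardcoded defaults did not match available API permissions.

Because the t2a_v2 endpoint expects a nested voice_setting/audio_setting payload and returns JSON with hex-encoded audio, _generate_minimax_tts now selects the request payload and response handling based on whether the configured base_url contains 't2a_v2'; the flat v1/text_to_speech payload and raw-audio handling remain available for other base URLs.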
2026-05-18 04:41:56 +00:00 · 2026-05-07 02:49:46 +08:00 · 2026-05-07 02:49:46 +08:00 · c875c0dc11
commit c875c0dc11
parent 6122a79aab
1 changed file with 79 additions and 41 deletions
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -159,9 +159,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper"
 DEFAULT_PIPER_VOICE = "en_US-lessac-medium"  # balanced size/quality
 DEFAULT_OPENAI_VOICE = "alloy"
 DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
-DEFAULT_MINIMAX_MODEL = "speech-01"
+DEFAULT_MINIMAX_MODEL = "speech-02"
 DEFAULT_MINIMAX_VOICE_ID = "female-shaonv"
-DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.chat/v1/text_to_speech"
+DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com/v1/t2a_v2"
 DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
 DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8"  # Paul - Neutral
 DEFAULT_XAI_VOICE_ID = "eve"
@@ -960,11 +960,11 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -
 # ===========================================================================
 def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """
-    Generate audio using MiniMax TTS API (v1/text_to_speech).
+    Generate audio using MiniMax TTS API.

-    The current API (api.minimax.chat/v1/text_to_speech) uses a simple payload
-    and returns raw audio bytes directly (Content-Type: audio/mpeg), unlike
-    the deprecated v1/t2a_v2 endpoint which returned JSON with hex-encoded audio.
+    Supports two endpoints:
+    - v1/text_to_speech: simple payload, returns raw audio (Content-Type: audio/mpeg)
+    - v1/t2a_v2: nested voice_setting/audio_setting, returns JSON with hex-encoded audio

    Args:
        text: Text to convert (max 10,000 characters).
@@ -984,56 +984,94 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
    model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
    voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
    base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
-
-    payload = {
-        "model": model,
-        "text": text,
-        "voice_id": voice_id,
-    }
+    speed = mm_config.get("speed", 1.0)
+    vol = mm_config.get("vol", 1.0)
+    pitch = mm_config.get("pitch", 0)
+    emotion = mm_config.get("emotion", "neutral")
+    sample_rate = mm_config.get("sample_rate", 32000)
+    bitrate = mm_config.get("bitrate", 128000)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

+    # Detect endpoint from URL
+    is_t2a_v2 = "t2a_v2" in base_url
+
+    if is_t2a_v2:
+        # t2a_v2 endpoint: nested voice_setting/audio_setting structure
+        payload = {
+            "model": model,
+            "text": text,
+            "voice_setting": {
+                "voice_id": voice_id,
+                "speed": speed,
+                "vol": vol,
+                "pitch": pitch,
+                "emotion": emotion,
+            },
+            "audio_setting": {
+                "sample_rate": sample_rate,
+                "bitrate": bitrate,
+                "format": "mp3",
+                "channel": 1,
+            },
+        }
+    else:
+        # text_to_speech endpoint: flat payload
+        payload = {
+            "model": model,
+            "text": text,
+            "voice_id": voice_id,
+        }
+
    response = requests.post(base_url, json=payload, headers=headers, timeout=60)

-    content_type = response.headers.get("Content-Type", "")
+    if is_t2a_v2:
+        # t2a_v2 returns JSON with hex-encoded audio
+        result = response.json()
+        base_resp = result.get("base_resp", {})
+        status_code = base_resp.get("status_code", -1)

-    if "audio/" in content_type:
-        # New API: returns raw audio directly
+        if status_code != 0:
+            status_msg = base_resp.get("status_msg", "unknown error")
+            raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
+
+        hex_audio = result.get("data", {}).get("audio", "")
+        if not hex_audio:
+            raise RuntimeError("MiniMax TTS returned empty audio data")
+
+        audio_bytes = bytes.fromhex(hex_audio)
        with open(output_path, "wb") as f:
-            f.write(response.content)
+            f.write(audio_bytes)
        return output_path

-    # Legacy / fallback: try parsing as JSON with hex-encoded audio
-    try:
-        result = response.json()
-    except Exception:
-        response.raise_for_status()
-        raise RuntimeError(
-            f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
-            f"({len(response.content)} bytes)"
-        )
+    else:
+        # text_to_speech returns raw audio directly
+        content_type = response.headers.get("Content-Type", "")

-    base_resp = result.get("base_resp", {})
-    status_code = base_resp.get("status_code", -1)
+        if "audio/" in content_type:
+            with open(output_path, "wb") as f:
+                f.write(response.content)
+            return output_path

-    if status_code != 0:
-        status_msg = base_resp.get("status_msg", "unknown error")
-        raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
+        # Fallback: try parsing as JSON
+        try:
+            result = response.json()
+            base_resp = result.get("base_resp", {})
+            status_code = base_resp.get("status_code", -1)
+            if status_code != 0:
+                status_msg = base_resp.get("status_msg", "unknown error")
+                raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
+        except Exception:
+            response.raise_for_status()
+            raise RuntimeError(
+                f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
+                f"({len(response.content)} bytes)"
+            )

-    hex_audio = result.get("data", {}).get("audio", "")
-    if not hex_audio:
-        raise RuntimeError("MiniMax TTS returned empty audio data")
-
-    # Legacy: hex-encoded audio
-    audio_bytes = bytes.fromhex(hex_audio)
-
-    with open(output_path, "wb") as f:
-        f.write(audio_bytes)
-
-    return output_path
+        raise RuntimeError("MiniMax TTS returned no audio data")


 # ===========================================================================