fix(tts): update MiniMax default model to speech-02 and correct API endpoint

The MiniMax TTS defaults were outdated: - DEFAULT_MINIMAX_MODEL was 'speech-01' but MiniMax now uses 'speech-02' - DEFAULT_MINIMAX_BASE_URL was 'https://api.minimax.chat/v1/text_to_speech' which no longer works; the correct endpoint is 'https://api.minimaxi.com/v1/t2a_v2' Users who configured tts.provider: minimax were getting model-not-supported errors because the hardcoded defaults did not match available API permissions.
2026-05-25 05:52:34 +00:00 · 2026-05-07 02:49:46 +08:00 · 2026-05-07 02:49:46 +08:00 · c875c0dc11
commit c875c0dc11
parent 6122a79aab
1 changed files with 79 additions and 41 deletions
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -159,9 +159,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper"
 DEFAULT_PIPER_VOICE = "en_US-lessac-medium"  # balanced size/quality
 DEFAULT_OPENAI_VOICE = "alloy"
 DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
-DEFAULT_MINIMAX_MODEL = "speech-01"
+DEFAULT_MINIMAX_MODEL = "speech-02"
 DEFAULT_MINIMAX_VOICE_ID = "female-shaonv"
-DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.chat/v1/text_to_speech"
+DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com/v1/t2a_v2"
 DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
 DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8"  # Paul - Neutral
 DEFAULT_XAI_VOICE_ID = "eve"
@ -960,11 +960,11 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -
 # ===========================================================================
 def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """
-    Generate audio using MiniMax TTS API (v1/text_to_speech).
+    Generate audio using MiniMax TTS API.
-    The current API (api.minimax.chat/v1/text_to_speech) uses a simple payload
+    Supports two endpoints:
-    and returns raw audio bytes directly (Content-Type: audio/mpeg), unlike
+    - v1/text_to_speech: simple payload, returns raw audio (Content-Type: audio/mpeg)
-    the deprecated v1/t2a_v2 endpoint which returned JSON with hex-encoded audio.
+    - v1/t2a_v2: nested voice_setting/audio_setting, returns JSON with hex-encoded audio
    Args:
        text: Text to convert (max 10,000 characters).
@ -984,56 +984,94 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
    model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
    voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
    base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
-
+    speed = mm_config.get("speed", 1.0)
-    payload = {
+    vol = mm_config.get("vol", 1.0)
-        "model": model,
+    pitch = mm_config.get("pitch", 0)
-        "text": text,
+    emotion = mm_config.get("emotion", "neutral")
-        "voice_id": voice_id,
+    sample_rate = mm_config.get("sample_rate", 32000)
-    }
+    bitrate = mm_config.get("bitrate", 128000)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    # Detect endpoint from URL
    is_t2a_v2 = "t2a_v2" in base_url
    if is_t2a_v2:
        # t2a_v2 endpoint: nested voice_setting/audio_setting structure
        payload = {
            "model": model,
            "text": text,
            "voice_setting": {
                "voice_id": voice_id,
                "speed": speed,
                "vol": vol,
                "pitch": pitch,
                "emotion": emotion,
            },
            "audio_setting": {
                "sample_rate": sample_rate,
                "bitrate": bitrate,
                "format": "mp3",
                "channel": 1,
            },
        }
    else:
        # text_to_speech endpoint: flat payload
        payload = {
            "model": model,
            "text": text,
            "voice_id": voice_id,
        }
    response = requests.post(base_url, json=payload, headers=headers, timeout=60)
-    content_type = response.headers.get("Content-Type", "")
+    if is_t2a_v2:
        # t2a_v2 returns JSON with hex-encoded audio
        result = response.json()
        base_resp = result.get("base_resp", {})
        status_code = base_resp.get("status_code", -1)
-    if "audio/" in content_type:
+        if status_code != 0:
-        # New API: returns raw audio directly
+            status_msg = base_resp.get("status_msg", "unknown error")
            raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
        hex_audio = result.get("data", {}).get("audio", "")
        if not hex_audio:
            raise RuntimeError("MiniMax TTS returned empty audio data")
        audio_bytes = bytes.fromhex(hex_audio)
        with open(output_path, "wb") as f:
-            f.write(response.content)
+            f.write(audio_bytes)
        return output_path
-    # Legacy / fallback: try parsing as JSON with hex-encoded audio
+    else:
-    try:
+        # text_to_speech returns raw audio directly
-        result = response.json()
+        content_type = response.headers.get("Content-Type", "")
    except Exception:
        response.raise_for_status()
        raise RuntimeError(
            f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
            f"({len(response.content)} bytes)"
        )
-    base_resp = result.get("base_resp", {})
+        if "audio/" in content_type:
-    status_code = base_resp.get("status_code", -1)
+            with open(output_path, "wb") as f:
                f.write(response.content)
            return output_path
-    if status_code != 0:
+        # Fallback: try parsing as JSON
-        status_msg = base_resp.get("status_msg", "unknown error")
+        try:
-        raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
+            result = response.json()
            base_resp = result.get("base_resp", {})
            status_code = base_resp.get("status_code", -1)
            if status_code != 0:
                status_msg = base_resp.get("status_msg", "unknown error")
                raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
        except Exception:
            response.raise_for_status()
            raise RuntimeError(
                f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
                f"({len(response.content)} bytes)"
            )
-    hex_audio = result.get("data", {}).get("audio", "")
+        raise RuntimeError("MiniMax TTS returned no audio data")
    if not hex_audio:
        raise RuntimeError("MiniMax TTS returned empty audio data")
    # Legacy: hex-encoded audio
    audio_bytes = bytes.fromhex(hex_audio)
    with open(output_path, "wb") as f:
        f.write(audio_bytes)
    return output_path
 # ===========================================================================