diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 1ea3ba21c63..a0ea52a1d01 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -159,9 +159,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper" DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality DEFAULT_OPENAI_VOICE = "alloy" DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1" -DEFAULT_MINIMAX_MODEL = "speech-01" +DEFAULT_MINIMAX_MODEL = "speech-02" DEFAULT_MINIMAX_VOICE_ID = "female-shaonv" -DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.chat/v1/text_to_speech" +DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com/v1/t2a_v2" DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603" DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral DEFAULT_XAI_VOICE_ID = "eve" @@ -960,11 +960,11 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) - # =========================================================================== def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: """ - Generate audio using MiniMax TTS API (v1/text_to_speech). + Generate audio using MiniMax TTS API. - The current API (api.minimax.chat/v1/text_to_speech) uses a simple payload - and returns raw audio bytes directly (Content-Type: audio/mpeg), unlike - the deprecated v1/t2a_v2 endpoint which returned JSON with hex-encoded audio. + Supports two endpoints: + - v1/text_to_speech: simple payload, returns raw audio (Content-Type: audio/mpeg) + - v1/t2a_v2: nested voice_setting/audio_setting, returns JSON with hex-encoded audio Args: text: Text to convert (max 10,000 characters). @@ -984,56 +984,94 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any model = mm_config.get("model", DEFAULT_MINIMAX_MODEL) voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID) base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL) - - payload = { - "model": model, - "text": text, - "voice_id": voice_id, - } + speed = mm_config.get("speed", 1.0) + vol = mm_config.get("vol", 1.0) + pitch = mm_config.get("pitch", 0) + emotion = mm_config.get("emotion", "neutral") + sample_rate = mm_config.get("sample_rate", 32000) + bitrate = mm_config.get("bitrate", 128000) headers = { "Content-Type": "application/json", "Authorization": f"Bearer {api_key}", } + # Detect endpoint from URL + is_t2a_v2 = "t2a_v2" in base_url + + if is_t2a_v2: + # t2a_v2 endpoint: nested voice_setting/audio_setting structure + payload = { + "model": model, + "text": text, + "voice_setting": { + "voice_id": voice_id, + "speed": speed, + "vol": vol, + "pitch": pitch, + "emotion": emotion, + }, + "audio_setting": { + "sample_rate": sample_rate, + "bitrate": bitrate, + "format": "mp3", + "channel": 1, + }, + } + else: + # text_to_speech endpoint: flat payload + payload = { + "model": model, + "text": text, + "voice_id": voice_id, + } + response = requests.post(base_url, json=payload, headers=headers, timeout=60) - content_type = response.headers.get("Content-Type", "") + if is_t2a_v2: + # t2a_v2 returns JSON with hex-encoded audio + result = response.json() + base_resp = result.get("base_resp", {}) + status_code = base_resp.get("status_code", -1) - if "audio/" in content_type: - # New API: returns raw audio directly + if status_code != 0: + status_msg = base_resp.get("status_msg", "unknown error") + raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}") + + hex_audio = result.get("data", {}).get("audio", "") + if not hex_audio: + raise RuntimeError("MiniMax TTS returned empty audio data") + + audio_bytes = bytes.fromhex(hex_audio) with open(output_path, "wb") as f: - f.write(response.content) + f.write(audio_bytes) return output_path - # Legacy / fallback: try parsing as JSON with hex-encoded audio - try: - result = response.json() - except Exception: - response.raise_for_status() - raise RuntimeError( - f"MiniMax TTS returned unexpected Content-Type '{content_type}' " - f"({len(response.content)} bytes)" - ) + else: + # text_to_speech returns raw audio directly + content_type = response.headers.get("Content-Type", "") - base_resp = result.get("base_resp", {}) - status_code = base_resp.get("status_code", -1) + if "audio/" in content_type: + with open(output_path, "wb") as f: + f.write(response.content) + return output_path - if status_code != 0: - status_msg = base_resp.get("status_msg", "unknown error") - raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}") + # Fallback: try parsing as JSON + try: + result = response.json() + base_resp = result.get("base_resp", {}) + status_code = base_resp.get("status_code", -1) + if status_code != 0: + status_msg = base_resp.get("status_msg", "unknown error") + raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}") + except Exception: + response.raise_for_status() + raise RuntimeError( + f"MiniMax TTS returned unexpected Content-Type '{content_type}' " + f"({len(response.content)} bytes)" + ) - hex_audio = result.get("data", {}).get("audio", "") - if not hex_audio: - raise RuntimeError("MiniMax TTS returned empty audio data") - - # Legacy: hex-encoded audio - audio_bytes = bytes.fromhex(hex_audio) - - with open(output_path, "wb") as f: - f.write(audio_bytes) - - return output_path + raise RuntimeError("MiniMax TTS returned no audio data") # ===========================================================================