From 68754719165265e3dc152f10cabf9f46a4b45122 Mon Sep 17 00:00:00 2001 From: lhysdl Date: Sun, 26 Apr 2026 16:46:09 +0800 Subject: [PATCH] fix(tts): update MiniMax API endpoint to v1/text_to_speech MiniMax deprecated the old v1/t2a_v2 endpoint (api.minimax.io) and moved to v1/text_to_speech (api.minimax.chat). The new API: - Uses a flat payload: {model, text, voice_id} instead of nested voice_setting / audio_setting objects - Returns raw audio bytes (Content-Type: audio/mpeg) instead of JSON with hex-encoded audio - Uses model 'speech-01' instead of 'speech-2.8-hd' - Updated default voice_id to 'female-shaonv' for Chinese TTS The implementation detects Content-Type to handle both old and new API responses, maintaining backward compatibility for any users who manually configured the legacy base_url. --- tests/tools/test_tts_speed.py | 39 ++++++++++++----------- tools/tts_tool.py | 60 ++++++++++++++++------------------- 2 files changed, 47 insertions(+), 52 deletions(-) diff --git a/tests/tools/test_tts_speed.py b/tests/tools/test_tts_speed.py index 7622a7f622..8a3866aaa8 100644 --- a/tests/tools/test_tts_speed.py +++ b/tests/tools/test_tts_speed.py @@ -110,7 +110,7 @@ class TestOpenaiTtsSpeed: # --------------------------------------------------------------------------- -# MiniMax TTS speed (global fallback wired) +# MiniMax TTS (new API: raw audio, no speed/voice_setting) # --------------------------------------------------------------------------- class TestMinimaxTtsSpeed: @@ -118,28 +118,29 @@ class TestMinimaxTtsSpeed: monkeypatch.setenv("MINIMAX_API_KEY", "test-key") mock_response = MagicMock() mock_response.status_code = 200 - mock_response.json.return_value = { - "data": {"audio": "deadbeef"}, - "base_resp": {"status_code": 0, "status_msg": "success"}, - "extra_info": {"audio_size": 8}, - } + mock_response.headers = {"Content-Type": "audio/mpeg"} + mock_response.content = b"\x00\x01\x02\x03" # requests is imported locally inside _generate_minimax_tts with 
patch("requests.post", return_value=mock_response) as mock_post: from tools.tts_tool import _generate_minimax_tts - _generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), tts_config) - return mock_post + output = _generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), tts_config) + return mock_post, output - def test_global_speed_fallback(self, tmp_path, monkeypatch): - """Global tts.speed used when minimax.speed not set.""" - mock_post = self._run({"speed": 1.5}, tmp_path, monkeypatch) + def test_simple_payload(self, tmp_path, monkeypatch): + """New API uses flat payload with model, text, voice_id.""" + mock_post, _ = self._run({}, tmp_path, monkeypatch) payload = mock_post.call_args[1]["json"] - assert payload["voice_setting"]["speed"] == 1.5 + assert "model" in payload + assert "text" in payload + assert "voice_id" in payload + assert "voice_setting" not in payload + assert "audio_setting" not in payload + assert "stream" not in payload - def test_provider_speed_overrides_global(self, tmp_path, monkeypatch): - """tts.minimax.speed takes precedence over tts.speed.""" - mock_post = self._run( - {"speed": 1.5, "minimax": {"speed": 2.0}}, tmp_path, monkeypatch - ) - payload = mock_post.call_args[1]["json"] - assert payload["voice_setting"]["speed"] == 2.0 + def test_writes_raw_audio(self, tmp_path, monkeypatch): + """New API returns raw bytes written directly to file.""" + _, output = self._run({}, tmp_path, monkeypatch) + assert output == str(tmp_path / "out.mp3") + with open(output, "rb") as f: + assert f.read() == b"\x00\x01\x02\x03" diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 7473b32a1d..8b82e1665b 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -136,9 +136,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper" DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality DEFAULT_OPENAI_VOICE = "alloy" DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1" -DEFAULT_MINIMAX_MODEL = "speech-2.8-hd" -DEFAULT_MINIMAX_VOICE_ID = 
"English_Graceful_Lady" -DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2" +DEFAULT_MINIMAX_MODEL = "speech-01" +DEFAULT_MINIMAX_VOICE_ID = "female-shaonv" +DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.chat/v1/text_to_speech" DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603" DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral DEFAULT_XAI_VOICE_ID = "eve" @@ -925,10 +925,11 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) - # =========================================================================== def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: """ - Generate audio using MiniMax TTS API. + Generate audio using MiniMax TTS API (v1/text_to_speech). - MiniMax returns hex-encoded audio data. Supports streaming (SSE) and - non-streaming modes. This implementation uses non-streaming for simplicity. + The current API (api.minimax.chat/v1/text_to_speech) uses a simple payload + and returns raw audio bytes directly (Content-Type: audio/mpeg), unlike + the deprecated v1/t2a_v2 endpoint which returned JSON with hex-encoded audio. Args: text: Text to convert (max 10,000 characters). 
@@ -947,35 +948,12 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any mm_config = tts_config.get("minimax", {}) model = mm_config.get("model", DEFAULT_MINIMAX_MODEL) voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID) - speed = mm_config.get("speed", tts_config.get("speed", 1)) - vol = mm_config.get("vol", 1) - pitch = mm_config.get("pitch", 0) base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL) - # Determine audio format from output extension - if output_path.endswith(".wav"): - audio_format = "wav" - elif output_path.endswith(".flac"): - audio_format = "flac" - else: - audio_format = "mp3" - payload = { "model": model, "text": text, - "stream": False, - "voice_setting": { - "voice_id": voice_id, - "speed": speed, - "vol": vol, - "pitch": pitch, - }, - "audio_setting": { - "sample_rate": 32000, - "bitrate": 128000, - "format": audio_format, - "channel": 1, - }, + "voice_id": voice_id, } headers = { @@ -984,9 +962,25 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any } response = requests.post(base_url, json=payload, headers=headers, timeout=60) - response.raise_for_status() - result = response.json() + content_type = response.headers.get("Content-Type", "") + + if "audio/" in content_type: + # New API: returns raw audio directly + with open(output_path, "wb") as f: + f.write(response.content) + return output_path + + # Legacy / fallback: try parsing as JSON with hex-encoded audio + try: + result = response.json() + except Exception: + response.raise_for_status() + raise RuntimeError( + f"MiniMax TTS returned unexpected Content-Type '{content_type}' " + f"({len(response.content)} bytes)" + ) + base_resp = result.get("base_resp", {}) status_code = base_resp.get("status_code", -1) @@ -998,7 +992,7 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any if not hex_audio: raise RuntimeError("MiniMax TTS returned empty audio data") - # MiniMax returns 
hex-encoded audio (not base64) + # Legacy endpoint: audio is hex-encoded (not base64) audio_bytes = bytes.fromhex(hex_audio) with open(output_path, "wb") as f: