From 7f08cb59417b19d70a7bc82e05f7bbedeb8a4f82 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Wed, 13 May 2026 22:01:41 -0700
Subject: [PATCH] fix(tts): align MiniMax TTS defaults with current API and add
 GroupId support

Follow-up on @pty819's t2a_v2 endpoint fix:

- Default model: speech-02 -> speech-02-hd (bare 'speech-02' is not in the
  supported enum; t2a_v2 rejects it with 400). Official enum: speech-01-hd,
  speech-01-turbo, speech-02-hd, speech-02-turbo, speech-2.6-hd/turbo,
  speech-2.8-hd/turbo.
- Default voice: female-shaonv -> English_expressive_narrator. The
  legacy speech-01-series short ID doesn't resolve cleanly on the
  speech-02+ models that are now the default.
- Default base URL: api.minimaxi.com -> api.minimax.io (matches the
  canonical host in the published docs; api-uw.minimax.io is the
  reduced-latency alt).
- Add GroupId support via tts.minimax.group_id config or MINIMAX_GROUP_ID
  env var. Some MiniMax accounts scope TTS requests by group; without it,
  requests 401. Only appended when not already in the user's base_url.

Tests rewritten to cover both the default t2a_v2 path (hex-encoded audio
in JSON, nested voice_setting/audio_setting) and the legacy
text_to_speech path (raw audio bytes, flat payload). Adds coverage for
GroupId config/env wiring and error surfacing.

Also adds AUTHOR_MAP entry for pty819's GitHub-noreply email.
---
 scripts/release.py            |   1 +
 tests/tools/test_tts_speed.py | 130 +++++++++++++++++++++++++++++-----
 tools/tts_tool.py             |  18 ++++-
 3 files changed, 128 insertions(+), 21 deletions(-)

diff --git a/scripts/release.py b/scripts/release.py
index afe864d2e94..8983408f11a 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -223,6 +223,7 @@ AUTHOR_MAP = {
     "hitesh@gmail.com": "htsh",
     "pty819@outlook.com": "pty819",
     "pty819@users.noreply.github.com": "pty819",
+    "14341805+pty819@users.noreply.github.com": "pty819",
     "517024110@qq.com": "chennest",
     # Curator fixes (Apr 30 2026)
     "yuxiangl490@gmail.com": "y0shua1ee",
diff --git a/tests/tools/test_tts_speed.py b/tests/tools/test_tts_speed.py
index 8a3866aaa8a..d9274bb84d7 100644
--- a/tests/tools/test_tts_speed.py
+++ b/tests/tools/test_tts_speed.py
@@ -8,7 +8,12 @@ import pytest
 
 @pytest.fixture(autouse=True)
 def clean_env(monkeypatch):
-    for key in ("OPENAI_API_KEY", "MINIMAX_API_KEY", "HERMES_SESSION_PLATFORM"):
+    for key in (
+        "OPENAI_API_KEY",
+        "MINIMAX_API_KEY",
+        "MINIMAX_GROUP_ID",
+        "HERMES_SESSION_PLATFORM",
+    ):
         monkeypatch.delenv(key, raising=False)
 
 
@@ -110,37 +115,126 @@ class TestOpenaiTtsSpeed:
 
 
 # ---------------------------------------------------------------------------
-# MiniMax TTS (new API: raw audio, no speed/voice_setting)
+# MiniMax TTS (t2a_v2 endpoint: nested voice_setting/audio_setting,
+# JSON response with hex-encoded audio.  Falls back to the legacy
+# text_to_speech endpoint shape when the base_url points at it.)
 # ---------------------------------------------------------------------------
 
-class TestMinimaxTtsSpeed:
-    def _run(self, tts_config, tmp_path, monkeypatch):
-        monkeypatch.setenv("MINIMAX_API_KEY", "test-key")
-        mock_response = MagicMock()
-        mock_response.status_code = 200
-        mock_response.headers = {"Content-Type": "audio/mpeg"}
-        mock_response.content = b"\x00\x01\x02\x03"
 
-        # requests is imported locally inside _generate_minimax_tts
-        with patch("requests.post", return_value=mock_response) as mock_post:
+def _hex_response(payload_audio: bytes = b"\x00\x01\x02\x03"):
+    """Build a mock response shaped like a successful t2a_v2 reply."""
+    mock_response = MagicMock()
+    mock_response.status_code = 200
+    mock_response.headers = {"Content-Type": "application/json"}
+    mock_response.json.return_value = {
+        "data": {"audio": payload_audio.hex(), "status": 2},
+        "base_resp": {"status_code": 0, "status_msg": "success"},
+    }
+    return mock_response
+
+
+class TestMinimaxTtsT2aV2:
+    """Default path: base_url contains 't2a_v2'."""
+
+    def _run(self, tts_config, tmp_path, monkeypatch, response=None):
+        monkeypatch.setenv("MINIMAX_API_KEY", "test-key")
+        resp = response if response is not None else _hex_response()
+        with patch("requests.post", return_value=resp) as mock_post:
             from tools.tts_tool import _generate_minimax_tts
             output = _generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), tts_config)
         return mock_post, output
 
-    def test_simple_payload(self, tmp_path, monkeypatch):
-        """New API uses flat payload with model, text, voice_id."""
+    def test_nested_payload(self, tmp_path, monkeypatch):
+        """Default endpoint uses nested voice_setting / audio_setting."""
+        mock_post, _ = self._run({}, tmp_path, monkeypatch)
+        payload = mock_post.call_args[1]["json"]
+        assert payload["model"] == "speech-02-hd"
+        assert payload["text"] == "Hello"
+        assert "voice_setting" in payload
+        assert payload["voice_setting"]["voice_id"] == "English_expressive_narrator"
+        assert "audio_setting" in payload
+        assert payload["audio_setting"]["format"] == "mp3"
+        # Don't send flat top-level voice_id alongside nested voice_setting.
+        assert "voice_id" not in payload
+
+    def test_decodes_hex_audio(self, tmp_path, monkeypatch):
+        """t2a_v2 hex-encoded audio is decoded and written verbatim."""
+        _, output = self._run({}, tmp_path, monkeypatch)
+        with open(output, "rb") as f:
+            assert f.read() == b"\x00\x01\x02\x03"
+
+    def test_default_url_is_t2a_v2(self, tmp_path, monkeypatch):
+        """Default base URL points at the live t2a_v2 endpoint."""
+        mock_post, _ = self._run({}, tmp_path, monkeypatch)
+        url = mock_post.call_args[0][0]
+        assert "t2a_v2" in url
+        assert "api.minimax.io" in url
+
+    def test_group_id_from_config(self, tmp_path, monkeypatch):
+        """group_id from config attaches as ?GroupId=<id>."""
+        mock_post, _ = self._run({"minimax": {"group_id": "G123"}}, tmp_path, monkeypatch)
+        url = mock_post.call_args[0][0]
+        assert "GroupId=G123" in url
+
+    def test_group_id_from_env(self, tmp_path, monkeypatch):
+        """MINIMAX_GROUP_ID env var attaches as ?GroupId=<id>."""
+        monkeypatch.setenv("MINIMAX_GROUP_ID", "G456")
+        mock_post, _ = self._run({}, tmp_path, monkeypatch)
+        url = mock_post.call_args[0][0]
+        assert "GroupId=G456" in url
+
+    def test_group_id_already_in_url_left_alone(self, tmp_path, monkeypatch):
+        """If user already set GroupId in base_url, don't double-append it."""
+        cfg = {"minimax": {
+            "base_url": "https://api.minimax.io/v1/t2a_v2?GroupId=PRESET",
+            "group_id": "IGNORED",
+        }}
+        mock_post, _ = self._run(cfg, tmp_path, monkeypatch)
+        url = mock_post.call_args[0][0]
+        assert url.count("GroupId=") == 1
+        assert "GroupId=PRESET" in url
+
+    def test_api_error_raises(self, tmp_path, monkeypatch):
+        """Non-zero base_resp.status_code surfaces as RuntimeError."""
+        resp = MagicMock()
+        resp.status_code = 200
+        resp.headers = {"Content-Type": "application/json"}
+        resp.json.return_value = {
+            "data": {"audio": "", "status": 1},
+            "base_resp": {"status_code": 2013, "status_msg": "invalid voice"},
+        }
+        with pytest.raises(RuntimeError, match="2013"):
+            self._run({}, tmp_path, monkeypatch, response=resp)
+
+
+class TestMinimaxTtsLegacyTextToSpeech:
+    """Legacy path: caller pins base_url to the old text_to_speech endpoint."""
+
+    LEGACY_URL = "https://api.minimax.chat/v1/text_to_speech"
+
+    def _run(self, tts_config, tmp_path, monkeypatch):
+        monkeypatch.setenv("MINIMAX_API_KEY", "test-key")
+        cfg = dict(tts_config)
+        cfg.setdefault("minimax", {})["base_url"] = self.LEGACY_URL
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.headers = {"Content-Type": "audio/mpeg"}
+        mock_response.content = b"\x00\x01\x02\x03"
+        with patch("requests.post", return_value=mock_response) as mock_post:
+            from tools.tts_tool import _generate_minimax_tts
+            output = _generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), cfg)
+        return mock_post, output
+
+    def test_flat_payload(self, tmp_path, monkeypatch):
+        """Legacy endpoint keeps the flat {model, text, voice_id} shape."""
         mock_post, _ = self._run({}, tmp_path, monkeypatch)
         payload = mock_post.call_args[1]["json"]
-        assert "model" in payload
-        assert "text" in payload
         assert "voice_id" in payload
         assert "voice_setting" not in payload
         assert "audio_setting" not in payload
-        assert "stream" not in payload
 
     def test_writes_raw_audio(self, tmp_path, monkeypatch):
-        """New API returns raw bytes written directly to file."""
+        """Legacy endpoint returns raw bytes written directly to file."""
         _, output = self._run({}, tmp_path, monkeypatch)
-        assert output == str(tmp_path / "out.mp3")
         with open(output, "rb") as f:
             assert f.read() == b"\x00\x01\x02\x03"
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index a0ea52a1d01..9f0d272dac0 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -159,9 +159,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper"
 DEFAULT_PIPER_VOICE = "en_US-lessac-medium"  # balanced size/quality
 DEFAULT_OPENAI_VOICE = "alloy"
 DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
-DEFAULT_MINIMAX_MODEL = "speech-02"
-DEFAULT_MINIMAX_VOICE_ID = "female-shaonv"
-DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com/v1/t2a_v2"
+DEFAULT_MINIMAX_MODEL = "speech-02-hd"
+DEFAULT_MINIMAX_VOICE_ID = "English_expressive_narrator"
+DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2"
 DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
 DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8"  # Paul - Neutral
 DEFAULT_XAI_VOICE_ID = "eve"
@@ -991,6 +991,18 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
     sample_rate = mm_config.get("sample_rate", 32000)
     bitrate = mm_config.get("bitrate", 128000)
 
+    # MiniMax accounts scope TTS requests by GroupId.  When present, the docs
+    # show it as a ?GroupId=<id> query param on the t2a_v2 URL.  Accept it
+    # from config or from the MINIMAX_GROUP_ID env var; only attach when the
+    # URL doesn't already carry one.
+    group_id = (
+        str(mm_config.get("group_id") or "").strip()
+        or (get_env_value("MINIMAX_GROUP_ID") or "").strip()
+    )
+    if group_id and "GroupId=" not in base_url:
+        sep = "&" if "?" in base_url else "?"
+        base_url = f"{base_url}{sep}GroupId={group_id}"
+
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer {api_key}",