mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(tts): update MiniMax API endpoint to v1/text_to_speech
MiniMax deprecated the old v1/t2a_v2 endpoint (api.minimax.io) and
moved to v1/text_to_speech (api.minimax.chat). The new API:
- Uses a flat payload: {model, text, voice_id} instead of nested
voice_setting / audio_setting objects
- Returns raw audio bytes (Content-Type: audio/mpeg) instead of
JSON with hex-encoded audio
- Uses model 'speech-01' instead of 'speech-2.8-hd'
- Defaults voice_id to 'female-shaonv' (for Chinese TTS) instead of
'English_Graceful_Lady'
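The implementation inspects the response Content-Type to handle both
old and new API response formats, maintaining backward compatibility on
the response side for any users who manually configured the legacy
base_url. Note that the request payload is always sent in the new flat
format.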
This commit is contained in:
parent
75bce317a3
commit
6875471916
2 changed files with 47 additions and 52 deletions
|
|
@ -110,7 +110,7 @@ class TestOpenaiTtsSpeed:
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# MiniMax TTS speed (global fallback wired)
|
# MiniMax TTS (new API: raw audio, no speed/voice_setting)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
class TestMinimaxTtsSpeed:
|
class TestMinimaxTtsSpeed:
|
||||||
|
|
@ -118,28 +118,29 @@ class TestMinimaxTtsSpeed:
|
||||||
monkeypatch.setenv("MINIMAX_API_KEY", "test-key")
|
monkeypatch.setenv("MINIMAX_API_KEY", "test-key")
|
||||||
mock_response = MagicMock()
|
mock_response = MagicMock()
|
||||||
mock_response.status_code = 200
|
mock_response.status_code = 200
|
||||||
mock_response.json.return_value = {
|
mock_response.headers = {"Content-Type": "audio/mpeg"}
|
||||||
"data": {"audio": "deadbeef"},
|
mock_response.content = b"\x00\x01\x02\x03"
|
||||||
"base_resp": {"status_code": 0, "status_msg": "success"},
|
|
||||||
"extra_info": {"audio_size": 8},
|
|
||||||
}
|
|
||||||
|
|
||||||
# requests is imported locally inside _generate_minimax_tts
|
# requests is imported locally inside _generate_minimax_tts
|
||||||
with patch("requests.post", return_value=mock_response) as mock_post:
|
with patch("requests.post", return_value=mock_response) as mock_post:
|
||||||
from tools.tts_tool import _generate_minimax_tts
|
from tools.tts_tool import _generate_minimax_tts
|
||||||
_generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), tts_config)
|
output = _generate_minimax_tts("Hello", str(tmp_path / "out.mp3"), tts_config)
|
||||||
return mock_post
|
return mock_post, output
|
||||||
|
|
||||||
def test_global_speed_fallback(self, tmp_path, monkeypatch):
|
def test_simple_payload(self, tmp_path, monkeypatch):
|
||||||
"""Global tts.speed used when minimax.speed not set."""
|
"""New API uses flat payload with model, text, voice_id."""
|
||||||
mock_post = self._run({"speed": 1.5}, tmp_path, monkeypatch)
|
mock_post, _ = self._run({}, tmp_path, monkeypatch)
|
||||||
payload = mock_post.call_args[1]["json"]
|
payload = mock_post.call_args[1]["json"]
|
||||||
assert payload["voice_setting"]["speed"] == 1.5
|
assert "model" in payload
|
||||||
|
assert "text" in payload
|
||||||
|
assert "voice_id" in payload
|
||||||
|
assert "voice_setting" not in payload
|
||||||
|
assert "audio_setting" not in payload
|
||||||
|
assert "stream" not in payload
|
||||||
|
|
||||||
def test_provider_speed_overrides_global(self, tmp_path, monkeypatch):
|
def test_writes_raw_audio(self, tmp_path, monkeypatch):
|
||||||
"""tts.minimax.speed takes precedence over tts.speed."""
|
"""New API returns raw bytes written directly to file."""
|
||||||
mock_post = self._run(
|
_, output = self._run({}, tmp_path, monkeypatch)
|
||||||
{"speed": 1.5, "minimax": {"speed": 2.0}}, tmp_path, monkeypatch
|
assert output == str(tmp_path / "out.mp3")
|
||||||
)
|
with open(output, "rb") as f:
|
||||||
payload = mock_post.call_args[1]["json"]
|
assert f.read() == b"\x00\x01\x02\x03"
|
||||||
assert payload["voice_setting"]["speed"] == 2.0
|
|
||||||
|
|
|
||||||
|
|
@ -136,9 +136,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper"
|
||||||
DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality
|
DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality
|
||||||
DEFAULT_OPENAI_VOICE = "alloy"
|
DEFAULT_OPENAI_VOICE = "alloy"
|
||||||
DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
|
DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
|
||||||
DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
|
DEFAULT_MINIMAX_MODEL = "speech-01"
|
||||||
DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady"
|
DEFAULT_MINIMAX_VOICE_ID = "female-shaonv"
|
||||||
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2"
|
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.chat/v1/text_to_speech"
|
||||||
DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
|
DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
|
||||||
DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral
|
DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral
|
||||||
DEFAULT_XAI_VOICE_ID = "eve"
|
DEFAULT_XAI_VOICE_ID = "eve"
|
||||||
|
|
@ -925,10 +925,11 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Generate audio using MiniMax TTS API.
|
Generate audio using MiniMax TTS API (v1/text_to_speech).
|
||||||
|
|
||||||
MiniMax returns hex-encoded audio data. Supports streaming (SSE) and
|
The current API (api.minimax.chat/v1/text_to_speech) uses a simple payload
|
||||||
non-streaming modes. This implementation uses non-streaming for simplicity.
|
and returns raw audio bytes directly (Content-Type: audio/mpeg), unlike
|
||||||
|
the deprecated v1/t2a_v2 endpoint which returned JSON with hex-encoded audio.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text to convert (max 10,000 characters).
|
text: Text to convert (max 10,000 characters).
|
||||||
|
|
@ -947,35 +948,12 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
||||||
mm_config = tts_config.get("minimax", {})
|
mm_config = tts_config.get("minimax", {})
|
||||||
model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
|
model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
|
||||||
voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
|
voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
|
||||||
speed = mm_config.get("speed", tts_config.get("speed", 1))
|
|
||||||
vol = mm_config.get("vol", 1)
|
|
||||||
pitch = mm_config.get("pitch", 0)
|
|
||||||
base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
|
base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
|
||||||
|
|
||||||
# Determine audio format from output extension
|
|
||||||
if output_path.endswith(".wav"):
|
|
||||||
audio_format = "wav"
|
|
||||||
elif output_path.endswith(".flac"):
|
|
||||||
audio_format = "flac"
|
|
||||||
else:
|
|
||||||
audio_format = "mp3"
|
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": model,
|
"model": model,
|
||||||
"text": text,
|
"text": text,
|
||||||
"stream": False,
|
|
||||||
"voice_setting": {
|
|
||||||
"voice_id": voice_id,
|
"voice_id": voice_id,
|
||||||
"speed": speed,
|
|
||||||
"vol": vol,
|
|
||||||
"pitch": pitch,
|
|
||||||
},
|
|
||||||
"audio_setting": {
|
|
||||||
"sample_rate": 32000,
|
|
||||||
"bitrate": 128000,
|
|
||||||
"format": audio_format,
|
|
||||||
"channel": 1,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
|
|
@ -984,9 +962,25 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
||||||
}
|
}
|
||||||
|
|
||||||
response = requests.post(base_url, json=payload, headers=headers, timeout=60)
|
response = requests.post(base_url, json=payload, headers=headers, timeout=60)
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
|
content_type = response.headers.get("Content-Type", "")
|
||||||
|
|
||||||
|
if "audio/" in content_type:
|
||||||
|
# New API: returns raw audio directly
|
||||||
|
with open(output_path, "wb") as f:
|
||||||
|
f.write(response.content)
|
||||||
|
return output_path
|
||||||
|
|
||||||
|
# Legacy / fallback: try parsing as JSON with hex-encoded audio
|
||||||
|
try:
|
||||||
result = response.json()
|
result = response.json()
|
||||||
|
except Exception:
|
||||||
|
response.raise_for_status()
|
||||||
|
raise RuntimeError(
|
||||||
|
f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
|
||||||
|
f"({len(response.content)} bytes)"
|
||||||
|
)
|
||||||
|
|
||||||
base_resp = result.get("base_resp", {})
|
base_resp = result.get("base_resp", {})
|
||||||
status_code = base_resp.get("status_code", -1)
|
status_code = base_resp.get("status_code", -1)
|
||||||
|
|
||||||
|
|
@ -998,7 +992,7 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
||||||
if not hex_audio:
|
if not hex_audio:
|
||||||
raise RuntimeError("MiniMax TTS returned empty audio data")
|
raise RuntimeError("MiniMax TTS returned empty audio data")
|
||||||
|
|
||||||
# MiniMax returns hex-encoded audio (not base64)
|
# Legacy: hex-encoded audio
|
||||||
audio_bytes = bytes.fromhex(hex_audio)
|
audio_bytes = bytes.fromhex(hex_audio)
|
||||||
|
|
||||||
with open(output_path, "wb") as f:
|
with open(output_path, "wb") as f:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue