mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-25 05:52:34 +00:00
fix(tts): update MiniMax default model to speech-02 and correct API endpoint
The MiniMax TTS defaults were outdated:
- DEFAULT_MINIMAX_MODEL was 'speech-01', but MiniMax now uses 'speech-02'.
- DEFAULT_MINIMAX_BASE_URL was 'https://api.minimax.chat/v1/text_to_speech', which no longer works; the correct endpoint is 'https://api.minimaxi.com/v1/t2a_v2'.

Users who configured tts.provider: minimax were getting model-not-supported errors because the hardcoded defaults did not match available API permissions.
This commit is contained in:
parent
6122a79aab
commit
c875c0dc11
1 changed file with 79 additions and 41 deletions
|
|
@ -159,9 +159,9 @@ DEFAULT_KITTENTTS_VOICE = "Jasper"
|
||||||
DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality
|
DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality
|
||||||
DEFAULT_OPENAI_VOICE = "alloy"
|
DEFAULT_OPENAI_VOICE = "alloy"
|
||||||
DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
|
DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
|
||||||
DEFAULT_MINIMAX_MODEL = "speech-01"
|
DEFAULT_MINIMAX_MODEL = "speech-02"
|
||||||
DEFAULT_MINIMAX_VOICE_ID = "female-shaonv"
|
DEFAULT_MINIMAX_VOICE_ID = "female-shaonv"
|
||||||
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.chat/v1/text_to_speech"
|
DEFAULT_MINIMAX_BASE_URL = "https://api.minimaxi.com/v1/t2a_v2"
|
||||||
DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
|
DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
|
||||||
DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral
|
DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral
|
||||||
DEFAULT_XAI_VOICE_ID = "eve"
|
DEFAULT_XAI_VOICE_ID = "eve"
|
||||||
|
|
@ -960,11 +960,11 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
||||||
"""
|
"""
|
||||||
Generate audio using MiniMax TTS API (v1/text_to_speech).
|
Generate audio using MiniMax TTS API.
|
||||||
|
|
||||||
The current API (api.minimax.chat/v1/text_to_speech) uses a simple payload
|
Supports two endpoints:
|
||||||
and returns raw audio bytes directly (Content-Type: audio/mpeg), unlike
|
- v1/text_to_speech: simple payload, returns raw audio (Content-Type: audio/mpeg)
|
||||||
the deprecated v1/t2a_v2 endpoint which returned JSON with hex-encoded audio.
|
- v1/t2a_v2: nested voice_setting/audio_setting, returns JSON with hex-encoded audio
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text to convert (max 10,000 characters).
|
text: Text to convert (max 10,000 characters).
|
||||||
|
|
@ -984,56 +984,94 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
||||||
model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
|
model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
|
||||||
voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
|
voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
|
||||||
base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
|
base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
|
||||||
|
speed = mm_config.get("speed", 1.0)
|
||||||
payload = {
|
vol = mm_config.get("vol", 1.0)
|
||||||
"model": model,
|
pitch = mm_config.get("pitch", 0)
|
||||||
"text": text,
|
emotion = mm_config.get("emotion", "neutral")
|
||||||
"voice_id": voice_id,
|
sample_rate = mm_config.get("sample_rate", 32000)
|
||||||
}
|
bitrate = mm_config.get("bitrate", 128000)
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"Authorization": f"Bearer {api_key}",
|
"Authorization": f"Bearer {api_key}",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Detect endpoint from URL
|
||||||
|
is_t2a_v2 = "t2a_v2" in base_url
|
||||||
|
|
||||||
|
if is_t2a_v2:
|
||||||
|
# t2a_v2 endpoint: nested voice_setting/audio_setting structure
|
||||||
|
payload = {
|
||||||
|
"model": model,
|
||||||
|
"text": text,
|
||||||
|
"voice_setting": {
|
||||||
|
"voice_id": voice_id,
|
||||||
|
"speed": speed,
|
||||||
|
"vol": vol,
|
||||||
|
"pitch": pitch,
|
||||||
|
"emotion": emotion,
|
||||||
|
},
|
||||||
|
"audio_setting": {
|
||||||
|
"sample_rate": sample_rate,
|
||||||
|
"bitrate": bitrate,
|
||||||
|
"format": "mp3",
|
||||||
|
"channel": 1,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# text_to_speech endpoint: flat payload
|
||||||
|
payload = {
|
||||||
|
"model": model,
|
||||||
|
"text": text,
|
||||||
|
"voice_id": voice_id,
|
||||||
|
}
|
||||||
|
|
||||||
response = requests.post(base_url, json=payload, headers=headers, timeout=60)
|
response = requests.post(base_url, json=payload, headers=headers, timeout=60)
|
||||||
|
|
||||||
content_type = response.headers.get("Content-Type", "")
|
if is_t2a_v2:
|
||||||
|
# t2a_v2 returns JSON with hex-encoded audio
|
||||||
|
result = response.json()
|
||||||
|
base_resp = result.get("base_resp", {})
|
||||||
|
status_code = base_resp.get("status_code", -1)
|
||||||
|
|
||||||
if "audio/" in content_type:
|
if status_code != 0:
|
||||||
# New API: returns raw audio directly
|
status_msg = base_resp.get("status_msg", "unknown error")
|
||||||
|
raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
|
||||||
|
|
||||||
|
hex_audio = result.get("data", {}).get("audio", "")
|
||||||
|
if not hex_audio:
|
||||||
|
raise RuntimeError("MiniMax TTS returned empty audio data")
|
||||||
|
|
||||||
|
audio_bytes = bytes.fromhex(hex_audio)
|
||||||
with open(output_path, "wb") as f:
|
with open(output_path, "wb") as f:
|
||||||
f.write(response.content)
|
f.write(audio_bytes)
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
# Legacy / fallback: try parsing as JSON with hex-encoded audio
|
else:
|
||||||
try:
|
# text_to_speech returns raw audio directly
|
||||||
result = response.json()
|
content_type = response.headers.get("Content-Type", "")
|
||||||
except Exception:
|
|
||||||
response.raise_for_status()
|
|
||||||
raise RuntimeError(
|
|
||||||
f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
|
|
||||||
f"({len(response.content)} bytes)"
|
|
||||||
)
|
|
||||||
|
|
||||||
base_resp = result.get("base_resp", {})
|
if "audio/" in content_type:
|
||||||
status_code = base_resp.get("status_code", -1)
|
with open(output_path, "wb") as f:
|
||||||
|
f.write(response.content)
|
||||||
|
return output_path
|
||||||
|
|
||||||
if status_code != 0:
|
# Fallback: try parsing as JSON
|
||||||
status_msg = base_resp.get("status_msg", "unknown error")
|
try:
|
||||||
raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
|
result = response.json()
|
||||||
|
base_resp = result.get("base_resp", {})
|
||||||
|
status_code = base_resp.get("status_code", -1)
|
||||||
|
if status_code != 0:
|
||||||
|
status_msg = base_resp.get("status_msg", "unknown error")
|
||||||
|
raise RuntimeError(f"MiniMax TTS API error (code {status_code}): {status_msg}")
|
||||||
|
except Exception:
|
||||||
|
response.raise_for_status()
|
||||||
|
raise RuntimeError(
|
||||||
|
f"MiniMax TTS returned unexpected Content-Type '{content_type}' "
|
||||||
|
f"({len(response.content)} bytes)"
|
||||||
|
)
|
||||||
|
|
||||||
hex_audio = result.get("data", {}).get("audio", "")
|
raise RuntimeError("MiniMax TTS returned no audio data")
|
||||||
if not hex_audio:
|
|
||||||
raise RuntimeError("MiniMax TTS returned empty audio data")
|
|
||||||
|
|
||||||
# Legacy: hex-encoded audio
|
|
||||||
audio_bytes = bytes.fromhex(hex_audio)
|
|
||||||
|
|
||||||
with open(output_path, "wb") as f:
|
|
||||||
f.write(audio_bytes)
|
|
||||||
|
|
||||||
return output_path
|
|
||||||
|
|
||||||
|
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue