mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(tools): add Voxtral TTS provider (Mistral AI)
This commit is contained in:
parent
5a55d54ee2
commit
640441b865
11 changed files with 379 additions and 12 deletions
|
|
@ -588,7 +588,7 @@ platform_toolsets:
|
|||
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
|
||||
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
|
||||
# todo - todo (in-memory task planning, no deps)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX key)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
|
||||
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
|
||||
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
|
||||
#
|
||||
|
|
@ -617,7 +617,7 @@ platform_toolsets:
|
|||
# todo - Task planning and tracking for multi-step work
|
||||
# memory - Persistent memory across sessions (personal notes + user profile)
|
||||
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
|
||||
# cronjob - Schedule and manage automated tasks (CLI-only)
|
||||
# rl - RL training tools (Tinker-Atropos)
|
||||
#
|
||||
|
|
|
|||
|
|
@ -458,7 +458,7 @@ DEFAULT_CONFIG = {
|
|||
|
||||
# Text-to-speech configuration
|
||||
"tts": {
|
||||
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "neutts" (local)
|
||||
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local)
|
||||
"edge": {
|
||||
"voice": "en-US-AriaNeural",
|
||||
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
|
||||
|
|
@ -472,6 +472,10 @@ DEFAULT_CONFIG = {
|
|||
"voice": "alloy",
|
||||
# Voices: alloy, echo, fable, onyx, nova, shimmer
|
||||
},
|
||||
"mistral": {
|
||||
"model": "voxtral-mini-tts-2603",
|
||||
"voice_id": "c69964a6-ab8b-4f8a-9465-ec0925096ec8", # Paul - Neutral
|
||||
},
|
||||
"neutts": {
|
||||
"ref_audio": "", # Path to reference voice audio (empty = bundled default)
|
||||
"ref_text": "", # Path to reference voice transcript (empty = bundled default)
|
||||
|
|
@ -1016,6 +1020,13 @@ OPTIONAL_ENV_VARS = {
|
|||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"MISTRAL_API_KEY": {
|
||||
"description": "Mistral API key for Voxtral TTS and transcription (STT)",
|
||||
"prompt": "Mistral API key",
|
||||
"url": "https://console.mistral.ai/",
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
"GITHUB_TOKEN": {
|
||||
"description": "GitHub token for Skills Hub (higher API rate limits, skill publish)",
|
||||
"prompt": "GitHub Token",
|
||||
|
|
|
|||
|
|
@ -143,6 +143,7 @@ def _tts_label(current_provider: str) -> str:
|
|||
"openai": "OpenAI TTS",
|
||||
"elevenlabs": "ElevenLabs",
|
||||
"edge": "Edge TTS",
|
||||
"mistral": "Mistral Voxtral TTS",
|
||||
"neutts": "NeuTTS",
|
||||
}
|
||||
return mapping.get(current_provider or "edge", current_provider or "Edge TTS")
|
||||
|
|
@ -309,6 +310,7 @@ def get_nous_subscription_features(
|
|||
tts_current_provider in {"edge", "neutts"}
|
||||
or (tts_current_provider == "openai" and (managed_tts_available or direct_openai_tts))
|
||||
or (tts_current_provider == "elevenlabs" and direct_elevenlabs)
|
||||
or (tts_current_provider == "mistral" and bool(get_env_value("MISTRAL_API_KEY")))
|
||||
)
|
||||
tts_active = bool(tts_tool_enabled and tts_available)
|
||||
|
||||
|
|
|
|||
|
|
@ -557,6 +557,8 @@ def _print_setup_summary(config: dict, hermes_home):
|
|||
tool_status.append(("Text-to-Speech (OpenAI)", True, None))
|
||||
elif tts_provider == "minimax" and get_env_value("MINIMAX_API_KEY"):
|
||||
tool_status.append(("Text-to-Speech (MiniMax)", True, None))
|
||||
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
|
||||
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
|
||||
elif tts_provider == "neutts":
|
||||
try:
|
||||
import importlib.util
|
||||
|
|
@ -1044,6 +1046,7 @@ def _setup_tts_provider(config: dict):
|
|||
"elevenlabs": "ElevenLabs",
|
||||
"openai": "OpenAI TTS",
|
||||
"minimax": "MiniMax TTS",
|
||||
"mistral": "Mistral Voxtral TTS",
|
||||
"neutts": "NeuTTS",
|
||||
}
|
||||
current_label = provider_labels.get(current_provider, current_provider)
|
||||
|
|
@ -1064,10 +1067,11 @@ def _setup_tts_provider(config: dict):
|
|||
"ElevenLabs (premium quality, needs API key)",
|
||||
"OpenAI TTS (good quality, needs API key)",
|
||||
"MiniMax TTS (high quality with voice cloning, needs API key)",
|
||||
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
|
||||
"NeuTTS (local on-device, free, ~300MB model download)",
|
||||
]
|
||||
)
|
||||
providers.extend(["edge", "elevenlabs", "openai", "minimax", "neutts"])
|
||||
providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"])
|
||||
choices.append(f"Keep current ({current_label})")
|
||||
keep_current_idx = len(choices) - 1
|
||||
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
|
||||
|
|
@ -1145,6 +1149,18 @@ def _setup_tts_provider(config: dict):
|
|||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
elif selected == "mistral":
|
||||
existing = get_env_value("MISTRAL_API_KEY")
|
||||
if not existing:
|
||||
print()
|
||||
api_key = prompt("Mistral API key for TTS", password=True)
|
||||
if api_key:
|
||||
save_env_value("MISTRAL_API_KEY", api_key)
|
||||
print_success("Mistral TTS API key saved")
|
||||
else:
|
||||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
# Save the selection
|
||||
if "tts" not in config:
|
||||
config["tts"] = {}
|
||||
|
|
|
|||
|
|
@ -181,6 +181,14 @@ TOOL_CATEGORIES = {
|
|||
],
|
||||
"tts_provider": "elevenlabs",
|
||||
},
|
||||
{
|
||||
"name": "Mistral (Voxtral TTS)",
|
||||
"tag": "Multilingual, native Opus, needs MISTRAL_API_KEY",
|
||||
"env_vars": [
|
||||
{"key": "MISTRAL_API_KEY", "prompt": "Mistral API key", "url": "https://console.mistral.ai/"},
|
||||
],
|
||||
"tts_provider": "mistral",
|
||||
},
|
||||
],
|
||||
},
|
||||
"web": {
|
||||
|
|
|
|||
|
|
@ -249,8 +249,12 @@ def check_config(groq_key, eleven_key):
|
|||
|
||||
if stt_provider == "groq" and not groq_key:
|
||||
warn("STT config says groq but GROQ_API_KEY is missing")
|
||||
if stt_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"):
|
||||
warn("STT config says mistral but MISTRAL_API_KEY is missing")
|
||||
if tts_provider == "elevenlabs" and not eleven_key:
|
||||
warn("TTS config says elevenlabs but ELEVENLABS_API_KEY is missing")
|
||||
if tts_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"):
|
||||
warn("TTS config says mistral but MISTRAL_API_KEY is missing")
|
||||
except Exception as e:
|
||||
warn("config.yaml", f"parse error: {e}")
|
||||
else:
|
||||
|
|
|
|||
245
tests/tools/test_tts_mistral.py
Normal file
245
tests/tools/test_tts_mistral.py
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
"""Tests for the Mistral (Voxtral) TTS provider in tools/tts_tool.py."""
|
||||
|
||||
import base64
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Drop the env vars these tests toggle so each test starts from a clean slate."""
    monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_mistral_module():
    """Install a stub ``mistralai`` SDK in ``sys.modules`` and yield its client mock.

    The yielded mock is what ``Mistral(...)`` returns and also what the
    ``with Mistral(...) as client`` statement binds, so tests can configure
    and inspect ``audio.speech.complete`` directly on it.
    """
    client = MagicMock()
    # The client doubles as its own context manager.
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)

    sdk_stub = MagicMock()
    sdk_stub.Mistral = MagicMock(return_value=client)

    # Both the package and its ``client`` submodule resolve to the same stub.
    stubbed_modules = {"mistralai": sdk_stub, "mistralai.client": sdk_stub}
    with patch.dict("sys.modules", stubbed_modules):
        yield client
|
||||
|
||||
|
||||
class TestGenerateMistralTts:
    """Unit tests for ``_generate_mistral_tts`` run against the stubbed Mistral SDK."""

    @staticmethod
    def _stub_audio(client, payload=b"data"):
        """Make the stubbed ``audio.speech.complete`` return *payload* base64-encoded."""
        client.audio.speech.complete.return_value = MagicMock(
            audio_data=base64.b64encode(payload).decode()
        )

    @staticmethod
    def _sent_kwargs(client):
        """Return the keyword arguments of the most recent ``audio.speech.complete`` call."""
        return client.audio.speech.complete.call_args[1]

    def test_missing_api_key_raises_value_error(self, tmp_path, mock_mistral_module):
        """With no MISTRAL_API_KEY set, a ValueError naming the variable is raised."""
        from tools.tts_tool import _generate_mistral_tts

        target = str(tmp_path / "test.mp3")
        with pytest.raises(ValueError, match="MISTRAL_API_KEY"):
            _generate_mistral_tts("Hello", target, {})

    def test_successful_generation(self, tmp_path, mock_mistral_module, monkeypatch):
        """Decoded audio is written to the output path and the client is closed."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        expected_bytes = b"fake-audio-bytes"
        self._stub_audio(mock_mistral_module, expected_bytes)

        target = tmp_path / "test.mp3"
        returned = _generate_mistral_tts("Hello world", str(target), {})

        assert returned == str(target)
        assert target.read_bytes() == expected_bytes
        mock_mistral_module.audio.speech.complete.assert_called_once()
        mock_mistral_module.__exit__.assert_called_once()
        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["input"] == "Hello world"
        assert sent["response_format"] == "mp3"

    @pytest.mark.parametrize(
        "extension, expected_format",
        [(".ogg", "opus"), (".wav", "wav"), (".flac", "flac"), (".mp3", "mp3")],
    )
    def test_response_format_from_extension(
        self, tmp_path, mock_mistral_module, monkeypatch, extension, expected_format
    ):
        """The requested ``response_format`` follows the output file extension."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        target = str(tmp_path / f"test{extension}")
        _generate_mistral_tts("Hi", target, {})

        assert self._sent_kwargs(mock_mistral_module)["response_format"] == expected_format

    def test_voice_id_passed_when_configured(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured ``voice_id`` is forwarded to the API call."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        tts_cfg = {"mistral": {"voice_id": "my-voice-uuid"}}
        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_cfg)

        assert self._sent_kwargs(mock_mistral_module)["voice_id"] == "my-voice-uuid"

    def test_default_voice_id_when_absent(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """With no ``mistral`` section the default voice id is sent."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_default_voice_id_when_empty_string(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """An empty configured ``voice_id`` falls back to the default voice id."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        tts_cfg = {"mistral": {"voice_id": ""}}
        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_cfg)

        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_api_error_sanitized(self, tmp_path, mock_mistral_module, monkeypatch):
        """SDK failures surface as RuntimeError without the original message text."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        leaky_error = RuntimeError("secret-key-in-error")
        mock_mistral_module.audio.speech.complete.side_effect = leaky_error

        with pytest.raises(RuntimeError, match="RuntimeError") as caught:
            _generate_mistral_tts("Hello", str(tmp_path / "test.mp3"), {})
        assert "secret-key-in-error" not in str(caught.value)

    def test_default_model_used(self, tmp_path, mock_mistral_module, monkeypatch):
        """With no configured model the default model name is sent."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_MODEL, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        assert self._sent_kwargs(mock_mistral_module)["model"] == DEFAULT_MISTRAL_TTS_MODEL

    def test_model_from_config_overrides_default(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured model name takes precedence over the default."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        tts_cfg = {"mistral": {"model": "voxtral-large-tts-9999"}}
        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_cfg)

        assert self._sent_kwargs(mock_mistral_module)["model"] == "voxtral-large-tts-9999"
|
||||
|
||||
|
||||
class TestTtsDispatcherMistral:
    """Tests that ``text_to_speech_tool`` dispatches to the Mistral provider."""

    @staticmethod
    def _mistral_selected():
        """Patch the TTS config loader so the active provider is ``mistral``."""
        return patch("tools.tts_tool._load_tts_config", return_value={"provider": "mistral"})

    def test_dispatcher_routes_to_mistral(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A ``mistral`` provider setting produces a successful Mistral API call."""
        import json

        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        speech_complete = mock_mistral_module.audio.speech.complete
        speech_complete.return_value = MagicMock(
            audio_data=base64.b64encode(b"audio").decode()
        )

        destination = str(tmp_path / "out.mp3")
        with self._mistral_selected():
            raw = text_to_speech_tool("Hello", output_path=destination)
        result = json.loads(raw)

        assert result["success"] is True
        assert result["provider"] == "mistral"
        speech_complete.assert_called_once()

    def test_dispatcher_returns_error_when_sdk_not_installed(self, tmp_path, monkeypatch):
        """A failing SDK import yields a JSON error that mentions ``mistralai``."""
        import json

        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        sdk_missing = patch(
            "tools.tts_tool._import_mistral_client", side_effect=ImportError("no module")
        )
        with sdk_missing, self._mistral_selected():
            raw = text_to_speech_tool("Hello", output_path=str(tmp_path / "out.mp3"))
        result = json.loads(raw)

        assert result["success"] is False
        assert "mistralai" in result["error"]
|
||||
|
||||
|
||||
class TestCheckTtsRequirementsMistral:
    """Tests for the Mistral branch of ``check_tts_requirements``."""

    @staticmethod
    def _other_providers_unavailable():
        """Patch every non-Mistral import/availability probe to report "unavailable"."""
        return patch.multiple(
            "tools.tts_tool",
            _import_edge_tts=MagicMock(side_effect=ImportError),
            _import_elevenlabs=MagicMock(side_effect=ImportError),
            _import_openai_client=MagicMock(side_effect=ImportError),
            _check_neutts_available=MagicMock(return_value=False),
        )

    def test_mistral_sdk_and_key_returns_true(self, mock_mistral_module, monkeypatch):
        """With the SDK importable and the key set, requirements are satisfied."""
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        with self._other_providers_unavailable():
            assert check_tts_requirements() is True

    def test_mistral_key_missing_returns_false(self, mock_mistral_module, monkeypatch):
        """With the SDK importable but no key set, requirements are not satisfied."""
        from tools.tts_tool import check_tts_requirements

        # check_tts_requirements() returns True as soon as MINIMAX_API_KEY is set,
        # and that check has no import probe to patch. Clear the variable so the
        # result does not depend on the environment the tests run in.
        monkeypatch.delenv("MINIMAX_API_KEY", raising=False)
        with self._other_providers_unavailable():
            assert check_tts_requirements() is False
|
||||
|
||||
|
||||
class TestMistralTtsOpus:
    """Checks native Opus output through ``text_to_speech_tool`` for Telegram sessions."""

    def test_telegram_produces_ogg_and_voice_compatible(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """On Telegram the Mistral provider yields a voice-compatible ``.ogg`` file."""
        import json

        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram")
        speech_complete = mock_mistral_module.audio.speech.complete
        speech_complete.return_value = MagicMock(
            audio_data=base64.b64encode(b"opus-audio").decode()
        )

        # NOTE(review): no output_path is passed, so the tool picks its own
        # destination and tmp_path goes unused; confirm that is intended.
        provider_cfg = {"provider": "mistral"}
        with patch("tools.tts_tool._load_tts_config", return_value=provider_cfg):
            raw = text_to_speech_tool("Hello")
        result = json.loads(raw)

        assert result["success"] is True
        assert result["file_path"].endswith(".ogg")
        assert result["voice_compatible"] is True
        assert "[[audio_as_voice]]" in result["media_tag"]
        sent = speech_complete.call_args[1]
        assert sent["response_format"] == "opus"
|
||||
|
|
@ -2,11 +2,12 @@
|
|||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports five TTS providers:
|
||||
Supports six TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
|
||||
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
|
||||
Output formats:
|
||||
|
|
@ -23,6 +24,7 @@ Usage:
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
|
|
@ -62,6 +64,11 @@ def _import_openai_client():
|
|||
from openai import OpenAI as OpenAIClient
|
||||
return OpenAIClient
|
||||
|
||||
def _import_mistral_client():
    """Import the Mistral SDK on demand and return its ``Mistral`` client class.

    The import happens inside the function so the module can be loaded
    without the ``mistralai`` package; ImportError propagates to the caller
    when the package is missing.
    """
    from mistralai.client import Mistral as mistral_client_cls

    return mistral_client_cls
|
||||
|
||||
def _import_sounddevice():
|
||||
"""Lazy import sounddevice. Returns the module or raises ImportError/OSError."""
|
||||
import sounddevice as sd
|
||||
|
|
@ -82,6 +89,8 @@ DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
|
|||
DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
|
||||
DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady"
|
||||
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2"
|
||||
DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
|
||||
DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral
|
||||
|
||||
def _get_default_output_dir() -> str:
|
||||
from hermes_constants import get_hermes_dir
|
||||
|
|
@ -365,6 +374,55 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
|||
return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: Mistral (Voxtral TTS)
|
||||
# ===========================================================================
|
||||
def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using the Mistral Voxtral TTS API.

    The API returns base64-encoded audio; this function decodes it
    and writes the raw bytes to *output_path*.
    Supports native Opus output for Telegram voice bubbles.

    Args:
        text: Text to synthesize.
        output_path: Destination file. Its extension (matched
            case-insensitively) selects the requested format:
            ``.ogg`` -> opus, ``.wav`` -> wav, ``.flac`` -> flac,
            anything else -> mp3.
        tts_config: TTS configuration. The optional ``"mistral"`` section
            may carry ``model`` and ``voice_id``; missing, null or empty
            values fall back to the module defaults.

    Returns:
        *output_path*, once the audio has been written.

    Raises:
        ValueError: If ``MISTRAL_API_KEY`` is not set. A ValueError raised
            during the API call or base64 decoding is also passed through.
        ImportError: If the Mistral SDK cannot be imported.
        RuntimeError: For any other failure during the API call or decoding.
            The message carries only the exception type name; the full error
            goes to the log.
    """
    api_key = os.getenv("MISTRAL_API_KEY", "")
    if not api_key:
        raise ValueError("MISTRAL_API_KEY not set. Get one at https://console.mistral.ai/")

    # A `mistral:` key with no children loads as None rather than {}, so
    # `or {}` is needed; a plain .get default would not apply.
    mi_config = tts_config.get("mistral") or {}
    # Treat empty/None model like empty voice_id: fall back to the default.
    model = mi_config.get("model") or DEFAULT_MISTRAL_TTS_MODEL
    voice_id = mi_config.get("voice_id") or DEFAULT_MISTRAL_TTS_VOICE_ID

    # Match the extension case-insensitively so e.g. "clip.OGG" still gets
    # Opus bytes instead of silently receiving MP3 data.
    lowered_path = output_path.lower()
    format_by_suffix = ((".ogg", "opus"), (".wav", "wav"), (".flac", "flac"))
    response_format = next(
        (fmt for suffix, fmt in format_by_suffix if lowered_path.endswith(suffix)),
        "mp3",
    )

    Mistral = _import_mistral_client()
    try:
        with Mistral(api_key=api_key) as client:
            response = client.audio.speech.complete(
                model=model,
                input=text,
                voice_id=voice_id,
                response_format=response_format,
            )
            audio_bytes = base64.b64decode(response.audio_data)
    except ValueError:
        # Passed through unchanged (presumably so callers can tell value and
        # configuration problems apart; confirm against the dispatcher).
        raise
    except Exception as e:
        # Full details go to the log only; the raised message is limited to
        # the exception type so provider error text is not surfaced.
        logger.error("Mistral TTS failed: %s", e, exc_info=True)
        raise RuntimeError(f"Mistral TTS failed: {type(e).__name__}") from e

    with open(output_path, "wb") as f:
        f.write(audio_bytes)

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NeuTTS (local, on-device TTS via neutts_cli)
|
||||
# ===========================================================================
|
||||
|
|
@ -493,7 +551,7 @@ def text_to_speech_tool(
|
|||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
# Use .ogg for Telegram with providers that support native Opus output,
|
||||
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
|
||||
if want_opus and provider in ("openai", "elevenlabs"):
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral"):
|
||||
file_path = out_dir / f"tts_{timestamp}.ogg"
|
||||
else:
|
||||
file_path = out_dir / f"tts_{timestamp}.mp3"
|
||||
|
|
@ -530,6 +588,18 @@ def text_to_speech_tool(
|
|||
logger.info("Generating speech with MiniMax TTS...")
|
||||
_generate_minimax_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "mistral":
|
||||
try:
|
||||
_import_mistral_client()
|
||||
except ImportError:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "Mistral provider selected but 'mistralai' package not installed. "
|
||||
"Run: pip install 'hermes-agent[mistral]'"
|
||||
}, ensure_ascii=False)
|
||||
logger.info("Generating speech with Mistral Voxtral TTS...")
|
||||
_generate_mistral_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "neutts":
|
||||
if not _check_neutts_available():
|
||||
return json.dumps({
|
||||
|
|
@ -584,8 +654,7 @@ def text_to_speech_tool(
|
|||
if opus_path:
|
||||
file_str = opus_path
|
||||
voice_compatible = True
|
||||
elif provider in ("elevenlabs", "openai"):
|
||||
# These providers can output Opus natively if the path ends in .ogg
|
||||
elif provider in ("elevenlabs", "openai", "mistral"):
|
||||
voice_compatible = file_str.endswith(".ogg")
|
||||
|
||||
file_size = os.path.getsize(file_str)
|
||||
|
|
@ -653,6 +722,12 @@ def check_tts_requirements() -> bool:
|
|||
pass
|
||||
if os.getenv("MINIMAX_API_KEY"):
|
||||
return True
|
||||
try:
|
||||
_import_mistral_client()
|
||||
if os.getenv("MISTRAL_API_KEY"):
|
||||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
if _check_neutts_available():
|
||||
return True
|
||||
return False
|
||||
|
|
|
|||
|
|
@ -145,6 +145,7 @@ ELEVENLABS_API_KEY=***
|
|||
- `neutts` → free local/on-device TTS
|
||||
- `elevenlabs` → best quality
|
||||
- `openai` → good middle ground
|
||||
- `mistral` → multilingual, native Opus
|
||||
|
||||
### If you use `hermes setup`
|
||||
|
||||
|
|
|
|||
|
|
@ -864,6 +864,7 @@ You can switch between providers at any time with `hermes model` — no restart
|
|||
| Image generation | [FAL](https://fal.ai/) | `FAL_KEY` |
|
||||
| Premium TTS voices | [ElevenLabs](https://elevenlabs.io/) | `ELEVENLABS_API_KEY` |
|
||||
| OpenAI TTS + voice transcription | [OpenAI](https://platform.openai.com/api-keys) | `VOICE_TOOLS_OPENAI_KEY` |
|
||||
| Mistral TTS + voice transcription | [Mistral](https://console.mistral.ai/) | `MISTRAL_API_KEY` |
|
||||
| RL Training | [Tinker](https://tinker-console.thinkingmachines.ai/) + [WandB](https://wandb.ai/) | `TINKER_API_KEY`, `WANDB_API_KEY` |
|
||||
| Cross-session user modeling | [Honcho](https://honcho.dev/) | `HONCHO_API_KEY` |
|
||||
| Semantic long-term memory | [Supermemory](https://supermemory.ai) | `SUPERMEMORY_API_KEY` |
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription
|
|||
|
||||
## Text-to-Speech
|
||||
|
||||
Convert text to speech with five providers:
|
||||
Convert text to speech with six providers:
|
||||
|
||||
| Provider | Quality | Cost | API Key |
|
||||
|----------|---------|------|---------|
|
||||
|
|
@ -18,6 +18,7 @@ Convert text to speech with five providers:
|
|||
| **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` |
|
||||
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
|
||||
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
|
||||
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
|
||||
| **NeuTTS** | Good | Free | None needed |
|
||||
|
||||
### Platform Delivery
|
||||
|
|
@ -34,7 +35,7 @@ Convert text to speech with five providers:
|
|||
```yaml
|
||||
# In ~/.hermes/config.yaml
|
||||
tts:
|
||||
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "neutts"
|
||||
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts"
|
||||
edge:
|
||||
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
||||
elevenlabs:
|
||||
|
|
@ -50,6 +51,9 @@ tts:
|
|||
speed: 1 # 0.5 - 2.0
|
||||
vol: 1 # 0 - 10
|
||||
pitch: 0 # -12 - 12
|
||||
mistral:
|
||||
model: "voxtral-mini-tts-2603"
|
||||
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
|
||||
neutts:
|
||||
ref_audio: ''
|
||||
ref_text: ''
|
||||
|
|
@ -61,7 +65,7 @@ tts:
|
|||
|
||||
Telegram voice bubbles require Opus/OGG audio format:
|
||||
|
||||
- **OpenAI and ElevenLabs** produce Opus natively — no extra setup
|
||||
- **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup
|
||||
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
|
||||
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
|
|
@ -80,7 +84,7 @@ sudo dnf install ffmpeg
|
|||
Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
|
||||
|
||||
:::tip
|
||||
If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider.
|
||||
If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.
|
||||
:::
|
||||
|
||||
## Voice Message Transcription (STT)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue