feat(tools): add Voxtral TTS provider (Mistral AI)

This commit is contained in:
jjovalle99 2026-04-06 19:04:00 +01:00 committed by Teknium
parent 5a55d54ee2
commit 640441b865
11 changed files with 379 additions and 12 deletions

View file

@ -588,7 +588,7 @@ platform_toolsets:
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
# todo - todo (in-memory task planning, no deps)
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX key)
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
#
@ -617,7 +617,7 @@ platform_toolsets:
# todo - Task planning and tracking for multi-step work
# memory - Persistent memory across sessions (personal notes + user profile)
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax)
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
# cronjob - Schedule and manage automated tasks (CLI-only)
# rl - RL training tools (Tinker-Atropos)
#

View file

@ -458,7 +458,7 @@ DEFAULT_CONFIG = {
# Text-to-speech configuration
"tts": {
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "neutts" (local)
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local)
"edge": {
"voice": "en-US-AriaNeural",
# Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural
@ -472,6 +472,10 @@ DEFAULT_CONFIG = {
"voice": "alloy",
# Voices: alloy, echo, fable, onyx, nova, shimmer
},
"mistral": {
"model": "voxtral-mini-tts-2603",
"voice_id": "c69964a6-ab8b-4f8a-9465-ec0925096ec8", # Paul - Neutral
},
"neutts": {
"ref_audio": "", # Path to reference voice audio (empty = bundled default)
"ref_text": "", # Path to reference voice transcript (empty = bundled default)
@ -1016,6 +1020,13 @@ OPTIONAL_ENV_VARS = {
"password": True,
"category": "tool",
},
"MISTRAL_API_KEY": {
"description": "Mistral API key for Voxtral TTS and transcription (STT)",
"prompt": "Mistral API key",
"url": "https://console.mistral.ai/",
"password": True,
"category": "tool",
},
"GITHUB_TOKEN": {
"description": "GitHub token for Skills Hub (higher API rate limits, skill publish)",
"prompt": "GitHub Token",

View file

@ -143,6 +143,7 @@ def _tts_label(current_provider: str) -> str:
"openai": "OpenAI TTS",
"elevenlabs": "ElevenLabs",
"edge": "Edge TTS",
"mistral": "Mistral Voxtral TTS",
"neutts": "NeuTTS",
}
return mapping.get(current_provider or "edge", current_provider or "Edge TTS")
@ -309,6 +310,7 @@ def get_nous_subscription_features(
tts_current_provider in {"edge", "neutts"}
or (tts_current_provider == "openai" and (managed_tts_available or direct_openai_tts))
or (tts_current_provider == "elevenlabs" and direct_elevenlabs)
or (tts_current_provider == "mistral" and bool(get_env_value("MISTRAL_API_KEY")))
)
tts_active = bool(tts_tool_enabled and tts_available)

View file

@ -557,6 +557,8 @@ def _print_setup_summary(config: dict, hermes_home):
tool_status.append(("Text-to-Speech (OpenAI)", True, None))
elif tts_provider == "minimax" and get_env_value("MINIMAX_API_KEY"):
tool_status.append(("Text-to-Speech (MiniMax)", True, None))
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
elif tts_provider == "neutts":
try:
import importlib.util
@ -1044,6 +1046,7 @@ def _setup_tts_provider(config: dict):
"elevenlabs": "ElevenLabs",
"openai": "OpenAI TTS",
"minimax": "MiniMax TTS",
"mistral": "Mistral Voxtral TTS",
"neutts": "NeuTTS",
}
current_label = provider_labels.get(current_provider, current_provider)
@ -1064,10 +1067,11 @@ def _setup_tts_provider(config: dict):
"ElevenLabs (premium quality, needs API key)",
"OpenAI TTS (good quality, needs API key)",
"MiniMax TTS (high quality with voice cloning, needs API key)",
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
"NeuTTS (local on-device, free, ~300MB model download)",
]
)
providers.extend(["edge", "elevenlabs", "openai", "minimax", "neutts"])
providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"])
choices.append(f"Keep current ({current_label})")
keep_current_idx = len(choices) - 1
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
@ -1145,6 +1149,18 @@ def _setup_tts_provider(config: dict):
print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge"
elif selected == "mistral":
existing = get_env_value("MISTRAL_API_KEY")
if not existing:
print()
api_key = prompt("Mistral API key for TTS", password=True)
if api_key:
save_env_value("MISTRAL_API_KEY", api_key)
print_success("Mistral TTS API key saved")
else:
print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge"
# Save the selection
if "tts" not in config:
config["tts"] = {}

View file

@ -181,6 +181,14 @@ TOOL_CATEGORIES = {
],
"tts_provider": "elevenlabs",
},
{
"name": "Mistral (Voxtral TTS)",
"tag": "Multilingual, native Opus, needs MISTRAL_API_KEY",
"env_vars": [
{"key": "MISTRAL_API_KEY", "prompt": "Mistral API key", "url": "https://console.mistral.ai/"},
],
"tts_provider": "mistral",
},
],
},
"web": {

View file

@ -249,8 +249,12 @@ def check_config(groq_key, eleven_key):
if stt_provider == "groq" and not groq_key:
warn("STT config says groq but GROQ_API_KEY is missing")
if stt_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"):
warn("STT config says mistral but MISTRAL_API_KEY is missing")
if tts_provider == "elevenlabs" and not eleven_key:
warn("TTS config says elevenlabs but ELEVENLABS_API_KEY is missing")
if tts_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"):
warn("TTS config says mistral but MISTRAL_API_KEY is missing")
except Exception as e:
warn("config.yaml", f"parse error: {e}")
else:

View file

@ -0,0 +1,245 @@
"""Tests for the Mistral (Voxtral) TTS provider in tools/tts_tool.py."""
import base64
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Strip provider-related environment variables before every test.

    MISTRAL_API_KEY and HERMES_SESSION_PLATFORM are set explicitly by the
    tests that need them.  MINIMAX_API_KEY is cleared as well because
    ``check_tts_requirements`` returns True as soon as that variable is set,
    regardless of which SDKs are importable; a developer machine that has it
    exported would otherwise make the "key missing" test fail and the
    "key present" test pass for the wrong reason.
    """
    for key in ("MISTRAL_API_KEY", "MINIMAX_API_KEY", "HERMES_SESSION_PLATFORM"):
        monkeypatch.delenv(key, raising=False)
@pytest.fixture
def mock_mistral_module():
    """Install a stand-in ``mistralai`` SDK and yield its client mock.

    The yielded mock is what ``Mistral(...)`` returns and also what the
    ``with`` statement binds, so tests can configure and inspect
    ``audio.speech.complete`` on it directly.  Both ``mistralai`` and
    ``mistralai.client`` resolve to the same stub module for the duration
    of the test.
    """
    client = MagicMock()
    # Entering the context manager hands back the very same mock.
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)

    sdk_stub = MagicMock()
    sdk_stub.Mistral = MagicMock(return_value=client)

    stubbed_modules = {"mistralai": sdk_stub, "mistralai.client": sdk_stub}
    with patch.dict("sys.modules", stubbed_modules):
        yield client
class TestGenerateMistralTts:
    """Unit tests for ``_generate_mistral_tts`` run against the stubbed Mistral client."""

    @staticmethod
    def _stub_audio(client, payload=b"data"):
        """Make the stubbed ``audio.speech.complete`` return *payload* base64-encoded."""
        client.audio.speech.complete.return_value = MagicMock(
            audio_data=base64.b64encode(payload).decode()
        )

    @staticmethod
    def _sent_kwargs(client):
        """Return the keyword arguments of the latest ``audio.speech.complete`` call."""
        return client.audio.speech.complete.call_args[1]

    def test_missing_api_key_raises_value_error(self, tmp_path, mock_mistral_module):
        """Without MISTRAL_API_KEY the call fails with a ValueError naming the variable."""
        from tools.tts_tool import _generate_mistral_tts

        target = str(tmp_path / "test.mp3")
        with pytest.raises(ValueError, match="MISTRAL_API_KEY"):
            _generate_mistral_tts("Hello", target, {})

    def test_successful_generation(self, tmp_path, mock_mistral_module, monkeypatch):
        """Decoded audio is written to the path, which is also the return value."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        expected_bytes = b"fake-audio-bytes"
        self._stub_audio(mock_mistral_module, expected_bytes)
        audio_file = tmp_path / "test.mp3"

        returned = _generate_mistral_tts("Hello world", str(audio_file), {})

        assert returned == str(audio_file)
        assert audio_file.read_bytes() == expected_bytes
        mock_mistral_module.audio.speech.complete.assert_called_once()
        # The client must have been used as a context manager and closed.
        mock_mistral_module.__exit__.assert_called_once()
        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["input"] == "Hello world"
        assert sent["response_format"] == "mp3"

    @pytest.mark.parametrize(
        "extension, expected_format",
        [
            (".ogg", "opus"),
            (".wav", "wav"),
            (".flac", "flac"),
            (".mp3", "mp3"),
        ],
    )
    def test_response_format_from_extension(
        self, tmp_path, mock_mistral_module, monkeypatch, extension, expected_format
    ):
        """The requested response format follows the output file extension."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        target = str(tmp_path / f"test{extension}")

        _generate_mistral_tts("Hi", target, {})

        assert self._sent_kwargs(mock_mistral_module)["response_format"] == expected_format

    def test_voice_id_passed_when_configured(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured voice_id is forwarded to the API unchanged."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        settings = {"mistral": {"voice_id": "my-voice-uuid"}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), settings)

        assert self._sent_kwargs(mock_mistral_module)["voice_id"] == "my-voice-uuid"

    def test_default_voice_id_when_absent(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """With no mistral section, the module default voice is used."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_default_voice_id_when_empty_string(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """An empty voice_id string is treated like a missing one."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        settings = {"mistral": {"voice_id": ""}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), settings)

        sent = self._sent_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_api_error_sanitized(self, tmp_path, mock_mistral_module, monkeypatch):
        """SDK failures surface as RuntimeError carrying only the exception class name."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        mock_mistral_module.audio.speech.complete.side_effect = RuntimeError(
            "secret-key-in-error"
        )

        with pytest.raises(RuntimeError, match="RuntimeError") as exc_info:
            _generate_mistral_tts("Hello", str(tmp_path / "test.mp3"), {})

        # The original message must not leak into the re-raised error.
        assert "secret-key-in-error" not in str(exc_info.value)

    def test_default_model_used(self, tmp_path, mock_mistral_module, monkeypatch):
        """With no mistral section, the module default model is requested."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_MODEL, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        assert self._sent_kwargs(mock_mistral_module)["model"] == DEFAULT_MISTRAL_TTS_MODEL

    def test_model_from_config_overrides_default(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured model name takes precedence over the default."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._stub_audio(mock_mistral_module)
        settings = {"mistral": {"model": "voxtral-large-tts-9999"}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), settings)

        assert self._sent_kwargs(mock_mistral_module)["model"] == "voxtral-large-tts-9999"
class TestTtsDispatcherMistral:
    """Checks that ``text_to_speech_tool`` dispatches to the Mistral provider."""

    def test_dispatcher_routes_to_mistral(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """provider == "mistral" reaches the Mistral client and reports success."""
        import json
        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        complete = mock_mistral_module.audio.speech.complete
        complete.return_value = MagicMock(
            audio_data=base64.b64encode(b"audio").decode()
        )
        destination = str(tmp_path / "out.mp3")

        config_patch = patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
        )
        with config_patch:
            raw = text_to_speech_tool("Hello", output_path=destination)
            payload = json.loads(raw)

        assert payload["success"] is True
        assert payload["provider"] == "mistral"
        complete.assert_called_once()

    def test_dispatcher_returns_error_when_sdk_not_installed(self, tmp_path, monkeypatch):
        """An ImportError from the lazy SDK import becomes a JSON error naming the package."""
        import json
        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        destination = str(tmp_path / "out.mp3")

        with patch(
            "tools.tts_tool._import_mistral_client", side_effect=ImportError("no module")
        ):
            with patch(
                "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
            ):
                payload = json.loads(text_to_speech_tool("Hello", output_path=destination))

        assert payload["success"] is False
        assert "mistralai" in payload["error"]
class TestCheckTtsRequirementsMistral:
    """``check_tts_requirements`` behaviour when Mistral is the only candidate provider."""

    @staticmethod
    def _check_with_other_providers_unavailable(check):
        """Call *check* while the Edge, ElevenLabs, OpenAI and NeuTTS probes all fail."""
        with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), \
                patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError):
            with patch("tools.tts_tool._import_openai_client", side_effect=ImportError), \
                    patch("tools.tts_tool._check_neutts_available", return_value=False):
                return check()

    def test_mistral_sdk_and_key_returns_true(self, mock_mistral_module, monkeypatch):
        """SDK importable and key set: requirements are satisfied."""
        from tools.tts_tool import check_tts_requirements

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")

        outcome = self._check_with_other_providers_unavailable(check_tts_requirements)

        assert outcome is True

    def test_mistral_key_missing_returns_false(self, mock_mistral_module):
        """SDK importable but no key: requirements are not satisfied."""
        from tools.tts_tool import check_tts_requirements

        outcome = self._check_with_other_providers_unavailable(check_tts_requirements)

        assert outcome is False
class TestMistralTtsOpus:
    """Opus / voice-bubble behaviour of the Mistral provider on Telegram."""

    def test_telegram_produces_ogg_and_voice_compatible(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """On Telegram the tool picks an .ogg path, requests opus and tags the result as voice."""
        import json
        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram")
        complete = mock_mistral_module.audio.speech.complete
        complete.return_value = MagicMock(
            audio_data=base64.b64encode(b"opus-audio").decode()
        )

        # NOTE(review): no output_path is passed, so the file is written to the
        # tool's default output directory rather than tmp_path (which is
        # requested but unused here) — confirm that is intended.
        config_patch = patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
        )
        with config_patch:
            payload = json.loads(text_to_speech_tool("Hello"))

        assert payload["success"] is True
        assert payload["file_path"].endswith(".ogg")
        assert payload["voice_compatible"] is True
        assert "[[audio_as_voice]]" in payload["media_tag"]
        sent = complete.call_args[1]
        assert sent["response_format"] == "opus"

View file

@ -2,11 +2,12 @@
"""
Text-to-Speech Tool Module
Supports five TTS providers:
Supports six TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
Output formats:
@ -23,6 +24,7 @@ Usage:
"""
import asyncio
import base64
import datetime
import json
import logging
@ -62,6 +64,11 @@ def _import_openai_client():
from openai import OpenAI as OpenAIClient
return OpenAIClient
def _import_mistral_client():
    """Import the Mistral SDK on first use and hand back its client class.

    Returns:
        The ``Mistral`` class from ``mistralai.client``.

    Raises:
        ImportError: If the ``mistralai`` package cannot be imported.
    """
    from mistralai.client import Mistral as MistralClient
    return MistralClient
def _import_sounddevice():
"""Lazy import sounddevice. Returns the module or raises ImportError/OSError."""
import sounddevice as sd
@ -82,6 +89,8 @@ DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"
DEFAULT_MINIMAX_MODEL = "speech-2.8-hd"
DEFAULT_MINIMAX_VOICE_ID = "English_Graceful_Lady"
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1/t2a_v2"
# Fallbacks used by the Mistral provider when the "mistral" TTS config section
# does not supply a model / voice_id. The same values appear under
# DEFAULT_CONFIG["tts"]["mistral"]; keep the two in sync.
DEFAULT_MISTRAL_TTS_MODEL = "voxtral-mini-tts-2603"
DEFAULT_MISTRAL_TTS_VOICE_ID = "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral
def _get_default_output_dir() -> str:
from hermes_constants import get_hermes_dir
@ -365,6 +374,55 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
return output_path
# ===========================================================================
# Provider: Mistral (Voxtral TTS)
# ===========================================================================
def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using the Mistral Voxtral TTS API.

    The API returns base64-encoded audio; this function decodes it and
    writes the raw bytes to *output_path*.  The audio format requested from
    the API is derived from the output file extension, so an ``.ogg`` path
    yields native Opus (usable for Telegram voice bubbles).

    Args:
        text: Text to synthesize.
        output_path: Destination file.  ``.ogg`` requests ``opus``, ``.wav``
            requests ``wav``, ``.flac`` requests ``flac``; anything else
            requests ``mp3``.  The extension is matched case-insensitively.
        tts_config: TTS configuration.  Its optional ``"mistral"`` section may
            carry ``model`` and ``voice_id``; a missing or empty section, or
            a missing or empty value, falls back to the module defaults.

    Returns:
        *output_path*, after the audio bytes have been written.

    Raises:
        ValueError: If ``MISTRAL_API_KEY`` is not set, or if the API call or
            base64 decoding itself raises a ``ValueError`` (re-raised as is).
        ImportError: If the ``mistralai`` package is not installed.
        RuntimeError: For any other failure during the API call or decoding.
            The message contains only the original exception's class name so
            that error text from the SDK is not propagated to the caller.
    """
    api_key = os.getenv("MISTRAL_API_KEY", "")
    if not api_key:
        raise ValueError("MISTRAL_API_KEY not set. Get one at https://console.mistral.ai/")

    # `or {}` rather than a .get() default: an empty `mistral:` section in the
    # config file is present-but-None and would otherwise crash on .get below.
    mi_config = tts_config.get("mistral") or {}
    # Treat empty strings like missing values for both settings.
    model = mi_config.get("model") or DEFAULT_MISTRAL_TTS_MODEL
    voice_id = mi_config.get("voice_id") or DEFAULT_MISTRAL_TTS_VOICE_ID

    # Pick the API response format from the file extension; default is mp3.
    suffix_formats = ((".ogg", "opus"), (".wav", "wav"), (".flac", "flac"))
    lowered_path = output_path.lower()
    response_format = next(
        (fmt for suffix, fmt in suffix_formats if lowered_path.endswith(suffix)),
        "mp3",
    )

    Mistral = _import_mistral_client()

    try:
        with Mistral(api_key=api_key) as client:
            response = client.audio.speech.complete(
                model=model,
                input=text,
                voice_id=voice_id,
                response_format=response_format,
            )
        audio_bytes = base64.b64decode(response.audio_data)
    except ValueError:
        # Propagated unchanged (this includes malformed base64, since
        # binascii.Error subclasses ValueError).
        raise
    except Exception as e:
        # Full details go to the log; the raised error is deliberately terse.
        logger.error("Mistral TTS failed: %s", e, exc_info=True)
        raise RuntimeError(f"Mistral TTS failed: {type(e).__name__}") from e

    with open(output_path, "wb") as f:
        f.write(audio_bytes)

    return output_path
# ===========================================================================
# NeuTTS (local, on-device TTS via neutts_cli)
# ===========================================================================
@ -493,7 +551,7 @@ def text_to_speech_tool(
out_dir.mkdir(parents=True, exist_ok=True)
# Use .ogg for Telegram with providers that support native Opus output,
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
if want_opus and provider in ("openai", "elevenlabs"):
if want_opus and provider in ("openai", "elevenlabs", "mistral"):
file_path = out_dir / f"tts_{timestamp}.ogg"
else:
file_path = out_dir / f"tts_{timestamp}.mp3"
@ -530,6 +588,18 @@ def text_to_speech_tool(
logger.info("Generating speech with MiniMax TTS...")
_generate_minimax_tts(text, file_str, tts_config)
elif provider == "mistral":
try:
_import_mistral_client()
except ImportError:
return json.dumps({
"success": False,
"error": "Mistral provider selected but 'mistralai' package not installed. "
"Run: pip install 'hermes-agent[mistral]'"
}, ensure_ascii=False)
logger.info("Generating speech with Mistral Voxtral TTS...")
_generate_mistral_tts(text, file_str, tts_config)
elif provider == "neutts":
if not _check_neutts_available():
return json.dumps({
@ -584,8 +654,7 @@ def text_to_speech_tool(
if opus_path:
file_str = opus_path
voice_compatible = True
elif provider in ("elevenlabs", "openai"):
# These providers can output Opus natively if the path ends in .ogg
elif provider in ("elevenlabs", "openai", "mistral"):
voice_compatible = file_str.endswith(".ogg")
file_size = os.path.getsize(file_str)
@ -653,6 +722,12 @@ def check_tts_requirements() -> bool:
pass
if os.getenv("MINIMAX_API_KEY"):
return True
try:
_import_mistral_client()
if os.getenv("MISTRAL_API_KEY"):
return True
except ImportError:
pass
if _check_neutts_available():
return True
return False

View file

@ -145,6 +145,7 @@ ELEVENLABS_API_KEY=***
- `neutts` → free local/on-device TTS
- `elevenlabs` → best quality
- `openai` → good middle ground
- `mistral` → multilingual, native Opus
### If you use `hermes setup`

View file

@ -864,6 +864,7 @@ You can switch between providers at any time with `hermes model` — no restart
| Image generation | [FAL](https://fal.ai/) | `FAL_KEY` |
| Premium TTS voices | [ElevenLabs](https://elevenlabs.io/) | `ELEVENLABS_API_KEY` |
| OpenAI TTS + voice transcription | [OpenAI](https://platform.openai.com/api-keys) | `VOICE_TOOLS_OPENAI_KEY` |
| Mistral TTS + voice transcription | [Mistral](https://console.mistral.ai/) | `MISTRAL_API_KEY` |
| RL Training | [Tinker](https://tinker-console.thinkingmachines.ai/) + [WandB](https://wandb.ai/) | `TINKER_API_KEY`, `WANDB_API_KEY` |
| Cross-session user modeling | [Honcho](https://honcho.dev/) | `HONCHO_API_KEY` |
| Semantic long-term memory | [Supermemory](https://supermemory.ai) | `SUPERMEMORY_API_KEY` |

View file

@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription
## Text-to-Speech
Convert text to speech with five providers:
Convert text to speech with six providers:
| Provider | Quality | Cost | API Key |
|----------|---------|------|---------|
@ -18,6 +18,7 @@ Convert text to speech with five providers:
| **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` |
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
| **NeuTTS** | Good | Free | None needed |
### Platform Delivery
@ -34,7 +35,7 @@ Convert text to speech with five providers:
```yaml
# In ~/.hermes/config.yaml
tts:
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "neutts"
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts"
edge:
voice: "en-US-AriaNeural" # 322 voices, 74 languages
elevenlabs:
@ -50,6 +51,9 @@ tts:
speed: 1 # 0.5 - 2.0
vol: 1 # 0 - 10
pitch: 0 # -12 - 12
mistral:
model: "voxtral-mini-tts-2603"
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
neutts:
ref_audio: ''
ref_text: ''
@ -61,7 +65,7 @@ tts:
Telegram voice bubbles require Opus/OGG audio format:
- **OpenAI and ElevenLabs** produce Opus natively — no extra setup
- **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
@ -80,7 +84,7 @@ sudo dnf install ffmpeg
Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
:::tip
If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider.
If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.
:::
## Voice Message Transcription (STT)