diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 5807cef7a..e9284d813 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -588,7 +588,7 @@ platform_toolsets: # skills_hub - skill_hub (search/install/manage from online registries — user-driven only) # moa - mixture_of_agents (requires OPENROUTER_API_KEY) # todo - todo (in-memory task planning, no deps) -# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX key) +# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key) # cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks) # rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY) # @@ -617,7 +617,7 @@ platform_toolsets: # todo - Task planning and tracking for multi-step work # memory - Persistent memory across sessions (personal notes + user profile) # session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization) -# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax) +# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral) # cronjob - Schedule and manage automated tasks (CLI-only) # rl - RL training tools (Tinker-Atropos) # diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 2cb6a8d62..89606edc2 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -458,7 +458,7 @@ DEFAULT_CONFIG = { # Text-to-speech configuration "tts": { - "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "neutts" (local) + "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "minimax" | "mistral" | "neutts" (local) "edge": { "voice": "en-US-AriaNeural", # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural @@ -472,6 +472,10 @@ DEFAULT_CONFIG = { "voice": "alloy", # Voices: alloy, echo, fable, onyx, nova, shimmer }, + "mistral": { + "model": "voxtral-mini-tts-2603", + "voice_id": "c69964a6-ab8b-4f8a-9465-ec0925096ec8", # Paul - 
Neutral + }, "neutts": { "ref_audio": "", # Path to reference voice audio (empty = bundled default) "ref_text": "", # Path to reference voice transcript (empty = bundled default) @@ -1016,6 +1020,13 @@ OPTIONAL_ENV_VARS = { "password": True, "category": "tool", }, + "MISTRAL_API_KEY": { + "description": "Mistral API key for Voxtral TTS and transcription (STT)", + "prompt": "Mistral API key", + "url": "https://console.mistral.ai/", + "password": True, + "category": "tool", + }, "GITHUB_TOKEN": { "description": "GitHub token for Skills Hub (higher API rate limits, skill publish)", "prompt": "GitHub Token", diff --git a/hermes_cli/nous_subscription.py b/hermes_cli/nous_subscription.py index fe86ac206..f1e4366c1 100644 --- a/hermes_cli/nous_subscription.py +++ b/hermes_cli/nous_subscription.py @@ -143,6 +143,7 @@ def _tts_label(current_provider: str) -> str: "openai": "OpenAI TTS", "elevenlabs": "ElevenLabs", "edge": "Edge TTS", + "mistral": "Mistral Voxtral TTS", "neutts": "NeuTTS", } return mapping.get(current_provider or "edge", current_provider or "Edge TTS") @@ -309,6 +310,7 @@ def get_nous_subscription_features( tts_current_provider in {"edge", "neutts"} or (tts_current_provider == "openai" and (managed_tts_available or direct_openai_tts)) or (tts_current_provider == "elevenlabs" and direct_elevenlabs) + or (tts_current_provider == "mistral" and bool(get_env_value("MISTRAL_API_KEY"))) ) tts_active = bool(tts_tool_enabled and tts_available) diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 2291758f7..ca877606f 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -557,6 +557,8 @@ def _print_setup_summary(config: dict, hermes_home): tool_status.append(("Text-to-Speech (OpenAI)", True, None)) elif tts_provider == "minimax" and get_env_value("MINIMAX_API_KEY"): tool_status.append(("Text-to-Speech (MiniMax)", True, None)) + elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"): + tool_status.append(("Text-to-Speech (Mistral Voxtral)", 
True, None)) elif tts_provider == "neutts": try: import importlib.util @@ -1044,6 +1046,7 @@ def _setup_tts_provider(config: dict): "elevenlabs": "ElevenLabs", "openai": "OpenAI TTS", "minimax": "MiniMax TTS", + "mistral": "Mistral Voxtral TTS", "neutts": "NeuTTS", } current_label = provider_labels.get(current_provider, current_provider) @@ -1064,10 +1067,11 @@ def _setup_tts_provider(config: dict): "ElevenLabs (premium quality, needs API key)", "OpenAI TTS (good quality, needs API key)", "MiniMax TTS (high quality with voice cloning, needs API key)", + "Mistral Voxtral TTS (multilingual, native Opus, needs API key)", "NeuTTS (local on-device, free, ~300MB model download)", ] ) - providers.extend(["edge", "elevenlabs", "openai", "minimax", "neutts"]) + providers.extend(["edge", "elevenlabs", "openai", "minimax", "mistral", "neutts"]) choices.append(f"Keep current ({current_label})") keep_current_idx = len(choices) - 1 idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) @@ -1145,6 +1149,18 @@ def _setup_tts_provider(config: dict): print_warning("No API key provided. Falling back to Edge TTS.") selected = "edge" + elif selected == "mistral": + existing = get_env_value("MISTRAL_API_KEY") + if not existing: + print() + api_key = prompt("Mistral API key for TTS", password=True) + if api_key: + save_env_value("MISTRAL_API_KEY", api_key) + print_success("Mistral TTS API key saved") + else: + print_warning("No API key provided. 
Falling back to Edge TTS.") + selected = "edge" + # Save the selection if "tts" not in config: config["tts"] = {} diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index d86ffd281..291914876 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -181,6 +181,14 @@ TOOL_CATEGORIES = { ], "tts_provider": "elevenlabs", }, + { + "name": "Mistral (Voxtral TTS)", + "tag": "Multilingual, native Opus, needs MISTRAL_API_KEY", + "env_vars": [ + {"key": "MISTRAL_API_KEY", "prompt": "Mistral API key", "url": "https://console.mistral.ai/"}, + ], + "tts_provider": "mistral", + }, ], }, "web": { diff --git a/scripts/discord-voice-doctor.py b/scripts/discord-voice-doctor.py index 4fd55f9e8..6fc3f7b15 100755 --- a/scripts/discord-voice-doctor.py +++ b/scripts/discord-voice-doctor.py @@ -249,8 +249,12 @@ def check_config(groq_key, eleven_key): if stt_provider == "groq" and not groq_key: warn("STT config says groq but GROQ_API_KEY is missing") + if stt_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"): + warn("STT config says mistral but MISTRAL_API_KEY is missing") if tts_provider == "elevenlabs" and not eleven_key: warn("TTS config says elevenlabs but ELEVENLABS_API_KEY is missing") + if tts_provider == "mistral" and not os.getenv("MISTRAL_API_KEY"): + warn("TTS config says mistral but MISTRAL_API_KEY is missing") except Exception as e: warn("config.yaml", f"parse error: {e}") else: diff --git a/tests/tools/test_tts_mistral.py b/tests/tools/test_tts_mistral.py new file mode 100644 index 000000000..a62afd8db --- /dev/null +++ b/tests/tools/test_tts_mistral.py @@ -0,0 +1,245 @@ +"""Tests for the Mistral (Voxtral) TTS provider in tools/tts_tool.py.""" + +import base64 +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for key in ("MISTRAL_API_KEY", "HERMES_SESSION_PLATFORM"): + monkeypatch.delenv(key, raising=False) + + +@pytest.fixture +def 
mock_mistral_module(): + mock_client = MagicMock() + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_mistral_cls = MagicMock(return_value=mock_client) + fake_module = MagicMock() + fake_module.Mistral = mock_mistral_cls + with patch.dict("sys.modules", {"mistralai": fake_module, "mistralai.client": fake_module}): + yield mock_client + + +class TestGenerateMistralTts: + def test_missing_api_key_raises_value_error(self, tmp_path, mock_mistral_module): + from tools.tts_tool import _generate_mistral_tts + + output_path = str(tmp_path / "test.mp3") + with pytest.raises(ValueError, match="MISTRAL_API_KEY"): + _generate_mistral_tts("Hello", output_path, {}) + + def test_successful_generation(self, tmp_path, mock_mistral_module, monkeypatch): + from tools.tts_tool import _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + audio_content = b"fake-audio-bytes" + mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(audio_content).decode() + ) + + output_path = str(tmp_path / "test.mp3") + result = _generate_mistral_tts("Hello world", output_path, {}) + + assert result == output_path + assert (tmp_path / "test.mp3").read_bytes() == audio_content + mock_mistral_module.audio.speech.complete.assert_called_once() + mock_mistral_module.__exit__.assert_called_once() + call_kwargs = mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["input"] == "Hello world" + assert call_kwargs["response_format"] == "mp3" + + @pytest.mark.parametrize( + "extension, expected_format", + [(".ogg", "opus"), (".wav", "wav"), (".flac", "flac"), (".mp3", "mp3")], + ) + def test_response_format_from_extension( + self, tmp_path, mock_mistral_module, monkeypatch, extension, expected_format + ): + from tools.tts_tool import _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + 
mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"data").decode() + ) + + output_path = str(tmp_path / f"test{extension}") + _generate_mistral_tts("Hi", output_path, {}) + + call_kwargs = mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["response_format"] == expected_format + + def test_voice_id_passed_when_configured( + self, tmp_path, mock_mistral_module, monkeypatch + ): + from tools.tts_tool import _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"data").decode() + ) + + config = {"mistral": {"voice_id": "my-voice-uuid"}} + _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), config) + + call_kwargs = mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["voice_id"] == "my-voice-uuid" + + def test_default_voice_id_when_absent( + self, tmp_path, mock_mistral_module, monkeypatch + ): + from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"data").decode() + ) + + _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {}) + + call_kwargs = mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID + + def test_default_voice_id_when_empty_string( + self, tmp_path, mock_mistral_module, monkeypatch + ): + from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"data").decode() + ) + + config = {"mistral": {"voice_id": ""}} + _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), config) + + call_kwargs = 
mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID + + def test_api_error_sanitized(self, tmp_path, mock_mistral_module, monkeypatch): + from tools.tts_tool import _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + mock_mistral_module.audio.speech.complete.side_effect = RuntimeError( + "secret-key-in-error" + ) + + with pytest.raises(RuntimeError, match="RuntimeError") as exc_info: + _generate_mistral_tts("Hello", str(tmp_path / "test.mp3"), {}) + assert "secret-key-in-error" not in str(exc_info.value) + + def test_default_model_used(self, tmp_path, mock_mistral_module, monkeypatch): + from tools.tts_tool import DEFAULT_MISTRAL_TTS_MODEL, _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"data").decode() + ) + + _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {}) + + call_kwargs = mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["model"] == DEFAULT_MISTRAL_TTS_MODEL + + def test_model_from_config_overrides_default( + self, tmp_path, mock_mistral_module, monkeypatch + ): + from tools.tts_tool import _generate_mistral_tts + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"data").decode() + ) + + config = {"mistral": {"model": "voxtral-large-tts-9999"}} + _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), config) + + call_kwargs = mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["model"] == "voxtral-large-tts-9999" + + +class TestTtsDispatcherMistral: + def test_dispatcher_routes_to_mistral( + self, tmp_path, mock_mistral_module, monkeypatch + ): + import json + + from tools.tts_tool import text_to_speech_tool + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + 
mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"audio").decode() + ) + + output_path = str(tmp_path / "out.mp3") + with patch("tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}): + result = json.loads(text_to_speech_tool("Hello", output_path=output_path)) + + assert result["success"] is True + assert result["provider"] == "mistral" + mock_mistral_module.audio.speech.complete.assert_called_once() + + def test_dispatcher_returns_error_when_sdk_not_installed(self, tmp_path, monkeypatch): + import json + + from tools.tts_tool import text_to_speech_tool + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + with patch( + "tools.tts_tool._import_mistral_client", side_effect=ImportError("no module") + ), patch("tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}): + result = json.loads( + text_to_speech_tool("Hello", output_path=str(tmp_path / "out.mp3")) + ) + + assert result["success"] is False + assert "mistralai" in result["error"] + + +class TestCheckTtsRequirementsMistral: + def test_mistral_sdk_and_key_returns_true(self, mock_mistral_module, monkeypatch): + from tools.tts_tool import check_tts_requirements + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), \ + patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError), \ + patch("tools.tts_tool._import_openai_client", side_effect=ImportError), \ + patch("tools.tts_tool._check_neutts_available", return_value=False): + assert check_tts_requirements() is True + + def test_mistral_key_missing_returns_false(self, mock_mistral_module): + from tools.tts_tool import check_tts_requirements + + with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), \ + patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError), \ + patch("tools.tts_tool._import_openai_client", side_effect=ImportError), \ + 
patch("tools.tts_tool._check_neutts_available", return_value=False): + assert check_tts_requirements() is False + + +class TestMistralTtsOpus: + def test_telegram_produces_ogg_and_voice_compatible( + self, tmp_path, mock_mistral_module, monkeypatch + ): + import json + + from tools.tts_tool import text_to_speech_tool + + monkeypatch.setenv("MISTRAL_API_KEY", "test-key") + monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram") + mock_mistral_module.audio.speech.complete.return_value = MagicMock( + audio_data=base64.b64encode(b"opus-audio").decode() + ) + + with patch("tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}): + result = json.loads(text_to_speech_tool("Hello")) + + assert result["success"] is True + assert result["file_path"].endswith(".ogg") + assert result["voice_compatible"] is True + assert "[[audio_as_voice]]" in result["media_tag"] + call_kwargs = mock_mistral_module.audio.speech.complete.call_args[1] + assert call_kwargs["response_format"] == "opus" diff --git a/tools/tts_tool.py b/tools/tts_tool.py index be8bc11e3..1423e2e78 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -2,11 +2,12 @@ """ Text-to-Speech Tool Module -Supports five TTS providers: +Supports six TTS providers: - Edge TTS (default, free, no API key): Microsoft Edge neural voices - ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY - OpenAI TTS: Good quality, needs OPENAI_API_KEY - MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY +- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY - NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed Output formats: @@ -23,6 +24,7 @@ Usage: """ import asyncio +import base64 import datetime import json import logging @@ -62,6 +64,11 @@ def _import_openai_client(): from openai import OpenAI as OpenAIClient return OpenAIClient +def _import_mistral_client(): + """Lazy import Mistral client. 
# ===========================================================================
# Provider: Mistral (Voxtral TTS)
# ===========================================================================
def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate speech with the Mistral Voxtral TTS API and save it to disk.

    The API returns base64-encoded audio; this function decodes it and
    writes the raw bytes to *output_path*. The requested audio format is
    derived from the file extension, so an ``.ogg`` path yields native Opus
    output (for Telegram voice bubbles).

    Args:
        text: Text to synthesize.
        output_path: Destination file. ``.ogg`` requests Opus, ``.wav`` and
            ``.flac`` request those formats, anything else requests MP3.
            The extension is matched case-insensitively.
        tts_config: TTS configuration; its optional ``"mistral"`` section
            may provide ``model`` and ``voice_id``. Missing, null or empty
            values fall back to the module defaults.

    Returns:
        ``output_path``, after the audio bytes have been written.

    Raises:
        ValueError: If ``MISTRAL_API_KEY`` is not set.
        ImportError: If the ``mistralai`` SDK cannot be imported.
        RuntimeError: If the API call or the base64 decoding fails. The
            message carries only the original exception's class name; the
            full error is logged and chained as ``__cause__``.
    """
    api_key = os.getenv("MISTRAL_API_KEY", "")
    if not api_key:
        raise ValueError("MISTRAL_API_KEY not set. Get one at https://console.mistral.ai/")

    # Tolerate a missing *or null* "mistral" section (assumed possible when
    # the key is left empty in the YAML config); `.get(key, {})` alone would
    # hand back None and crash on the lookups below.
    mi_config = tts_config.get("mistral") or {}
    # Empty/None values fall back to the defaults for both settings.
    model = mi_config.get("model") or DEFAULT_MISTRAL_TTS_MODEL
    voice_id = mi_config.get("voice_id") or DEFAULT_MISTRAL_TTS_VOICE_ID

    # Pick the API response format from the extension, ignoring case so
    # that e.g. "clip.OGG" is not silently filled with MP3 data.
    lowered_path = output_path.lower()
    response_format = next(
        (
            fmt
            for suffix, fmt in ((".ogg", "opus"), (".wav", "wav"), (".flac", "flac"))
            if lowered_path.endswith(suffix)
        ),
        "mp3",
    )

    # Resolved outside the try block so a missing SDK surfaces as ImportError
    # instead of being rewrapped as a generic RuntimeError.
    Mistral = _import_mistral_client()
    try:
        with Mistral(api_key=api_key) as client:
            response = client.audio.speech.complete(
                model=model,
                input=text,
                voice_id=voice_id,
                response_format=response_format,
            )
            audio_bytes = base64.b64decode(response.audio_data)
    except Exception as e:
        # No ValueError pass-through here: binascii.Error (bad base64) and
        # other ValueError subclasses raised by the SDK would otherwise reach
        # the caller with their raw message, bypassing the sanitized error.
        logger.error("Mistral TTS failed: %s", e, exc_info=True)
        raise RuntimeError(f"Mistral TTS failed: {type(e).__name__}") from e

    with open(output_path, "wb") as f:
        f.write(audio_bytes)

    return output_path
- if want_opus and provider in ("openai", "elevenlabs"): + if want_opus and provider in ("openai", "elevenlabs", "mistral"): file_path = out_dir / f"tts_{timestamp}.ogg" else: file_path = out_dir / f"tts_{timestamp}.mp3" @@ -530,6 +588,18 @@ def text_to_speech_tool( logger.info("Generating speech with MiniMax TTS...") _generate_minimax_tts(text, file_str, tts_config) + elif provider == "mistral": + try: + _import_mistral_client() + except ImportError: + return json.dumps({ + "success": False, + "error": "Mistral provider selected but 'mistralai' package not installed. " + "Run: pip install 'hermes-agent[mistral]'" + }, ensure_ascii=False) + logger.info("Generating speech with Mistral Voxtral TTS...") + _generate_mistral_tts(text, file_str, tts_config) + elif provider == "neutts": if not _check_neutts_available(): return json.dumps({ @@ -584,8 +654,7 @@ def text_to_speech_tool( if opus_path: file_str = opus_path voice_compatible = True - elif provider in ("elevenlabs", "openai"): - # These providers can output Opus natively if the path ends in .ogg + elif provider in ("elevenlabs", "openai", "mistral"): voice_compatible = file_str.endswith(".ogg") file_size = os.path.getsize(file_str) @@ -653,6 +722,12 @@ def check_tts_requirements() -> bool: pass if os.getenv("MINIMAX_API_KEY"): return True + try: + _import_mistral_client() + if os.getenv("MISTRAL_API_KEY"): + return True + except ImportError: + pass if _check_neutts_available(): return True return False diff --git a/website/docs/guides/use-voice-mode-with-hermes.md b/website/docs/guides/use-voice-mode-with-hermes.md index 8aca66bc1..42b335559 100644 --- a/website/docs/guides/use-voice-mode-with-hermes.md +++ b/website/docs/guides/use-voice-mode-with-hermes.md @@ -145,6 +145,7 @@ ELEVENLABS_API_KEY=*** - `neutts` → free local/on-device TTS - `elevenlabs` → best quality - `openai` → good middle ground +- `mistral` → multilingual, native Opus ### If you use `hermes setup` diff --git 
a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md index 133990b44..83ccda05d 100644 --- a/website/docs/integrations/providers.md +++ b/website/docs/integrations/providers.md @@ -864,6 +864,7 @@ You can switch between providers at any time with `hermes model` — no restart | Image generation | [FAL](https://fal.ai/) | `FAL_KEY` | | Premium TTS voices | [ElevenLabs](https://elevenlabs.io/) | `ELEVENLABS_API_KEY` | | OpenAI TTS + voice transcription | [OpenAI](https://platform.openai.com/api-keys) | `VOICE_TOOLS_OPENAI_KEY` | +| Mistral TTS + voice transcription | [Mistral](https://console.mistral.ai/) | `MISTRAL_API_KEY` | | RL Training | [Tinker](https://tinker-console.thinkingmachines.ai/) + [WandB](https://wandb.ai/) | `TINKER_API_KEY`, `WANDB_API_KEY` | | Cross-session user modeling | [Honcho](https://honcho.dev/) | `HONCHO_API_KEY` | | Semantic long-term memory | [Supermemory](https://supermemory.ai) | `SUPERMEMORY_API_KEY` | diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 0cd4ed699..656a41fd8 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription ## Text-to-Speech -Convert text to speech with five providers: +Convert text to speech with six providers: | Provider | Quality | Cost | API Key | |----------|---------|------|---------| @@ -18,6 +18,7 @@ Convert text to speech with five providers: | **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` | | **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` | | **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` | +| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` | | **NeuTTS** | Good | Free | None needed | ### Platform Delivery @@ -34,7 +35,7 @@ Convert text to speech with five providers: ```yaml # In ~/.hermes/config.yaml tts: - provider: "edge" # "edge" | 
"elevenlabs" | "openai" | "minimax" | "neutts" + provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts" edge: voice: "en-US-AriaNeural" # 322 voices, 74 languages elevenlabs: @@ -50,6 +51,9 @@ tts: speed: 1 # 0.5 - 2.0 vol: 1 # 0 - 10 pitch: 0 # -12 - 12 + mistral: + model: "voxtral-mini-tts-2603" + voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default) neutts: ref_audio: '' ref_text: '' @@ -61,7 +65,7 @@ tts: Telegram voice bubbles require Opus/OGG audio format: -- **OpenAI and ElevenLabs** produce Opus natively — no extra setup +- **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert: - **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles @@ -80,7 +84,7 @@ sudo dnf install ffmpeg Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). :::tip -If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider. +If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider. ::: ## Voice Message Transcription (STT)