hermes-agent/tests/tools/test_tts_mistral.py
Teknium 8d302e37a8
feat(tts): add Piper as a native local TTS provider (closes #8508) (#17885)
Piper (OHF-Voice/piper1-gpl) is a fast, local neural TTS engine from the
Home Assistant project that supports 44 languages with zero API keys.
Adds it as a native built-in provider alongside edge/neutts/kittentts,
installable via 'hermes tools' with one keystroke.

What ships:

- New 'piper' built-in provider in tools/tts_tool.py
  - Lazy import via _import_piper()
  - Module-level voice cache keyed on (model_path, use_cuda) so switching
    voices doesn't invalidate older cached voices
  - _resolve_piper_voice_path() accepts either an absolute .onnx path or a
    voice name (auto-downloaded on first use via 'python -m
    piper.download_voices --download-dir <cache>')
  - Voice cache at ~/.hermes/cache/piper-voices/ (profile-aware via
    get_hermes_dir)
  - Optional SynthesisConfig knobs: length_scale, noise_scale,
    noise_w_scale, volume, normalize_audio, use_cuda — passed through
    only when configured, so older piper-tts versions aren't broken
  - WAV output then ffmpeg conversion path (same as neutts/kittentts) so
    Telegram voice bubbles work when ffmpeg is present
  - Piper added to BUILTIN_TTS_PROVIDERS so a user's
    tts.providers.piper.command cannot shadow the native provider
    (regression test included)

- 'hermes tools' wizard entry
  - Piper appears under Voice and TTS as a free, local option, with
    'pip install piper-tts' auto-installed via the post_setup handler
  - Prints voice-catalog URL and default-voice info after install

- config.yaml defaults
  - tts.piper.voice defaults to en_US-lessac-medium
  - Commented advanced knobs for discoverability

- Docs
  - New 'Piper (local, 44 languages)' section in features/tts.md
    explaining install path, voice switching, pre-downloaded voices,
    and advanced knobs
  - Piper listed in the ten-provider table and ffmpeg table
  - Custom-command-providers section updated to drop the Piper example
    (now native) and add a piper-custom example for users with their own
    trained .onnx models
  - overview.md bumps provider count to ten

- Tests (tests/tools/test_tts_piper.py, 16 tests)
  - Registration (BUILTIN_TTS_PROVIDERS, PROVIDER_MAX_TEXT_LENGTH)
  - _resolve_piper_voice_path across every branch: direct .onnx path,
    cached voice name, fresh download with correct CLI args, download
    failure, successful-exit-but-missing-files, empty voice falling back
    to the default
  - _generate_piper_tts: loads voice once, reuses cache, voice-name
    download wiring, advanced knobs flow through SynthesisConfig
  - text_to_speech_tool end-to-end dispatch and missing-package error
  - check_tts_requirements: piper availability toggles the return value
  - Regression guard: piper cannot be shadowed by a command provider
    with the same name
  - Pre-existing test_tts_mistral test broadened to mock the new
    piper/kittentts/command-provider checks (otherwise it false-passes
    when piper is installed in the test venv)

E2E verification (live):

Actual pip install piper-tts, config piper + en_US-lessac-low,
text_to_speech_tool call, voice auto-downloaded from HuggingFace,
WAV synthesized, ffmpeg-converted to Ogg/Opus. Second call hits the
cache (~60ms). Cache dir populated with .onnx and .onnx.json.

This caught a real bug during development: the first pass used '-d' as
the download-dir flag; the actual piper.download_voices CLI wants
'--download-dir'. Fixed before PR opened.
2026-04-30 02:53:20 -07:00

223 lines
9.2 KiB
Python

"""Tests for the Mistral (Voxtral) TTS provider in tools/tts_tool.py."""
import base64
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove the Mistral key and session-platform variables before every test.

    ``raising=False`` makes the removal a no-op when a variable is not set.
    """
    monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)
@pytest.fixture
def mock_mistral_module():
    """Install a fake ``mistralai`` package and yield the client mock it produces.

    The same stub module is registered under both ``mistralai`` and
    ``mistralai.client`` in ``sys.modules``. Its ``Mistral`` attribute is a
    factory returning one shared client mock, which also acts as its own
    context manager (``__enter__`` returns the client itself and ``__exit__``
    returns ``False``). The ``sys.modules`` entries are restored when the
    fixture finishes.
    """
    client = MagicMock()
    client.__enter__ = MagicMock(return_value=client)
    client.__exit__ = MagicMock(return_value=False)

    stub_package = MagicMock()
    stub_package.Mistral = MagicMock(return_value=client)

    patched_modules = {"mistralai": stub_package, "mistralai.client": stub_package}
    with patch.dict("sys.modules", patched_modules):
        yield client
class TestGenerateMistralTts:
    """Exercise ``_generate_mistral_tts`` against the mocked Mistral client."""

    @staticmethod
    def _queue_audio(client, payload=b"data"):
        """Make ``audio.speech.complete`` answer with *payload*, base64-encoded."""
        encoded = base64.b64encode(payload).decode()
        client.audio.speech.complete.return_value = MagicMock(audio_data=encoded)

    @staticmethod
    def _request_kwargs(client):
        """Return the keyword arguments of the latest ``audio.speech.complete`` call."""
        return client.audio.speech.complete.call_args[1]

    def test_missing_api_key_raises_value_error(self, tmp_path, mock_mistral_module):
        """Without ``MISTRAL_API_KEY`` in the environment a ``ValueError`` is raised."""
        from tools.tts_tool import _generate_mistral_tts

        target = str(tmp_path / "test.mp3")
        with pytest.raises(ValueError, match="MISTRAL_API_KEY"):
            _generate_mistral_tts("Hello", target, {})

    def test_successful_generation(self, tmp_path, mock_mistral_module, monkeypatch):
        """The decoded audio is written to the output path, which is returned."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        expected_bytes = b"fake-audio-bytes"
        self._queue_audio(mock_mistral_module, expected_bytes)
        destination = tmp_path / "test.mp3"

        returned = _generate_mistral_tts("Hello world", str(destination), {})

        assert returned == str(destination)
        assert destination.read_bytes() == expected_bytes
        mock_mistral_module.audio.speech.complete.assert_called_once()
        # The client is used as a context manager and must be exited exactly once.
        mock_mistral_module.__exit__.assert_called_once()
        sent = self._request_kwargs(mock_mistral_module)
        assert sent["input"] == "Hello world"
        assert sent["response_format"] == "mp3"

    @pytest.mark.parametrize(
        "extension, expected_format",
        [(".ogg", "opus"), (".wav", "wav"), (".flac", "flac"), (".mp3", "mp3")],
    )
    def test_response_format_from_extension(
        self, tmp_path, mock_mistral_module, monkeypatch, extension, expected_format
    ):
        """The requested ``response_format`` follows the output file extension."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._queue_audio(mock_mistral_module)
        target = str(tmp_path / f"test{extension}")

        _generate_mistral_tts("Hi", target, {})

        sent = self._request_kwargs(mock_mistral_module)
        assert sent["response_format"] == expected_format

    def test_voice_id_passed_when_configured(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured ``mistral.voice_id`` is forwarded to the API call."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._queue_audio(mock_mistral_module)
        tts_config = {"mistral": {"voice_id": "my-voice-uuid"}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_config)

        sent = self._request_kwargs(mock_mistral_module)
        assert sent["voice_id"] == "my-voice-uuid"

    def test_default_voice_id_when_absent(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """With no voice configured, the default voice id is sent."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._queue_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        sent = self._request_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_default_voice_id_when_empty_string(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """An empty-string voice id is treated like a missing one."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_VOICE_ID, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._queue_audio(mock_mistral_module)
        tts_config = {"mistral": {"voice_id": ""}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_config)

        sent = self._request_kwargs(mock_mistral_module)
        assert sent["voice_id"] == DEFAULT_MISTRAL_TTS_VOICE_ID

    def test_api_error_sanitized(self, tmp_path, mock_mistral_module, monkeypatch):
        """The re-raised error names the exception type but not its original message."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        failure = RuntimeError("secret-key-in-error")
        mock_mistral_module.audio.speech.complete.side_effect = failure

        with pytest.raises(RuntimeError, match="RuntimeError") as raised:
            _generate_mistral_tts("Hello", str(tmp_path / "test.mp3"), {})

        assert "secret-key-in-error" not in str(raised.value)

    def test_default_model_used(self, tmp_path, mock_mistral_module, monkeypatch):
        """With no model configured, the default model is sent."""
        from tools.tts_tool import DEFAULT_MISTRAL_TTS_MODEL, _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._queue_audio(mock_mistral_module)

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), {})

        sent = self._request_kwargs(mock_mistral_module)
        assert sent["model"] == DEFAULT_MISTRAL_TTS_MODEL

    def test_model_from_config_overrides_default(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A configured ``mistral.model`` replaces the default model."""
        from tools.tts_tool import _generate_mistral_tts

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        self._queue_audio(mock_mistral_module)
        tts_config = {"mistral": {"model": "voxtral-large-tts-9999"}}

        _generate_mistral_tts("Hi", str(tmp_path / "test.mp3"), tts_config)

        sent = self._request_kwargs(mock_mistral_module)
        assert sent["model"] == "voxtral-large-tts-9999"
class TestTtsDispatcherMistral:
    """Check ``text_to_speech_tool`` when the configured provider is ``mistral``."""

    def test_dispatcher_routes_to_mistral(
        self, tmp_path, mock_mistral_module, monkeypatch
    ):
        """A ``mistral`` provider config results in exactly one Mistral speech call."""
        import json
        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        speech_call = mock_mistral_module.audio.speech.complete
        speech_call.return_value = MagicMock(
            audio_data=base64.b64encode(b"audio").decode()
        )
        destination = str(tmp_path / "out.mp3")
        mistral_config = patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
        )

        with mistral_config:
            raw_reply = text_to_speech_tool("Hello", output_path=destination)
            reply = json.loads(raw_reply)

        assert reply["success"] is True
        assert reply["provider"] == "mistral"
        speech_call.assert_called_once()

    def test_dispatcher_returns_error_when_sdk_not_installed(self, tmp_path, monkeypatch):
        """An ``ImportError`` from the SDK loader yields a failure reply naming ``mistralai``."""
        import json
        from tools.tts_tool import text_to_speech_tool

        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        missing_sdk = patch(
            "tools.tts_tool._import_mistral_client", side_effect=ImportError("no module")
        )
        mistral_config = patch(
            "tools.tts_tool._load_tts_config", return_value={"provider": "mistral"}
        )

        with missing_sdk, mistral_config:
            raw_reply = text_to_speech_tool(
                "Hello", output_path=str(tmp_path / "out.mp3")
            )
            reply = json.loads(raw_reply)

        assert reply["success"] is False
        assert "mistralai" in reply["error"]
class TestCheckTtsRequirementsMistral:
    """Verify ``check_tts_requirements`` when Mistral is the only candidate provider.

    Both tests disable the same set of non-Mistral provider probes, so the
    result depends only on the mocked Mistral SDK and ``MISTRAL_API_KEY``.
    Without that, the positive test could pass merely because another
    provider (for example piper or kittentts) is installed in the test venv.
    """

    # Import helpers patched to raise ImportError (SDK "not installed").
    _IMPORT_PROBES = (
        "tools.tts_tool._import_edge_tts",
        "tools.tts_tool._import_elevenlabs",
        "tools.tts_tool._import_openai_client",
    )
    # Availability checks patched to return False.
    _AVAILABILITY_PROBES = (
        "tools.tts_tool._check_neutts_available",
        "tools.tts_tool._check_kittentts_available",
        "tools.tts_tool._check_piper_available",
        "tools.tts_tool._has_any_command_tts_provider",
    )

    def _check_with_only_mistral_possible(self):
        """Call ``check_tts_requirements`` with every non-Mistral probe disabled.

        Returns:
            Whatever ``check_tts_requirements()`` returns while the patches
            are active. All patches are undone before this method returns.
        """
        from contextlib import ExitStack

        from tools.tts_tool import check_tts_requirements

        # ExitStack unwinds every patch already entered, even if a later one fails.
        with ExitStack() as stack:
            for target in self._IMPORT_PROBES:
                stack.enter_context(patch(target, side_effect=ImportError))
            for target in self._AVAILABILITY_PROBES:
                stack.enter_context(patch(target, return_value=False))
            return check_tts_requirements()

    def test_mistral_sdk_and_key_returns_true(self, mock_mistral_module, monkeypatch):
        """With the SDK mocked and the API key set, requirements are satisfied."""
        monkeypatch.setenv("MISTRAL_API_KEY", "test-key")
        assert self._check_with_only_mistral_possible() is True

    def test_mistral_key_missing_returns_false(self, mock_mistral_module):
        """With the SDK mocked but no API key, no provider is usable."""
        assert self._check_with_only_mistral_possible() is False