hermes-agent/tests/tools/test_tts_piper.py
Cdddo 160bb565b4 feat(tts): expose speaker_id on built-in Piper provider
The built-in Piper provider (tts.provider: piper, Python piper-tts
package) already constructs piper.SynthesisConfig for the advanced
tuning knobs, but did not forward speaker_id from the user config.

This wires tts.piper.speaker_id through to SynthesisConfig.speaker_id
so multi-speaker ONNX models (e.g. libritts_r) can be addressed via
config without dropping to the command-provider path.

Changes:
- Add speaker_id to the has_advanced tuple so setting it triggers
  SynthesisConfig construction (same gating as the other knobs).
- Pass speaker_id=speaker_id to SynthesisConfig. Defaults to 0
  (Piper's own default; single-speaker models ignore the field).
- Tolerant parse: bad input (non-int strings, lists, dicts) is
  dropped to 0 instead of raising. Booleans are rejected outright
  (True/False would silently coerce to 1/0 and hide a config
  mistake). Mirrors the shape of the command-provider's
  _resolve_command_tts_optional_number helper.

speaker_id is applied per-call via syn_config.speaker_id, so the
PiperVoice cache key is intentionally left as just (model, cuda) --
the same loaded model serves all speakers. Tests cover the
config knob, the tolerant parse, and the no-reload invariant.

sentence_silence is intentionally not added here: the Python
piper-tts SynthesisConfig does not expose that field (CLI-only).
2026-06-19 07:04:58 -07:00

396 lines
17 KiB
Python

"""
Tests for the native Piper TTS provider.
These tests pin the resolution / caching / dispatch paths for Piper
without requiring the ``piper-tts`` package to actually be installed
(the synthesis step is monkey-patched to avoid needing the ONNX wheel).
"""
import json
import sys
import types
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from tools import tts_tool
from tools.tts_tool import (
BUILTIN_TTS_PROVIDERS,
DEFAULT_PIPER_VOICE,
PROVIDER_MAX_TEXT_LENGTH,
_check_piper_available,
_resolve_piper_voice_path,
check_tts_requirements,
text_to_speech_tool,
)
# ---------------------------------------------------------------------------
# Registry / constants
# ---------------------------------------------------------------------------
class TestPiperRegistration:
    """Piper must appear in the module-level provider tables."""

    def test_piper_is_a_builtin_provider(self):
        """The built-in provider registry lists ``piper``."""
        registered = BUILTIN_TTS_PROVIDERS
        assert "piper" in registered

    def test_piper_has_a_text_length_cap(self):
        """A strictly positive text-length cap is configured for ``piper``."""
        piper_cap = PROVIDER_MAX_TEXT_LENGTH.get("piper", 0)
        assert piper_cap > 0
# ---------------------------------------------------------------------------
# _check_piper_available
# ---------------------------------------------------------------------------
class TestCheckPiperAvailable:
    """Smoke test for the Piper availability probe."""

    def test_returns_bool_without_raising(self):
        """The probe yields a plain ``bool`` and does not raise."""
        # The answer for this particular environment is irrelevant; what
        # matters is that probing a machine without piper never blows up.
        available = _check_piper_available()
        assert isinstance(available, bool)
# ---------------------------------------------------------------------------
# _resolve_piper_voice_path
# ---------------------------------------------------------------------------
class TestResolvePiperVoicePath:
    """Voice resolution: direct model paths, cached voices and downloads."""

    def test_direct_onnx_path_returned_as_is(self, tmp_path):
        """An existing ``.onnx`` file path is handed back unchanged."""
        onnx_file = tmp_path / "custom.onnx"
        onnx_file.write_bytes(b"fake onnx bytes")

        resolved = _resolve_piper_voice_path(str(onnx_file), tmp_path)

        assert resolved == str(onnx_file)

    def test_cached_voice_name_not_redownloaded(self, tmp_path):
        """When <voice>.onnx and <voice>.onnx.json are already present in
        the download dir, resolution must not spawn a subprocess."""
        voice = "en_US-test-medium"
        onnx_file = tmp_path / f"{voice}.onnx"
        onnx_file.write_bytes(b"model")
        (tmp_path / f"{voice}.onnx.json").write_text("{}")

        with patch("tools.tts_tool.subprocess.run") as run_spy:
            resolved = _resolve_piper_voice_path(voice, tmp_path)

        run_spy.assert_not_called()
        assert resolved == str(onnx_file)

    def test_missing_voice_triggers_download(self, tmp_path):
        """A voice name with no files on disk goes through the downloader."""
        voice = "en_US-new-medium"
        onnx_file = tmp_path / f"{voice}.onnx"
        sidecar = tmp_path / f"{voice}.onnx.json"

        def simulate_download(cmd, *_args, **_kwargs):
            # Pretend the download worked by creating the expected files.
            onnx_file.write_bytes(b"model")
            sidecar.write_text("{}")
            return MagicMock(returncode=0, stderr="", stdout="")

        with patch("tools.tts_tool.subprocess.run", side_effect=simulate_download) as run_spy:
            resolved = _resolve_piper_voice_path(voice, tmp_path)

        run_spy.assert_called_once()
        # Expected command shape:
        #   python -m piper.download_voices <voice> --download-dir <dir>
        argv = run_spy.call_args.args[0]
        assert "piper.download_voices" in " ".join(argv)
        for expected_token in (voice, "--download-dir", str(tmp_path)):
            assert expected_token in argv
        assert resolved == str(onnx_file)

    def test_download_failure_raises_runtime(self, tmp_path):
        """A non-zero downloader exit code surfaces as ``RuntimeError``."""
        failed = MagicMock(returncode=1, stderr="voice not found", stdout="")
        run_patch = patch("tools.tts_tool.subprocess.run", return_value=failed)
        expect_error = pytest.raises(RuntimeError, match="Piper voice download failed")

        with run_patch, expect_error:
            _resolve_piper_voice_path("en_US-broken-medium", tmp_path)

    def test_download_success_but_missing_file_raises(self, tmp_path):
        """Exit code 0 with no model file written is still an error."""
        # The subprocess reports success but never creates the files.
        silent_ok = MagicMock(returncode=0, stderr="", stdout="")
        run_patch = patch("tools.tts_tool.subprocess.run", return_value=silent_ok)
        expect_error = pytest.raises(RuntimeError, match="completed but .+ is missing")

        with run_patch, expect_error:
            _resolve_piper_voice_path("en_US-weird-medium", tmp_path)

    def test_empty_voice_falls_back_to_default_name(self, tmp_path):
        """An empty voice string resolves to the ``DEFAULT_PIPER_VOICE`` model."""
        default_model_name = f"{DEFAULT_PIPER_VOICE}.onnx"
        (tmp_path / default_model_name).write_bytes(b"model")
        (tmp_path / f"{DEFAULT_PIPER_VOICE}.onnx.json").write_text("{}")

        resolved = _resolve_piper_voice_path("", tmp_path)

        assert resolved.endswith(default_model_name)
# ---------------------------------------------------------------------------
# _generate_piper_tts — stubbed so we don't need piper-tts installed
# ---------------------------------------------------------------------------
class _StubPiperVoice:
    """Test double for ``piper.PiperVoice`` used by the synthesis tests.

    Every ``load()`` and ``synthesize_wav()`` call is recorded on the class
    so tests can inspect which models were loaded and what was synthesized.
    """

    # Model paths passed to load(), in call order.
    loaded: list[str] = []
    # One (text, model_path, syn_config) tuple per synthesize_wav() call.
    calls: list[tuple] = []

    @classmethod
    def load(cls, model_path, use_cuda=False):
        """Record *model_path* and return a new stub bound to it."""
        cls.loaded.append(model_path)
        voice = cls()
        voice.model_path, voice.use_cuda = model_path, use_cuda
        return voice

    def synthesize_wav(self, text, wav_file, syn_config=None):
        """Write 1024 silent mono 16-bit frames to *wav_file* and log the call."""
        # The audio content is irrelevant to the tests: the output file only
        # has to exist and be non-empty once the wave writer is closed.
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(22050)
        silent_frames = bytes(2 * 1024)
        wav_file.writeframes(silent_frames)

        bound_model = getattr(self, "model_path", "")
        _StubPiperVoice.calls.append((text, bound_model, syn_config))
@pytest.fixture(autouse=True)
def _reset_piper_cache():
    """Give every test an empty voice cache and fresh stub call logs.

    The module-level ``tts_tool._piper_voice_cache`` is emptied before the
    test runs and again afterwards; the ``_StubPiperVoice`` recorders are
    rebound to new empty lists before the test runs.
    """
    tts_tool._piper_voice_cache.clear()
    _StubPiperVoice.loaded, _StubPiperVoice.calls = [], []

    yield

    tts_tool._piper_voice_cache.clear()
class TestGeneratePiperTts:
    """Tests for ``tts_tool._generate_piper_tts`` driven by ``_StubPiperVoice``.

    ``_import_piper`` is monkey-patched to return the stub. Every test that
    sets a synthesis knob also installs a fake ``piper`` module in
    ``sys.modules`` so the inline ``from piper import SynthesisConfig``
    resolves without the real ``piper-tts`` package being installed.
    """

    def _prepare_voice_files(self, tmp_path, voice=DEFAULT_PIPER_VOICE):
        """Create ``<voice>.onnx`` and ``<voice>.onnx.json`` in *tmp_path*.

        Returns the path of the ``.onnx`` file.
        """
        model = tmp_path / f"{voice}.onnx"
        model.write_bytes(b"model")
        (tmp_path / f"{voice}.onnx.json").write_text("{}")
        return model

    def _install_fake_synconfig(self, monkeypatch):
        """Put a fake ``piper`` module in ``sys.modules``.

        Returns the ``MagicMock`` standing in for ``piper.SynthesisConfig``
        so callers can assert on the keyword arguments it was built with.
        """
        fake_syn_cls = MagicMock()
        monkeypatch.setitem(
            sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls),
        )
        return fake_syn_cls

    def test_loads_voice_and_writes_wav(self, tmp_path, monkeypatch):
        """A direct model path is loaded once and a non-empty WAV is written."""
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        out_path = str(tmp_path / "out.wav")
        config = {"piper": {"voice": str(model)}}

        result = tts_tool._generate_piper_tts("hello", out_path, config)

        assert result == out_path
        assert Path(out_path).exists()
        assert Path(out_path).stat().st_size > 0
        assert _StubPiperVoice.loaded == [str(model)]
        assert _StubPiperVoice.calls[0][0] == "hello"

    def test_voice_cache_reused_across_calls(self, tmp_path, monkeypatch):
        """Two syntheses with the same model trigger a single ``load()``."""
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        config = {"piper": {"voice": str(model)}}

        tts_tool._generate_piper_tts("one", str(tmp_path / "a.wav"), config)
        tts_tool._generate_piper_tts("two", str(tmp_path / "b.wav"), config)

        # load() should have been called exactly once for the same model+cuda key.
        assert _StubPiperVoice.loaded == [str(model)]
        # But both synthesize calls went through.
        assert [c[0] for c in _StubPiperVoice.calls] == ["one", "two"]

    def test_voice_name_triggers_download(self, tmp_path, monkeypatch):
        """A config voice of ``en_US-lessac-medium`` should be resolved via
        _resolve_piper_voice_path (which would normally download)."""
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

        def fake_resolve(voice, download_dir):
            model = download_dir / f"{voice}.onnx"
            model.write_bytes(b"model")
            return str(model)

        monkeypatch.setattr(tts_tool, "_resolve_piper_voice_path", fake_resolve)
        config = {"piper": {"voice": "en_US-lessac-medium", "voices_dir": str(tmp_path)}}

        result = tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config)

        assert Path(result).exists()
        assert _StubPiperVoice.loaded[0].endswith("en_US-lessac-medium.onnx")

    def test_advanced_knobs_passed_as_synconfig(self, tmp_path, monkeypatch):
        """``length_scale`` / ``volume`` reach ``SynthesisConfig`` as kwargs."""
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        # The SynthesisConfig import happens inline inside _generate_piper_tts
        # via ``from piper import SynthesisConfig``. Inject a fake piper
        # module so that import resolves and we can inspect the knobs.
        fake_syn_cls = self._install_fake_synconfig(monkeypatch)
        config = {
            "piper": {
                "voice": str(model),
                "length_scale": 2.0,
                "volume": 0.8,
            },
        }

        tts_tool._generate_piper_tts(
            "slow voice", str(tmp_path / "out.wav"), config,
        )

        # SynthesisConfig was constructed with the advanced knobs.
        fake_syn_cls.assert_called_once()
        kwargs = fake_syn_cls.call_args.kwargs
        assert kwargs["length_scale"] == 2.0
        assert kwargs["volume"] == 0.8

    def test_speaker_id_passed_through_to_synconfig(self, tmp_path, monkeypatch):
        """speaker_id flows from config to SynthesisConfig when set."""
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        fake_syn_cls = self._install_fake_synconfig(monkeypatch)
        config = {"piper": {"voice": str(model), "speaker_id": 2}}

        tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config)

        fake_syn_cls.assert_called_once()
        assert fake_syn_cls.call_args.kwargs["speaker_id"] == 2

    def test_speaker_id_alone_triggers_synconfig(self, tmp_path, monkeypatch):
        """Setting ONLY speaker_id (no other advanced knobs) still constructs SynthesisConfig.

        Regression guard: has_advanced must include speaker_id, otherwise
        this knob gets silently dropped on the simplest configuration.
        """
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        fake_syn_cls = self._install_fake_synconfig(monkeypatch)
        config = {"piper": {"voice": str(model), "speaker_id": 1}}

        tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config)

        fake_syn_cls.assert_called_once()

    def test_speaker_id_default_zero_when_unset(self, tmp_path, monkeypatch):
        """No speaker_id in config → SynthesisConfig.speaker_id == 0 (Piper's default)."""
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        fake_syn_cls = self._install_fake_synconfig(monkeypatch)
        config = {"piper": {"voice": str(model), "length_scale": 1.5}}

        tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config)

        # Check the call happened first so a missing call fails with a clear
        # message instead of an attribute error on ``call_args`` being None.
        fake_syn_cls.assert_called_once()
        assert fake_syn_cls.call_args.kwargs["speaker_id"] == 0

    @pytest.mark.parametrize("bad", [True, False], ids=["true", "false"])
    def test_speaker_id_bool_rejected_to_zero(self, tmp_path, monkeypatch, bad):
        """True/False would coerce to 1/0 and hide a config mistake — reject outright."""
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        fake_syn_cls = self._install_fake_synconfig(monkeypatch)
        config = {"piper": {"voice": str(model), "speaker_id": bad}}

        tts_tool._generate_piper_tts("hi", str(tmp_path / f"out-{bad}.wav"), config)

        fake_syn_cls.assert_called_once()
        assert fake_syn_cls.call_args.kwargs["speaker_id"] == 0

    @pytest.mark.parametrize(
        "bad",
        ["two", [1, 2], {"k": 1}, None],
        ids=["str", "list", "dict", "none"],
    )
    def test_speaker_id_non_int_dropped_to_zero(self, tmp_path, monkeypatch, bad):
        """Unparseable config (string, list, dict, None) drops to 0 instead of raising."""
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        fake_syn_cls = self._install_fake_synconfig(monkeypatch)
        config = {"piper": {"voice": str(model), "speaker_id": bad}}

        tts_tool._generate_piper_tts(
            "hi", str(tmp_path / f"out-{type(bad).__name__}.wav"), config,
        )

        fake_syn_cls.assert_called_once()
        assert fake_syn_cls.call_args.kwargs["speaker_id"] == 0

    def test_speaker_id_does_not_invalidate_voice_cache(self, tmp_path, monkeypatch):
        """Switching speaker_id between calls must NOT trigger a model reload.

        PiperVoice is bound to a model, not a speaker — speaker is applied
        per-call via syn_config.speaker_id. The voice cache should serve the
        same PiperVoice instance for the same (model, cuda) regardless of
        how many distinct speaker_ids the user cycles through.
        """
        model = self._prepare_voice_files(tmp_path)
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        # speaker_id is a synthesis knob, so install the fake piper module
        # just like the other speaker_id tests; without it this test would
        # depend on the real piper-tts package being importable.
        self._install_fake_synconfig(monkeypatch)
        speakers = (0, 1, 2, 3)

        for speaker in speakers:
            config = {"piper": {"voice": str(model), "speaker_id": speaker}}
            tts_tool._generate_piper_tts("hi", str(tmp_path / f"out-{speaker}.wav"), config)

        # Only one PiperVoice.load() call across four calls with different speakers.
        assert _StubPiperVoice.loaded == [str(model)]
        # ...and every one of those calls actually reached the synthesizer.
        assert len(_StubPiperVoice.calls) == len(speakers)
# ---------------------------------------------------------------------------
# text_to_speech_tool end-to-end (provider == "piper")
# ---------------------------------------------------------------------------
class TestTextToSpeechToolWithPiper:
    """Drive the public ``text_to_speech_tool`` with ``provider: piper``."""

    def test_dispatches_to_piper(self, tmp_path, monkeypatch):
        """With a usable voice on disk the tool reports success via piper."""
        onnx_file = tmp_path / f"{DEFAULT_PIPER_VOICE}.onnx"
        onnx_file.write_bytes(b"model")
        (tmp_path / f"{DEFAULT_PIPER_VOICE}.onnx.json").write_text("{}")
        tts_config = {"provider": "piper", "piper": {"voice": str(onnx_file)}}
        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
        monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: tts_config)

        raw = text_to_speech_tool(text="hi", output_path=str(tmp_path / "clip.wav"))
        payload = json.loads(raw)

        assert payload["success"] is True, payload
        assert payload["provider"] == "piper"
        assert Path(payload["file_path"]).exists()

    def test_missing_package_surfaces_error(self, tmp_path, monkeypatch):
        """An ImportError from the piper import becomes a JSON error payload."""

        def piper_not_installed():
            raise ImportError("No module named 'piper'")

        tts_config = {"provider": "piper"}
        monkeypatch.setattr(tts_tool, "_import_piper", piper_not_installed)
        monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: tts_config)

        raw = text_to_speech_tool(text="hi", output_path=str(tmp_path / "clip.wav"))
        payload = json.loads(raw)

        assert payload["success"] is False
        assert "piper-tts" in payload["error"]
# ---------------------------------------------------------------------------
# check_tts_requirements
# ---------------------------------------------------------------------------
class TestCheckTtsRequirementsPiper:
    """``check_tts_requirements`` must follow the piper probe when it is
    the only provider signal left."""

    def test_piper_install_satisfies_requirements(self, monkeypatch):
        """With every other provider disabled, the result tracks piper."""

        def unavailable():
            raise ImportError()

        # Drop every other provider so we can isolate the piper signal.
        for importer_name in (
            "_import_edge_tts",
            "_import_elevenlabs",
            "_import_openai_client",
            "_import_mistral_client",
        ):
            monkeypatch.setattr(tts_tool, importer_name, unavailable)
        for probe_name in (
            "_check_neutts_available",
            "_check_kittentts_available",
            "_has_any_command_tts_provider",
            "_has_openai_audio_backend",
        ):
            monkeypatch.setattr(tts_tool, probe_name, lambda: False)
        for env in ("MINIMAX_API_KEY", "XAI_API_KEY", "GEMINI_API_KEY",
                    "GOOGLE_API_KEY", "MISTRAL_API_KEY", "ELEVENLABS_API_KEY"):
            monkeypatch.delenv(env, raising=False)

        # Now toggle the piper check off, then on.
        for piper_present in (False, True):
            monkeypatch.setattr(
                tts_tool,
                "_check_piper_available",
                lambda present=piper_present: present,
            )
            assert check_tts_requirements() is piper_present