feat(tts): expose speaker_id on built-in Piper provider

The built-in Piper provider (tts.provider: piper, Python piper-tts package) already constructs piper.SynthesisConfig for the advanced tuning knobs, but did not forward speaker_id from the user config. This wires tts.piper.speaker_id through to SynthesisConfig.speaker_id so multi-speaker ONNX models (e.g. libritts_r) can be addressed via config without dropping to the command-provider path. Changes: - Add speaker_id to the has_advanced tuple so setting it triggers SynthesisConfig construction (same gating as the other knobs). - Pass speaker_id=speaker_id to SynthesisConfig. Defaults to 0 (Piper's own default; single-speaker models ignore the field). - Tolerant parse: bad input (non-int strings, lists, dicts) is dropped to 0 instead of raising. Booleans are rejected outright (True/False would silently coerce to 1/0 and hide a config mistake). Mirrors the same shape as the command-provider's _resolve_command_tts_optional_number helper. speaker_id is applied per-call via syn_config.speaker_id, so the PiperVoice cache key is intentionally left as just (model, cuda) -- the same loaded model serves all speakers. Tests cover the config knob, the tolerant parse, and the no-reload invariant. sentence_silence is intentionally not added here: the Python piper-tts SynthesisConfig does not expose that field (CLI-only).
2026-06-21 10:22:18 +00:00 · 2026-06-18 20:51:37 -06:00 · 2026-06-18 20:51:37 -06:00 · 160bb565b4
commit 160bb565b4
parent a7b4fbcbc1
2 changed files with 113 additions and 2 deletions
--- a/tests/tools/test_tts_piper.py
+++ b/tests/tools/test_tts_piper.py
@ -8,6 +8,7 @@ without requiring the ``piper-tts`` package to actually be installed

 import json
 import sys
+import types
 from pathlib import Path
 from unittest.mock import MagicMock, patch

@ -219,7 +220,7 @@ class TestGeneratePiperTts:

        # The SynthesisConfig import happens inline inside _generate_piper_tts
        # via ``from piper import SynthesisConfig``. Inject a fake piper
-        # module so that import resolves.
+        # module so that that import resolves.
        monkeypatch.setitem(sys.modules, "piper", FakePiperModule)

        config = {
@ -239,6 +240,96 @@ class TestGeneratePiperTts:
        assert kwargs["length_scale"] == 2.0
        assert kwargs["volume"] == 0.8

+    def test_speaker_id_passed_through_to_synconfig(self, tmp_path, monkeypatch):
+        """speaker_id flows from config to SynthesisConfig when set."""
+        model = self._prepare_voice_files(tmp_path)
+        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
+
+        fake_syn_cls = MagicMock()
+        monkeypatch.setitem(sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls))
+
+        config = {"piper": {"voice": str(model), "speaker_id": 2}}
+        tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config)
+
+        fake_syn_cls.assert_called_once()
+        assert fake_syn_cls.call_args.kwargs["speaker_id"] == 2
+
+    def test_speaker_id_alone_triggers_synconfig(self, tmp_path, monkeypatch):
+        """Setting ONLY speaker_id (no other advanced knobs) still constructs SynthesisConfig.
+
+        Regression guard: has_advanced must include speaker_id, otherwise
+        this knob gets silently dropped on the simplest configuration.
+        """
+        model = self._prepare_voice_files(tmp_path)
+        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
+
+        fake_syn_cls = MagicMock()
+        monkeypatch.setitem(sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls))
+
+        config = {"piper": {"voice": str(model), "speaker_id": 1}}
+        tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config)
+
+        fake_syn_cls.assert_called_once()
+
+    def test_speaker_id_default_zero_when_unset(self, tmp_path, monkeypatch):
+        """No speaker_id in config → SynthesisConfig.speaker_id == 0 (Piper's default)."""
+        model = self._prepare_voice_files(tmp_path)
+        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
+
+        fake_syn_cls = MagicMock()
+        monkeypatch.setitem(sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls))
+
+        config = {"piper": {"voice": str(model), "length_scale": 1.5}}
+        tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config)
+
+        assert fake_syn_cls.call_args.kwargs["speaker_id"] == 0
+
+    def test_speaker_id_bool_rejected_to_zero(self, tmp_path, monkeypatch):
+        """True/False would coerce to 1/0 and hide a config mistake — reject outright."""
+        model = self._prepare_voice_files(tmp_path)
+        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
+
+        fake_syn_cls = MagicMock()
+        monkeypatch.setitem(sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls))
+
+        for bad in (True, False):
+            fake_syn_cls.reset_mock()
+            config = {"piper": {"voice": str(model), "speaker_id": bad}}
+            tts_tool._generate_piper_tts("hi", str(tmp_path / f"out-{bad}.wav"), config)
+            assert fake_syn_cls.call_args.kwargs["speaker_id"] == 0
+
+    def test_speaker_id_non_int_dropped_to_zero(self, tmp_path, monkeypatch):
+        """Unparseable config (string, list, dict) drops to 0 instead of raising."""
+        model = self._prepare_voice_files(tmp_path)
+        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
+
+        fake_syn_cls = MagicMock()
+        monkeypatch.setitem(sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls))
+
+        for bad in ("two", [1, 2], {"k": 1}, None):
+            fake_syn_cls.reset_mock()
+            config = {"piper": {"voice": str(model), "speaker_id": bad}}
+            tts_tool._generate_piper_tts("hi", str(tmp_path / f"out-{type(bad).__name__}.wav"), config)
+            assert fake_syn_cls.call_args.kwargs["speaker_id"] == 0
+
+    def test_speaker_id_does_not_invalidate_voice_cache(self, tmp_path, monkeypatch):
+        """Switching speaker_id between calls must NOT trigger a model reload.
+
+        PiperVoice is bound to a model, not a speaker — speaker is applied
+        per-call via syn_config.speaker_id. The voice cache should serve the
+        same PiperVoice instance for the same (model, cuda) regardless of
+        how many distinct speaker_ids the user cycles through.
+        """
+        model = self._prepare_voice_files(tmp_path)
+        monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
+
+        for speaker in (0, 1, 2, 3):
+            config = {"piper": {"voice": str(model), "speaker_id": speaker}}
+            tts_tool._generate_piper_tts("hi", str(tmp_path / f"out-{speaker}.wav"), config)
+
+        # Only one PiperVoice.load() call across four calls with different speakers.
+        assert _StubPiperVoice.loaded == [str(model)]
+

 # ---------------------------------------------------------------------------
 # text_to_speech_tool end-to-end (provider == "piper")
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -1889,6 +1889,18 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])

    model_path = _resolve_piper_voice_path(voice_name, download_dir)

+    # Tolerant speaker_id parse: drop bad input (non-int strings, lists, dicts)
+    # to 0 (Piper's own default). Booleans are rejected outright — True/False
+    # would silently coerce to 1/0 and hide a config mistake.
+    _raw_speaker = piper_config.get("speaker_id", 0)
+    if isinstance(_raw_speaker, bool) or not isinstance(_raw_speaker, int):
+        speaker_id = 0
+    else:
+        speaker_id = _raw_speaker
+
+    # speaker_id is applied per-call via syn_config.speaker_id — the same
+    # PiperVoice instance serves all speakers, so it stays out of the cache
+    # key. Multi-speaker workflows share one model load.
    cache_key = f"{model_path}::cuda={use_cuda}"
    global _piper_voice_cache
    if cache_key not in _piper_voice_cache:
@ -1903,7 +1915,14 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
    syn_config = None
    has_advanced = any(
        k in piper_config
-        for k in ("length_scale", "noise_scale", "noise_w_scale", "volume", "normalize_audio")
+        for k in (
+            "length_scale",
+            "noise_scale",
+            "noise_w_scale",
+            "volume",
+            "normalize_audio",
+            "speaker_id",
+        )
    )
    if has_advanced:
        try:
@ -1914,6 +1933,7 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
                noise_w_scale=float(piper_config.get("noise_w_scale", 0.8)),
                volume=float(piper_config.get("volume", 1.0)),
                normalize_audio=bool(piper_config.get("normalize_audio", True)),
+                speaker_id=speaker_id,
            )
        except ImportError:
            logger.warning(