feat(tts): expose speaker_id on built-in Piper provider

The built-in Piper provider (tts.provider: piper, Python piper-tts
package) already constructs piper.SynthesisConfig for the advanced
tuning knobs, but did not forward speaker_id from the user config.

This wires tts.piper.speaker_id through to SynthesisConfig.speaker_id
so multi-speaker ONNX models (e.g. libritts_r) can be addressed via
config without dropping to the command-provider path.

Changes:
- Add speaker_id to the has_advanced tuple so setting it triggers
  SynthesisConfig construction (same gating as the other knobs).
- Pass speaker_id=speaker_id to SynthesisConfig. Defaults to 0
  (Piper's own default; single-speaker models ignore the field).
- Tolerant parse: any non-int value (strings, even numeric ones like
  "2", plus lists and dicts) is dropped to 0 instead of raising.
  Booleans are rejected outright (True/False would silently coerce
  to 1/0 and hide a config mistake). Mirrors the same shape as the
  command-provider's _resolve_command_tts_optional_number helper.

speaker_id is applied per-call via syn_config.speaker_id, so the
PiperVoice cache key is intentionally left as just (model, cuda) --
the same loaded model serves all speakers. Tests cover the
config knob, the tolerant parse, and the no-reload invariant.

sentence_silence is intentionally not added here: the Python
piper-tts SynthesisConfig does not expose that field (CLI-only).
This commit is contained in:
Cdddo 2026-06-18 20:51:37 -06:00 committed by Teknium
parent a7b4fbcbc1
commit 160bb565b4
2 changed files with 113 additions and 2 deletions

View file

@ -8,6 +8,7 @@ without requiring the ``piper-tts`` package to actually be installed
import json
import sys
import types
from pathlib import Path
from unittest.mock import MagicMock, patch
@ -219,7 +220,7 @@ class TestGeneratePiperTts:
# The SynthesisConfig import happens inline inside _generate_piper_tts
# via ``from piper import SynthesisConfig``. Inject a fake piper
# module so that import resolves.
# module so that that import resolves.
monkeypatch.setitem(sys.modules, "piper", FakePiperModule)
config = {
@ -239,6 +240,96 @@ class TestGeneratePiperTts:
assert kwargs["length_scale"] == 2.0
assert kwargs["volume"] == 0.8
def test_speaker_id_passed_through_to_synconfig(self, tmp_path, monkeypatch):
    """A configured ``speaker_id`` must reach ``SynthesisConfig`` unchanged."""
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    # Register a stand-in ``piper`` module so the inline
    # ``from piper import SynthesisConfig`` resolves to our recorder.
    syn_config_recorder = MagicMock()
    stub_piper_module = types.SimpleNamespace(SynthesisConfig=syn_config_recorder)
    monkeypatch.setitem(sys.modules, "piper", stub_piper_module)

    piper_settings = {"voice": str(voice_model), "speaker_id": 2}
    output_file = tmp_path / "out.wav"
    tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    syn_config_recorder.assert_called_once()
    recorded_kwargs = syn_config_recorder.call_args.kwargs
    assert recorded_kwargs["speaker_id"] == 2
def test_speaker_id_alone_triggers_synconfig(self, tmp_path, monkeypatch):
    """``speaker_id`` on its own is enough to build a ``SynthesisConfig``.

    Regression guard: the ``has_advanced`` check must list ``speaker_id``;
    otherwise a config that sets nothing else would lose the knob silently.
    """
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    syn_config_recorder = MagicMock()
    stub_piper_module = types.SimpleNamespace(SynthesisConfig=syn_config_recorder)
    monkeypatch.setitem(sys.modules, "piper", stub_piper_module)

    # No other tuning knob is present, only the speaker selection.
    piper_settings = {"voice": str(voice_model), "speaker_id": 1}
    output_file = tmp_path / "out.wav"
    tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    syn_config_recorder.assert_called_once()
def test_speaker_id_default_zero_when_unset(self, tmp_path, monkeypatch):
    """Without a ``speaker_id`` key, ``SynthesisConfig`` receives ``speaker_id == 0``."""
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    syn_config_recorder = MagicMock()
    stub_piper_module = types.SimpleNamespace(SynthesisConfig=syn_config_recorder)
    monkeypatch.setitem(sys.modules, "piper", stub_piper_module)

    # ``length_scale`` is set only so that SynthesisConfig gets constructed;
    # ``speaker_id`` itself is deliberately absent.
    piper_settings = {"voice": str(voice_model), "length_scale": 1.5}
    output_file = tmp_path / "out.wav"
    tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    recorded_kwargs = syn_config_recorder.call_args.kwargs
    assert recorded_kwargs["speaker_id"] == 0
def test_speaker_id_bool_rejected_to_zero(self, tmp_path, monkeypatch):
    """Boolean ``speaker_id`` values are rejected and replaced by 0.

    ``True``/``False`` would otherwise coerce to 1/0 and mask a config
    mistake, so both must end up as ``speaker_id == 0``.
    """
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    syn_config_recorder = MagicMock()
    stub_piper_module = types.SimpleNamespace(SynthesisConfig=syn_config_recorder)
    monkeypatch.setitem(sys.modules, "piper", stub_piper_module)

    for bad in (True, False):
        # Start each case from a clean recorder so ``call_args`` reflects
        # only the call made for this value.
        syn_config_recorder.reset_mock()
        piper_settings = {"voice": str(voice_model), "speaker_id": bad}
        output_file = tmp_path / f"out-{bad}.wav"
        tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})
        recorded_kwargs = syn_config_recorder.call_args.kwargs
        assert recorded_kwargs["speaker_id"] == 0
def test_speaker_id_non_int_dropped_to_zero(self, tmp_path, monkeypatch):
    """Non-int ``speaker_id`` values (string, list, dict, None) drop to 0 instead of raising.

    Each bad value gets its own call. ``SynthesisConfig`` must still be
    constructed (the ``speaker_id`` key is present) and must receive
    ``speaker_id == 0``.
    """
    model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)
    fake_syn_cls = MagicMock()
    monkeypatch.setitem(sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls))
    for bad in ("two", [1, 2], {"k": 1}, None):
        # Clean recorder per case so ``call_args`` belongs to this value only.
        fake_syn_cls.reset_mock()
        config = {"piper": {"voice": str(model), "speaker_id": bad}}
        tts_tool._generate_piper_tts("hi", str(tmp_path / f"out-{type(bad).__name__}.wav"), config)
        # Guard first: with no recorded call, ``call_args`` is None and the
        # kwargs lookup below would fail with an unrelated AttributeError.
        fake_syn_cls.assert_called_once()
        assert fake_syn_cls.call_args.kwargs["speaker_id"] == 0, (
            f"speaker_id={bad!r} should have been dropped to 0"
        )
def test_speaker_id_does_not_invalidate_voice_cache(self, tmp_path, monkeypatch):
    """Changing ``speaker_id`` between calls must not reload the model.

    A PiperVoice is bound to a model, not to a speaker; the speaker is
    applied per call via ``syn_config.speaker_id``. The voice cache should
    therefore hand back the same PiperVoice for the same (model, cuda)
    pair, however many distinct speaker ids are cycled through.
    """
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    for speaker in (0, 1, 2, 3):
        piper_settings = {"voice": str(voice_model), "speaker_id": speaker}
        output_file = tmp_path / f"out-{speaker}.wav"
        tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    # Four calls with four speakers: the stub must have recorded the model
    # exactly once.
    expected_loads = [str(voice_model)]
    assert _StubPiperVoice.loaded == expected_loads
# ---------------------------------------------------------------------------
# text_to_speech_tool end-to-end (provider == "piper")

View file

@ -1889,6 +1889,18 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
model_path = _resolve_piper_voice_path(voice_name, download_dir)
# Tolerant speaker_id parse: any value that is not a plain int (strings,
# lists, dicts, None) drops to 0 (Piper's own default). Booleans are rejected
# outright — True/False would silently coerce to 1/0 and hide a config mistake.
_raw_speaker = piper_config.get("speaker_id", 0)
if isinstance(_raw_speaker, bool) or not isinstance(_raw_speaker, int):
speaker_id = 0
else:
speaker_id = _raw_speaker
# speaker_id is applied per-call via syn_config.speaker_id — the same
# PiperVoice instance serves all speakers, so it stays out of the cache
# key. Multi-speaker workflows share one model load.
cache_key = f"{model_path}::cuda={use_cuda}"
global _piper_voice_cache
if cache_key not in _piper_voice_cache:
@ -1903,7 +1915,14 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
syn_config = None
has_advanced = any(
k in piper_config
for k in ("length_scale", "noise_scale", "noise_w_scale", "volume", "normalize_audio")
for k in (
"length_scale",
"noise_scale",
"noise_w_scale",
"volume",
"normalize_audio",
"speaker_id",
)
)
if has_advanced:
try:
@ -1914,6 +1933,7 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
noise_w_scale=float(piper_config.get("noise_w_scale", 0.8)),
volume=float(piper_config.get("volume", 1.0)),
normalize_audio=bool(piper_config.get("normalize_audio", True)),
speaker_id=speaker_id,
)
except ImportError:
logger.warning(