mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
feat(tts): expose speaker_id on built-in Piper provider
The built-in Piper provider (tts.provider: piper, Python piper-tts package) already constructs piper.SynthesisConfig for the advanced tuning knobs, but did not forward speaker_id from the user config. This wires tts.piper.speaker_id through to SynthesisConfig.speaker_id so multi-speaker ONNX models (e.g. libritts_r) can be addressed via config without dropping to the command-provider path.

Changes:
- Add speaker_id to the has_advanced tuple so setting it triggers SynthesisConfig construction (same gating as the other knobs).
- Pass speaker_id=speaker_id to SynthesisConfig. Defaults to 0 (Piper's own default; single-speaker models ignore the field).
- Tolerant parse: bad input (non-int strings, lists, dicts) is dropped to 0 instead of raising. Booleans are rejected outright (True/False would silently coerce to 1/0 and hide a config mistake). Mirrors the same shape as the command-provider's _resolve_command_tts_optional_number helper.

speaker_id is applied per-call via syn_config.speaker_id, so the PiperVoice cache key is intentionally left as just (model, cuda) -- the same loaded model serves all speakers.

Tests cover the config knob, the tolerant parse, and the no-reload invariant.

sentence_silence is intentionally not added here: the Python piper-tts SynthesisConfig does not expose that field (CLI-only).
This commit is contained in:
parent
a7b4fbcbc1
commit
160bb565b4
2 changed files with 113 additions and 2 deletions
|
|
@ -8,6 +8,7 @@ without requiring the ``piper-tts`` package to actually be installed
|
|||
|
||||
import json
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
|
@ -219,7 +220,7 @@ class TestGeneratePiperTts:
|
|||
|
||||
# The SynthesisConfig import happens inline inside _generate_piper_tts
|
||||
# via ``from piper import SynthesisConfig``. Inject a fake piper
|
||||
# module so that import resolves.
|
||||
# module so that that import resolves.
|
||||
monkeypatch.setitem(sys.modules, "piper", FakePiperModule)
|
||||
|
||||
config = {
|
||||
|
|
@ -239,6 +240,96 @@ class TestGeneratePiperTts:
|
|||
assert kwargs["length_scale"] == 2.0
|
||||
assert kwargs["volume"] == 0.8
|
||||
|
||||
def test_speaker_id_passed_through_to_synconfig(self, tmp_path, monkeypatch):
    """A configured ``speaker_id`` reaches ``SynthesisConfig`` as a keyword argument."""
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    # Stand-in ``piper`` module: the code under test imports SynthesisConfig
    # from ``piper`` at call time, so expose a recording mock under that name.
    syn_config_cls = MagicMock()
    fake_piper = types.SimpleNamespace(SynthesisConfig=syn_config_cls)
    monkeypatch.setitem(sys.modules, "piper", fake_piper)

    output_file = tmp_path / "out.wav"
    piper_settings = {"voice": str(voice_model), "speaker_id": 2}
    tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    syn_config_cls.assert_called_once()
    passed_kwargs = syn_config_cls.call_args.kwargs
    assert passed_kwargs["speaker_id"] == 2
|
||||
|
||||
def test_speaker_id_alone_triggers_synconfig(self, tmp_path, monkeypatch):
    """``speaker_id`` as the sole advanced knob must still build a SynthesisConfig.

    Regression guard for the ``has_advanced`` key list: if ``speaker_id``
    were missing from it, a config that sets nothing but the speaker would
    have that setting silently ignored.
    """
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    # Recording mock exposed as ``piper.SynthesisConfig`` for the call-time import.
    syn_config_cls = MagicMock()
    fake_piper = types.SimpleNamespace(SynthesisConfig=syn_config_cls)
    monkeypatch.setitem(sys.modules, "piper", fake_piper)

    output_file = tmp_path / "out.wav"
    piper_settings = {"voice": str(voice_model), "speaker_id": 1}
    tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    syn_config_cls.assert_called_once()
|
||||
|
||||
def test_speaker_id_default_zero_when_unset(self, tmp_path, monkeypatch):
    """Without a ``speaker_id`` key, SynthesisConfig receives ``speaker_id=0``."""
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    # Recording mock exposed as ``piper.SynthesisConfig`` for the call-time import.
    syn_config_cls = MagicMock()
    fake_piper = types.SimpleNamespace(SynthesisConfig=syn_config_cls)
    monkeypatch.setitem(sys.modules, "piper", fake_piper)

    # ``length_scale`` is another advanced knob; it is set here so that a
    # SynthesisConfig is constructed even though no speaker is configured.
    output_file = tmp_path / "out.wav"
    piper_settings = {"voice": str(voice_model), "length_scale": 1.5}
    tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    passed_kwargs = syn_config_cls.call_args.kwargs
    assert passed_kwargs["speaker_id"] == 0
|
||||
|
||||
def test_speaker_id_bool_rejected_to_zero(self, tmp_path, monkeypatch):
    """Boolean ``speaker_id`` values are replaced by the integer 0.

    ``True``/``False`` would otherwise coerce to 1/0 and hide a config
    mistake. Because ``False == 0`` holds in Python, an equality check
    alone cannot distinguish "``False`` was rejected and replaced by 0"
    from "``False`` was forwarded unchanged", so the type of the forwarded
    value is asserted as well.
    """
    model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    # Recording mock exposed as ``piper.SynthesisConfig`` for the call-time import.
    fake_syn_cls = MagicMock()
    monkeypatch.setitem(sys.modules, "piper", types.SimpleNamespace(SynthesisConfig=fake_syn_cls))

    for bad in (True, False):
        # Forget the previous iteration's call so call_args reflects this one only.
        fake_syn_cls.reset_mock()
        config = {"piper": {"voice": str(model), "speaker_id": bad}}
        tts_tool._generate_piper_tts("hi", str(tmp_path / f"out-{bad}.wav"), config)

        passed = fake_syn_cls.call_args.kwargs["speaker_id"]
        assert passed == 0
        # Guards the ``False`` case: a bool forwarded as-is would still equal 0.
        assert not isinstance(passed, bool)
|
||||
|
||||
def test_speaker_id_non_int_dropped_to_zero(self, tmp_path, monkeypatch):
    """Non-integer ``speaker_id`` values (string, list, dict, None) become 0 without raising."""
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    # Recording mock exposed as ``piper.SynthesisConfig`` for the call-time import.
    syn_config_cls = MagicMock()
    fake_piper = types.SimpleNamespace(SynthesisConfig=syn_config_cls)
    monkeypatch.setitem(sys.modules, "piper", fake_piper)

    unusable_values = ("two", [1, 2], {"k": 1}, None)
    for bad in unusable_values:
        # Forget the previous iteration's call so call_args reflects this one only.
        syn_config_cls.reset_mock()

        # One output file per value, named after the value's type.
        output_file = tmp_path / f"out-{type(bad).__name__}.wav"
        piper_settings = {"voice": str(voice_model), "speaker_id": bad}
        tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

        passed_kwargs = syn_config_cls.call_args.kwargs
        assert passed_kwargs["speaker_id"] == 0
|
||||
|
||||
def test_speaker_id_does_not_invalidate_voice_cache(self, tmp_path, monkeypatch):
    """Cycling through speaker ids must reuse the already-loaded voice.

    A PiperVoice belongs to a model, not to a speaker: the speaker is
    chosen per call through ``syn_config.speaker_id``. For a fixed
    (model, cuda) pair the voice cache is therefore expected to hand back
    the same PiperVoice however many different speaker ids are requested.
    """
    voice_model = self._prepare_voice_files(tmp_path)
    monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice)

    for speaker in (0, 1, 2, 3):
        output_file = tmp_path / f"out-{speaker}.wav"
        piper_settings = {"voice": str(voice_model), "speaker_id": speaker}
        tts_tool._generate_piper_tts("hi", str(output_file), {"piper": piper_settings})

    # Four syntheses with four different speakers, yet exactly one recorded load.
    expected_loads = [str(voice_model)]
    assert _StubPiperVoice.loaded == expected_loads
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# text_to_speech_tool end-to-end (provider == "piper")
|
||||
|
|
|
|||
|
|
@ -1889,6 +1889,18 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
|
|||
|
||||
model_path = _resolve_piper_voice_path(voice_name, download_dir)
|
||||
|
||||
# Tolerant speaker_id parse: drop bad input (non-int strings, lists, dicts)
|
||||
# to 0 (Piper's own default). Booleans are rejected outright — True/False
|
||||
# would silently coerce to 1/0 and hide a config mistake.
|
||||
_raw_speaker = piper_config.get("speaker_id", 0)
|
||||
if isinstance(_raw_speaker, bool) or not isinstance(_raw_speaker, int):
|
||||
speaker_id = 0
|
||||
else:
|
||||
speaker_id = _raw_speaker
|
||||
|
||||
# speaker_id is applied per-call via syn_config.speaker_id — the same
|
||||
# PiperVoice instance serves all speakers, so it stays out of the cache
|
||||
# key. Multi-speaker workflows share one model load.
|
||||
cache_key = f"{model_path}::cuda={use_cuda}"
|
||||
global _piper_voice_cache
|
||||
if cache_key not in _piper_voice_cache:
|
||||
|
|
@ -1903,7 +1915,14 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
|
|||
syn_config = None
|
||||
has_advanced = any(
|
||||
k in piper_config
|
||||
for k in ("length_scale", "noise_scale", "noise_w_scale", "volume", "normalize_audio")
|
||||
for k in (
|
||||
"length_scale",
|
||||
"noise_scale",
|
||||
"noise_w_scale",
|
||||
"volume",
|
||||
"normalize_audio",
|
||||
"speaker_id",
|
||||
)
|
||||
)
|
||||
if has_advanced:
|
||||
try:
|
||||
|
|
@ -1914,6 +1933,7 @@ def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any])
|
|||
noise_w_scale=float(piper_config.get("noise_w_scale", 0.8)),
|
||||
volume=float(piper_config.get("volume", 1.0)),
|
||||
normalize_audio=bool(piper_config.get("normalize_audio", True)),
|
||||
speaker_id=speaker_id,
|
||||
)
|
||||
except ImportError:
|
||||
logger.warning(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue