diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 7907af2253..0c3f40ab67 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -779,7 +779,7 @@ DEFAULT_CONFIG = { # limit (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k model-aware, # Gemini 5000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000). "tts": { - "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local) + "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "gemini" | "neutts" (local) | "kittentts" (local) | "piper" (local) "edge": { "voice": "en-US-AriaNeural", # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural @@ -809,6 +809,19 @@ DEFAULT_CONFIG = { "model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo "device": "cpu", # cpu, cuda, or mps }, + "piper": { + # Voice name (e.g. "en_US-lessac-medium") downloaded on first + # use, OR an absolute path to a pre-downloaded .onnx file. 
+ # Full voice list: https://github.com/OHF-Voice/piper1-gpl/blob/main/docs/VOICES.md + "voice": "en_US-lessac-medium", + # "voices_dir": "", # Override voice cache dir; default = ~/.hermes/cache/piper-voices/ + # "use_cuda": False, # Requires onnxruntime-gpu + # "length_scale": 1.0, # 2.0 = twice as slow + # "noise_scale": 0.667, + # "noise_w_scale": 0.8, + # "volume": 1.0, + # "normalize_audio": True, + }, }, "stt": { diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index 28b1a50cf0..5edb227d95 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -227,6 +227,14 @@ TOOL_CATEGORIES = { "tts_provider": "kittentts", "post_setup": "kittentts", }, + { + "name": "Piper", + "badge": "local · free", + "tag": "Local neural TTS, 44 languages (voices ~20-90MB)", + "env_vars": [], + "tts_provider": "piper", + "post_setup": "piper", + }, ], }, "web": { @@ -624,6 +632,33 @@ def _run_post_setup(post_setup_key: str): _print_warning(" kittentts install timed out (>5min)") _print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile") + elif post_setup_key == "piper": + try: + __import__("piper") + _print_success(" piper-tts is already installed") + except ImportError: + import subprocess + _print_info(" Installing piper-tts (~14MB wheel, voices downloaded on first use)...") + try: + result = subprocess.run( + [sys.executable, "-m", "pip", "install", "-U", "piper-tts", "--quiet"], + capture_output=True, text=True, timeout=300, + ) + if result.returncode == 0: + _print_success(" piper-tts installed") + else: + _print_warning(" piper-tts install failed:") + _print_info(f" {result.stderr.strip()[:300]}") + _print_info(" Run manually: python -m pip install -U piper-tts") + return + except subprocess.TimeoutExpired: + _print_warning(" piper-tts install timed out (>5min)") + _print_info(" Run manually: python -m pip install -U piper-tts") + return + _print_info(" Default voice: en_US-lessac-medium (downloaded on first TTS 
call)") + _print_info(" Full voice list: https://github.com/OHF-Voice/piper1-gpl/blob/main/docs/VOICES.md") + _print_info(" Switch voices by setting tts.piper.voice in ~/.hermes/config.yaml") + elif post_setup_key == "spotify": # Run the full `hermes auth spotify` flow — if the user has no # client_id yet, this drops them into the interactive wizard diff --git a/tests/tools/test_tts_command_providers.py b/tests/tools/test_tts_command_providers.py index eae5d06d02..583abcb588 100644 --- a/tests/tools/test_tts_command_providers.py +++ b/tests/tools/test_tts_command_providers.py @@ -81,29 +81,39 @@ class TestResolveCommandProviderConfig: def test_user_declared_command_provider_resolves(self): cfg = { "providers": { - "piper": {"type": "command", "command": "piper foo"}, + "piper-cli": {"type": "command", "command": "piper-cli foo"}, }, } - resolved = _resolve_command_provider_config("piper", cfg) + resolved = _resolve_command_provider_config("piper-cli", cfg) assert resolved is not None - assert resolved["command"] == "piper foo" + assert resolved["command"] == "piper-cli foo" def test_type_command_is_implied_when_command_is_set(self): - cfg = {"providers": {"piper": {"command": "piper foo"}}} - resolved = _resolve_command_provider_config("piper", cfg) + cfg = {"providers": {"piper-cli": {"command": "piper-cli foo"}}} + resolved = _resolve_command_provider_config("piper-cli", cfg) assert resolved is not None def test_other_type_values_reject(self): - cfg = {"providers": {"piper": {"type": "python", "command": "piper foo"}}} - assert _resolve_command_provider_config("piper", cfg) is None + cfg = {"providers": {"piper-cli": {"type": "python", "command": "piper-cli foo"}}} + assert _resolve_command_provider_config("piper-cli", cfg) is None def test_empty_command_rejects(self): - cfg = {"providers": {"piper": {"type": "command", "command": " "}}} - assert _resolve_command_provider_config("piper", cfg) is None + cfg = {"providers": {"piper-cli": {"type": "command", 
"command": " "}}} + assert _resolve_command_provider_config("piper-cli", cfg) is None def test_case_insensitive_lookup(self): - cfg = {"providers": {"piper": {"type": "command", "command": "x"}}} - assert _resolve_command_provider_config("PIPER", cfg) is not None + cfg = {"providers": {"piper-cli": {"type": "command", "command": "x"}}} + assert _resolve_command_provider_config("PIPER-CLI", cfg) is not None + + def test_native_piper_cannot_be_shadowed_by_command_entry(self): + """Regression guard for PR that added native Piper as a built-in. + A user's ``tts.providers.piper`` must not override the built-in.""" + cfg = { + "providers": { + "piper": {"type": "command", "command": "some-script"}, + }, + } + assert _resolve_command_provider_config("piper", cfg) is None class TestGetNamedProviderConfig: @@ -145,16 +155,16 @@ class TestIterCommandProviders: cfg = { "providers": { "openai": {"type": "command", "command": "shouldnt show up"}, - "piper": {"type": "command", "command": "piper"}, + "piper-cli": {"type": "command", "command": "piper-cli"}, "voxcpm": {"type": "command", "command": "voxcpm"}, "broken": {"type": "command", "command": ""}, }, } names = sorted(name for name, _ in _iter_command_providers(cfg)) - assert names == ["piper", "voxcpm"] + assert names == ["piper-cli", "voxcpm"] def test_has_any_command_provider_detects_declared(self): - cfg = {"providers": {"piper": {"type": "command", "command": "piper"}}} + cfg = {"providers": {"piper-cli": {"type": "command", "command": "piper-cli"}}} assert _has_any_command_tts_provider(cfg) is True def test_has_any_command_provider_when_none(self): @@ -216,16 +226,16 @@ class TestConfigGetters: class TestMaxTextLengthForCommandProviders: def test_default_for_command_provider(self): - cfg = {"providers": {"piper": {"type": "command", "command": "x"}}} - assert _resolve_max_text_length("piper", cfg) == DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH + cfg = {"providers": {"piper-cli": {"type": "command", "command": "x"}}} + assert 
_resolve_max_text_length("piper-cli", cfg) == DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH def test_override_under_providers(self): - cfg = {"providers": {"piper": {"type": "command", "command": "x", "max_text_length": 2500}}} - assert _resolve_max_text_length("piper", cfg) == 2500 + cfg = {"providers": {"piper-cli": {"type": "command", "command": "x", "max_text_length": 2500}}} + assert _resolve_max_text_length("piper-cli", cfg) == 2500 def test_override_under_legacy_tts_name_block(self): - cfg = {"piper": {"type": "command", "command": "x", "max_text_length": 7777}} - assert _resolve_max_text_length("piper", cfg) == 7777 + cfg = {"piper-cli": {"type": "command", "command": "x", "max_text_length": 7777}} + assert _resolve_max_text_length("piper-cli", cfg) == 7777 def test_non_command_unknown_provider_still_falls_back(self): assert _resolve_max_text_length("unknown", {}) > 0 diff --git a/tests/tools/test_tts_mistral.py b/tests/tools/test_tts_mistral.py index 36088f3f0a..6e98946b6c 100644 --- a/tests/tools/test_tts_mistral.py +++ b/tests/tools/test_tts_mistral.py @@ -216,5 +216,8 @@ class TestCheckTtsRequirementsMistral: with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), \ patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError), \ patch("tools.tts_tool._import_openai_client", side_effect=ImportError), \ - patch("tools.tts_tool._check_neutts_available", return_value=False): + patch("tools.tts_tool._check_neutts_available", return_value=False), \ + patch("tools.tts_tool._check_kittentts_available", return_value=False), \ + patch("tools.tts_tool._check_piper_available", return_value=False), \ + patch("tools.tts_tool._has_any_command_tts_provider", return_value=False): assert check_tts_requirements() is False diff --git a/tests/tools/test_tts_piper.py b/tests/tools/test_tts_piper.py new file mode 100644 index 0000000000..ef7330a18c --- /dev/null +++ b/tests/tools/test_tts_piper.py @@ -0,0 +1,306 @@ +""" +Tests for the native Piper TTS provider. 
+ +These tests pin the resolution / caching / dispatch paths for Piper +without requiring the ``piper-tts`` package to actually be installed +(the synthesis step is monkey-patched to avoid needing the ONNX wheel). +""" + +import json +import os +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from tools import tts_tool +from tools.tts_tool import ( + BUILTIN_TTS_PROVIDERS, + DEFAULT_PIPER_VOICE, + PROVIDER_MAX_TEXT_LENGTH, + _check_piper_available, + _resolve_piper_voice_path, + check_tts_requirements, + text_to_speech_tool, +) + + +# --------------------------------------------------------------------------- +# Registry / constants +# --------------------------------------------------------------------------- + +class TestPiperRegistration: + def test_piper_is_a_builtin_provider(self): + assert "piper" in BUILTIN_TTS_PROVIDERS + + def test_piper_has_a_text_length_cap(self): + assert PROVIDER_MAX_TEXT_LENGTH.get("piper", 0) > 0 + + +# --------------------------------------------------------------------------- +# _check_piper_available +# --------------------------------------------------------------------------- + +class TestCheckPiperAvailable: + def test_returns_bool_without_raising(self): + # We don't care about the current environment's answer — just that + # the probe never raises on a machine without piper installed. 
+ assert isinstance(_check_piper_available(), bool) + + +# --------------------------------------------------------------------------- +# _resolve_piper_voice_path +# --------------------------------------------------------------------------- + +class TestResolvePiperVoicePath: + def test_direct_onnx_path_returned_as_is(self, tmp_path): + model = tmp_path / "custom.onnx" + model.write_bytes(b"fake onnx bytes") + result = _resolve_piper_voice_path(str(model), tmp_path) + assert result == str(model) + + def test_cached_voice_name_not_redownloaded(self, tmp_path): + """If both .onnx and .onnx.json exist in the + download dir, no subprocess is spawned.""" + voice = "en_US-test-medium" + (tmp_path / f"{voice}.onnx").write_bytes(b"model") + (tmp_path / f"{voice}.onnx.json").write_text("{}") + + with patch("tools.tts_tool.subprocess.run") as mock_run: + result = _resolve_piper_voice_path(voice, tmp_path) + + mock_run.assert_not_called() + assert result == str(tmp_path / f"{voice}.onnx") + + def test_missing_voice_triggers_download(self, tmp_path): + voice = "en_US-new-medium" + + def fake_run(cmd, *a, **kw): + # Simulate a successful download: write the expected files. 
+ (tmp_path / f"{voice}.onnx").write_bytes(b"model") + (tmp_path / f"{voice}.onnx.json").write_text("{}") + return MagicMock(returncode=0, stderr="", stdout="") + + with patch("tools.tts_tool.subprocess.run", side_effect=fake_run) as mock_run: + result = _resolve_piper_voice_path(voice, tmp_path) + + mock_run.assert_called_once() + # Verify the command shape: python -m piper.download_voices --download-dir + call_args = mock_run.call_args.args[0] + assert "piper.download_voices" in " ".join(call_args) + assert voice in call_args + assert "--download-dir" in call_args + assert str(tmp_path) in call_args + assert result == str(tmp_path / f"{voice}.onnx") + + def test_download_failure_raises_runtime(self, tmp_path): + voice = "en_US-broken-medium" + fake_result = MagicMock(returncode=1, stderr="voice not found", stdout="") + with patch("tools.tts_tool.subprocess.run", return_value=fake_result): + with pytest.raises(RuntimeError, match="Piper voice download failed"): + _resolve_piper_voice_path(voice, tmp_path) + + def test_download_success_but_missing_file_raises(self, tmp_path): + voice = "en_US-weird-medium" + fake_result = MagicMock(returncode=0, stderr="", stdout="") + # Subprocess "succeeds" but doesn't actually write the files. 
+ with patch("tools.tts_tool.subprocess.run", return_value=fake_result): + with pytest.raises(RuntimeError, match="completed but .+ is missing"): + _resolve_piper_voice_path(voice, tmp_path) + + def test_empty_voice_falls_back_to_default_name(self, tmp_path): + (tmp_path / f"{DEFAULT_PIPER_VOICE}.onnx").write_bytes(b"model") + (tmp_path / f"{DEFAULT_PIPER_VOICE}.onnx.json").write_text("{}") + result = _resolve_piper_voice_path("", tmp_path) + assert result.endswith(f"{DEFAULT_PIPER_VOICE}.onnx") + + +# --------------------------------------------------------------------------- +# _generate_piper_tts — stubbed so we don't need piper-tts installed +# --------------------------------------------------------------------------- + +class _StubPiperVoice: + """Stand-in for piper.PiperVoice used by the synthesis tests.""" + + loaded: list[str] = [] + calls: list[tuple] = [] + + @classmethod + def load(cls, model_path, use_cuda=False): + cls.loaded.append(model_path) + instance = cls() + instance.model_path = model_path + instance.use_cuda = use_cuda + return instance + + def synthesize_wav(self, text, wav_file, syn_config=None): + # Minimal valid WAV: an empty frame set is fine for our size check. + # The wave module accepts any frames; we just need the file to exist + # with non-zero bytes after close. 
+ wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(22050) + wav_file.writeframes(b"\x00\x00" * 1024) + _StubPiperVoice.calls.append((text, getattr(self, "model_path", ""), syn_config)) + + +@pytest.fixture(autouse=True) +def _reset_piper_cache(): + """Clear the module-level voice cache between tests.""" + tts_tool._piper_voice_cache.clear() + _StubPiperVoice.loaded = [] + _StubPiperVoice.calls = [] + yield + tts_tool._piper_voice_cache.clear() + + +class TestGeneratePiperTts: + def _prepare_voice_files(self, tmp_path, voice=DEFAULT_PIPER_VOICE): + model = tmp_path / f"{voice}.onnx" + model.write_bytes(b"model") + (tmp_path / f"{voice}.onnx.json").write_text("{}") + return model + + def test_loads_voice_and_writes_wav(self, tmp_path, monkeypatch): + model = self._prepare_voice_files(tmp_path) + monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice) + + out_path = str(tmp_path / "out.wav") + config = {"piper": {"voice": str(model)}} + + result = tts_tool._generate_piper_tts("hello", out_path, config) + + assert result == out_path + assert Path(out_path).exists() + assert Path(out_path).stat().st_size > 0 + assert _StubPiperVoice.loaded == [str(model)] + assert _StubPiperVoice.calls[0][0] == "hello" + + def test_voice_cache_reused_across_calls(self, tmp_path, monkeypatch): + model = self._prepare_voice_files(tmp_path) + monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice) + + config = {"piper": {"voice": str(model)}} + tts_tool._generate_piper_tts("one", str(tmp_path / "a.wav"), config) + tts_tool._generate_piper_tts("two", str(tmp_path / "b.wav"), config) + + # load() should have been called exactly once for the same model+cuda key. + assert _StubPiperVoice.loaded == [str(model)] + # But both synthesize calls went through. 
+ assert [c[0] for c in _StubPiperVoice.calls] == ["one", "two"] + + def test_voice_name_triggers_download(self, tmp_path, monkeypatch): + """A config voice of ``en_US-lessac-medium`` should be resolved via + _resolve_piper_voice_path (which would normally download).""" + monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice) + + def fake_resolve(voice, download_dir): + model = download_dir / f"{voice}.onnx" + model.write_bytes(b"model") + return str(model) + + monkeypatch.setattr(tts_tool, "_resolve_piper_voice_path", fake_resolve) + + config = {"piper": {"voice": "en_US-lessac-medium", "voices_dir": str(tmp_path)}} + result = tts_tool._generate_piper_tts("hi", str(tmp_path / "out.wav"), config) + + assert Path(result).exists() + assert _StubPiperVoice.loaded[0].endswith("en_US-lessac-medium.onnx") + + def test_advanced_knobs_passed_as_synconfig(self, tmp_path, monkeypatch): + model = self._prepare_voice_files(tmp_path) + monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice) + + # Fake SynthesisConfig so we can assert the knobs flowed through. + fake_syn_cls = MagicMock() + + class FakePiperModule: + SynthesisConfig = fake_syn_cls + + # The SynthesisConfig import happens inline inside _generate_piper_tts + # via ``from piper import SynthesisConfig``. Inject a fake piper + # module so that import resolves. + monkeypatch.setitem(sys.modules, "piper", FakePiperModule) + + config = { + "piper": { + "voice": str(model), + "length_scale": 2.0, + "volume": 0.8, + }, + } + tts_tool._generate_piper_tts( + "slow voice", str(tmp_path / "out.wav"), config, + ) + + # SynthesisConfig was constructed with the advanced knobs. 
+ fake_syn_cls.assert_called_once() + kwargs = fake_syn_cls.call_args.kwargs + assert kwargs["length_scale"] == 2.0 + assert kwargs["volume"] == 0.8 + + +# --------------------------------------------------------------------------- +# text_to_speech_tool end-to-end (provider == "piper") +# --------------------------------------------------------------------------- + +class TestTextToSpeechToolWithPiper: + def test_dispatches_to_piper(self, tmp_path, monkeypatch): + model = tmp_path / f"{DEFAULT_PIPER_VOICE}.onnx" + model.write_bytes(b"model") + (tmp_path / f"{DEFAULT_PIPER_VOICE}.onnx.json").write_text("{}") + + monkeypatch.setattr(tts_tool, "_import_piper", lambda: _StubPiperVoice) + + cfg = {"provider": "piper", "piper": {"voice": str(model)}} + monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: cfg) + + result = text_to_speech_tool(text="hi", output_path=str(tmp_path / "clip.wav")) + data = json.loads(result) + + assert data["success"] is True, data + assert data["provider"] == "piper" + assert Path(data["file_path"]).exists() + + def test_missing_package_surfaces_error(self, tmp_path, monkeypatch): + def raise_import(): + raise ImportError("No module named 'piper'") + + monkeypatch.setattr(tts_tool, "_import_piper", raise_import) + + cfg = {"provider": "piper"} + monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: cfg) + + result = text_to_speech_tool(text="hi", output_path=str(tmp_path / "clip.wav")) + data = json.loads(result) + + assert data["success"] is False + assert "piper-tts" in data["error"] + + +# --------------------------------------------------------------------------- +# check_tts_requirements +# --------------------------------------------------------------------------- + +class TestCheckTtsRequirementsPiper: + def test_piper_install_satisfies_requirements(self, monkeypatch): + # Drop every other provider so we can isolate the piper signal. 
+ monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: (_ for _ in ()).throw(ImportError())) + monkeypatch.setattr(tts_tool, "_import_elevenlabs", lambda: (_ for _ in ()).throw(ImportError())) + monkeypatch.setattr(tts_tool, "_import_openai_client", lambda: (_ for _ in ()).throw(ImportError())) + monkeypatch.setattr(tts_tool, "_import_mistral_client", lambda: (_ for _ in ()).throw(ImportError())) + monkeypatch.setattr(tts_tool, "_check_neutts_available", lambda: False) + monkeypatch.setattr(tts_tool, "_check_kittentts_available", lambda: False) + monkeypatch.setattr(tts_tool, "_has_any_command_tts_provider", lambda: False) + monkeypatch.setattr(tts_tool, "_has_openai_audio_backend", lambda: False) + for env in ("MINIMAX_API_KEY", "XAI_API_KEY", "GEMINI_API_KEY", + "GOOGLE_API_KEY", "MISTRAL_API_KEY", "ELEVENLABS_API_KEY"): + monkeypatch.delenv(env, raising=False) + + # Now toggle the piper check on and off. + monkeypatch.setattr(tts_tool, "_check_piper_available", lambda: False) + assert check_tts_requirements() is False + + monkeypatch.setattr(tts_tool, "_check_piper_available", lambda: True) + assert check_tts_requirements() is True diff --git a/tools/tts_tool.py b/tools/tts_tool.py index a89dff605c..7473b32a1d 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -12,6 +12,7 @@ Built-in TTS providers: - xAI TTS: Grok voices, needs XAI_API_KEY - NeuTTS (local, free, no API key): On-device TTS via neutts - KittenTTS (local, free, no API key): On-device 25MB model +- Piper (local, free, no API key): OHF-Voice/piper1-gpl neural VITS, 44 languages Custom command providers: - Users can declare any number of named providers with ``type: command`` @@ -109,6 +110,18 @@ def _import_kittentts(): return KittenTTS +def _import_piper(): + """Lazy import Piper. Returns the PiperVoice class or raises ImportError. + + Piper is an optional, fully-local neural TTS engine (Home Assistant / + Open Home Foundation). 
``pip install piper-tts`` provides cross-platform + wheels (Linux / macOS / Windows, x86_64 + ARM64) with embedded espeak-ng. + Voice models (.onnx + .onnx.json) are downloaded on first use. + """ + from piper import PiperVoice + return PiperVoice + + # =========================================================================== # Defaults # =========================================================================== @@ -120,6 +133,7 @@ DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5" DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts" DEFAULT_KITTENTTS_MODEL = "KittenML/kitten-tts-nano-0.8-int8" # 25MB DEFAULT_KITTENTTS_VOICE = "Jasper" +DEFAULT_PIPER_VOICE = "en_US-lessac-medium" # balanced size/quality DEFAULT_OPENAI_VOICE = "alloy" DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1" DEFAULT_MINIMAX_MODEL = "speech-2.8-hd" @@ -163,6 +177,7 @@ PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = { "elevenlabs": 10000, # fallback when model-aware lookup can't resolve (multilingual_v2) "neutts": 2000, # local model, quality falls off on long text "kittentts": 2000, # local 25MB model + "piper": 5000, # local VITS model, phoneme-based; practical cap } # ElevenLabs caps vary by model_id. https://elevenlabs.io/docs/overview/models @@ -308,6 +323,7 @@ BUILTIN_TTS_PROVIDERS = frozenset({ "gemini", "neutts", "kittentts", + "piper", }) DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS = 120 @@ -1293,6 +1309,167 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> return output_path +# =========================================================================== +# Provider: Piper (local, neural VITS, 44 languages) +# =========================================================================== + +# Module-level cache for Piper voice instances. Voices are keyed on their +# absolute .onnx model path so switching voices doesn't invalidate older +# cached voices. 
+_piper_voice_cache: Dict[str, Any] = {} + + +def _check_piper_available() -> bool: + """Check whether the piper-tts package is importable.""" + try: + import importlib.util + return importlib.util.find_spec("piper") is not None + except Exception: + return False + + +def _get_piper_voices_dir() -> Path: + """Return the directory where Hermes caches Piper voice models. + + Resolves to ``~/.hermes/cache/piper-voices/`` under the active + HERMES_HOME so voice downloads follow profile boundaries. + """ + from hermes_constants import get_hermes_dir + root = Path(get_hermes_dir("cache/piper-voices", "piper_voices_cache")) + root.mkdir(parents=True, exist_ok=True) + return root + + +def _resolve_piper_voice_path(voice: str, download_dir: Path) -> str: + """Resolve *voice* (a model name or path) to a concrete .onnx file path. + + Accepts any of: + - Absolute / expanded path to an .onnx file the user already has + - A voice *name* like ``en_US-lessac-medium`` (downloads to + ``download_dir`` on first use via ``python -m piper.download_voices``) + + Raises RuntimeError if the model can't be located or downloaded. + """ + if not voice: + voice = DEFAULT_PIPER_VOICE + + # Case 1: user gave a direct file path. + candidate = Path(voice).expanduser() + if candidate.suffix.lower() == ".onnx" and candidate.exists(): + return str(candidate) + + # Case 2: user gave a voice *name*. See if it's already downloaded. + cached = download_dir / f"{voice}.onnx" + if cached.exists() and (download_dir / f"{voice}.onnx.json").exists(): + return str(cached) + + # Case 3: download the voice. piper ships a download helper module. 
+ import sys as _sys + logger.info("[Piper] Downloading voice '%s' to %s (first use)", voice, download_dir) + try: + result = subprocess.run( + [_sys.executable, "-m", "piper.download_voices", voice, + "--download-dir", str(download_dir)], + capture_output=True, text=True, timeout=300, + ) + except subprocess.TimeoutExpired as exc: + raise RuntimeError( + f"Piper voice download timed out after 300s for '{voice}'" + ) from exc + + if result.returncode != 0: + stderr = (result.stderr or "").strip() or "no stderr output" + raise RuntimeError( + f"Piper voice download failed for '{voice}': {stderr[:400]}" + ) + + if not cached.exists(): + raise RuntimeError( + f"Piper voice download completed but {cached} is missing — " + f"check voice name (see: https://github.com/OHF-Voice/piper1-gpl/" + f"blob/main/docs/VOICES.md)" + ) + return str(cached) + + +def _generate_piper_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: + """Generate speech using the local Piper engine. + + Loads the voice model once per process (cached by absolute path) and + writes a WAV file. Caller is responsible for converting to MP3/Opus + via ffmpeg when a different output format is required. 
+ """ + PiperVoice = _import_piper() + import wave + + piper_config = tts_config.get("piper", {}) if isinstance(tts_config, dict) else {} + voice_name = piper_config.get("voice") or DEFAULT_PIPER_VOICE + download_dir = Path(piper_config.get("voices_dir") or _get_piper_voices_dir()).expanduser() + download_dir.mkdir(parents=True, exist_ok=True) + use_cuda = bool(piper_config.get("use_cuda", False)) + + model_path = _resolve_piper_voice_path(voice_name, download_dir) + + cache_key = f"{model_path}::cuda={use_cuda}" + global _piper_voice_cache + if cache_key not in _piper_voice_cache: + logger.info("[Piper] Loading voice: %s", model_path) + _piper_voice_cache[cache_key] = PiperVoice.load(model_path, use_cuda=use_cuda) + logger.info("[Piper] Voice loaded") + voice = _piper_voice_cache[cache_key] + + # Optional synthesis knobs — only pass a SynthesisConfig when at least + # one advanced knob is configured, so we don't depend on a newer Piper + # version than the user's installed one unless we need to. + syn_config = None + has_advanced = any( + k in piper_config + for k in ("length_scale", "noise_scale", "noise_w_scale", "volume", "normalize_audio") + ) + if has_advanced: + try: + from piper import SynthesisConfig # type: ignore + syn_config = SynthesisConfig( + length_scale=float(piper_config.get("length_scale", 1.0)), + noise_scale=float(piper_config.get("noise_scale", 0.667)), + noise_w_scale=float(piper_config.get("noise_w_scale", 0.8)), + volume=float(piper_config.get("volume", 1.0)), + normalize_audio=bool(piper_config.get("normalize_audio", True)), + ) + except ImportError: + logger.warning( + "[Piper] SynthesisConfig not available in this piper-tts " + "version — advanced knobs ignored" + ) + + # Piper outputs WAV. Caller handles downstream MP3/Opus conversion. 
+ wav_path = output_path + if not output_path.endswith(".wav"): + wav_path = output_path.rsplit(".", 1)[0] + ".wav" + + with wave.open(wav_path, "wb") as wav_file: + if syn_config is not None: + voice.synthesize_wav(text, wav_file, syn_config=syn_config) + else: + voice.synthesize_wav(text, wav_file) + + # Convert to desired format if caller requested mp3/ogg + if wav_path != output_path: + ffmpeg = shutil.which("ffmpeg") + if ffmpeg: + conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path] + subprocess.run(conv_cmd, check=True, timeout=30) + try: + os.remove(wav_path) + except OSError: + pass + else: + # No ffmpeg — keep WAV and return that path + os.rename(wav_path, output_path) + + return output_path + + # =========================================================================== # Provider: KittenTTS (local, lightweight) # =========================================================================== @@ -1517,6 +1694,19 @@ def text_to_speech_tool( logger.info("Generating speech with KittenTTS (local, ~25MB)...") _generate_kittentts(text, file_str, tts_config) + elif provider == "piper": + try: + _import_piper() + except ImportError: + return json.dumps({ + "success": False, + "error": "Piper provider selected but 'piper-tts' package not installed. 
" + "Run 'hermes tools' and select Piper under TTS, or install manually: " + "pip install piper-tts", + }, ensure_ascii=False) + logger.info("Generating speech with Piper (local)...") + _generate_piper_tts(text, file_str, tts_config) + else: # Default: Edge TTS (free), with NeuTTS as local fallback edge_available = True @@ -1566,7 +1756,7 @@ def text_to_speech_tool( if opus_path: file_str = opus_path voice_compatible = file_str.endswith(".ogg") - elif provider in ("edge", "neutts", "minimax", "xai", "kittentts") and not file_str.endswith(".ogg"): + elif provider in ("edge", "neutts", "minimax", "xai", "kittentts", "piper") and not file_str.endswith(".ogg"): opus_path = _convert_to_opus(file_str) if opus_path: file_str = opus_path @@ -1657,6 +1847,8 @@ def check_tts_requirements() -> bool: return True if _check_kittentts_available(): return True + if _check_piper_available(): + return True return False @@ -1954,6 +2146,7 @@ if __name__ == "__main__": f"{'set' if resolve_openai_audio_api_key() else 'not set (VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY)'}" ) print(f" MiniMax: {'API key set' if get_env_value('MINIMAX_API_KEY') else 'not set (MINIMAX_API_KEY)'}") + print(f" Piper: {'installed' if _check_piper_available() else 'not installed (pip install piper-tts)'}") print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}") print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}") diff --git a/website/docs/user-guide/features/overview.md b/website/docs/user-guide/features/overview.md index 36bebe8b02..fa61d68547 100644 --- a/website/docs/user-guide/features/overview.md +++ b/website/docs/user-guide/features/overview.md @@ -31,7 +31,7 @@ Hermes Agent includes a rich set of capabilities that extend far beyond basic ch - **[Browser Automation](browser.md)** — Full browser automation with multiple backends: Browserbase cloud, Browser Use cloud, local Chrome via CDP, or local Chromium. Navigate websites, fill forms, and extract information. 
- **[Vision & Image Paste](vision.md)** — Multimodal vision support. Paste images from your clipboard into the CLI and ask the agent to analyze, describe, or work with them using any vision-capable model. - **[Image Generation](image-generation.md)** — Generate images from text prompts using FAL.ai. Nine models supported (FLUX 2 Klein/Pro, GPT-Image 1.5/2, Nano Banana Pro, Ideogram V3, Recraft V4 Pro, Qwen, Z-Image Turbo); pick one via `hermes tools`. -- **[Voice & TTS](tts.md)** — Text-to-speech output and voice message transcription across all messaging platforms, with nine provider options: Edge TTS (free), ElevenLabs, OpenAI TTS, MiniMax, Mistral Voxtral, Google Gemini, xAI, NeuTTS, and KittenTTS. +- **[Voice & TTS](tts.md)** — Text-to-speech output and voice message transcription across all messaging platforms, with ten native provider options: Edge TTS (free), ElevenLabs, OpenAI TTS, MiniMax, Mistral Voxtral, Google Gemini, xAI, NeuTTS, KittenTTS, and Piper — plus custom command providers for any local TTS CLI. 
## Integrations diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 2a77edc4c1..fa632a83b4 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -14,7 +14,7 @@ If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription, ## Text-to-Speech -Convert text to speech with nine providers: +Convert text to speech with ten providers: | Provider | Quality | Cost | API Key | |----------|---------|------|---------| @@ -27,6 +27,7 @@ Convert text to speech with nine providers: | **xAI TTS** | Excellent | Paid | `XAI_API_KEY` | | **NeuTTS** | Good | Free (local) | None needed | | **KittenTTS** | Good | Free (local) | None needed | +| **Piper** | Good | Free (local) | None needed | ### Platform Delivery @@ -42,7 +43,7 @@ Convert text to speech with nine providers: ```yaml # In ~/.hermes/config.yaml tts: - provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "gemini" | "xai" | "neutts" | "kittentts" + provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "gemini" | "xai" | "neutts" | "kittentts" | "piper" speed: 1.0 # Global speed multiplier (provider-specific settings override this) edge: voice: "en-US-AriaNeural" # 322 voices, 74 languages @@ -83,6 +84,15 @@ tts: voice: Jasper # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo speed: 1.0 # 0.5 - 2.0 clean_text: true # Expand numbers, currencies, units + piper: + voice: en_US-lessac-medium # voice name (auto-downloaded) OR absolute path to .onnx + # voices_dir: '' # default: ~/.hermes/cache/piper-voices/ + # use_cuda: false # requires onnxruntime-gpu + # length_scale: 1.0 # 2.0 = twice as slow + # noise_scale: 0.667 + # noise_w_scale: 0.8 + # volume: 1.0 # 0.5 = half as loud + # normalize_audio: true ``` **Speed control**: The global `tts.speed` value applies to all providers by default. 
Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed). @@ -98,6 +108,7 @@ Telegram voice bubbles require Opus/OGG audio format: - **xAI TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles - **KittenTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles +- **Piper** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles ```bash # Ubuntu/Debian @@ -110,27 +121,51 @@ brew install ffmpeg sudo dnf install ffmpeg ``` -Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, and KittenTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). +Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, KittenTTS, and Piper audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). :::tip If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider. ::: +### Piper (local, 44 languages) + +Piper is a fast, local neural TTS engine from the Open Home Foundation (the Home Assistant maintainers). It runs entirely on CPU, supports **44 languages** with pre-trained voices, and needs no API key. + +**Install via `hermes tools`** → Voice & TTS → Piper — Hermes runs `pip install piper-tts` for you. Or install manually: `pip install piper-tts`. + +**Switch to Piper:** + +```yaml +tts: + provider: piper + piper: + voice: en_US-lessac-medium +``` + +On the first TTS call for a voice that isn't cached locally, Hermes runs `python -m piper.download_voices <voice>` and downloads the model (~20-90MB depending on quality tier) into `~/.hermes/cache/piper-voices/`. Subsequent calls reuse the cached model. 
+ +**Picking a voice.** The [full voice catalog](https://github.com/OHF-Voice/piper1-gpl/blob/main/docs/VOICES.md) covers English, Spanish, French, German, Italian, Dutch, Portuguese, Russian, Polish, Turkish, Chinese, Arabic, Hindi, and more — each with `x_low` / `low` / `medium` / `high` quality tiers. Sample voices at [rhasspy.github.io/piper-samples](https://rhasspy.github.io/piper-samples/). + +**Using a pre-downloaded voice.** Set `tts.piper.voice` to an absolute path ending in `.onnx`: + +```yaml +tts: + piper: + voice: /path/to/my-custom-voice.onnx +``` + +**Advanced knobs** (`tts.piper.length_scale` / `noise_scale` / `noise_w_scale` / `volume` / `normalize_audio`, `use_cuda`) correspond 1:1 to Piper's `SynthesisConfig`. They're ignored on older `piper-tts` versions. + ### Custom command providers -If a TTS engine you want isn't natively supported (Piper, VoxCPM, MLX-Kokoro, XTTS CLI, a voice-cloning script, anything else that exposes a CLI), you can wire it in as a **command-type provider** without writing any Python. Hermes writes the input text to a temp UTF-8 file, runs your shell command, and reads the audio file the command produced. +If a TTS engine you want isn't natively supported (VoxCPM, MLX-Kokoro, XTTS CLI, a voice-cloning script, anything else that exposes a CLI), you can wire it in as a **command-type provider** without writing any Python. Hermes writes the input text to a temp UTF-8 file, runs your shell command, and reads the audio file the command produced. Declare one or more providers under `tts.providers.<name>` and switch between them with `tts.provider: <name>` — the same way you switch between built-ins like `edge` and `openai`. 
```yaml tts: - provider: piper-en # pick any name under tts.providers + provider: voxcpm # pick any name under tts.providers providers: - piper-en: - type: command - command: "piper -m ~/models/en_US-amy.onnx -f {output_path} < {input_path}" - output_format: wav - voxcpm: type: command command: "voxcpm --ref ~/voice.wav --text-file {input_path} --out {output_path}" @@ -143,6 +178,11 @@ tts: command: "python -m mlx_kokoro --in {input_path} --out {output_path} --voice {voice}" voice: af_sky output_format: wav + + piper-custom: # native Piper also supports custom .onnx via tts.piper.voice + type: command + command: "piper -m /path/to/custom.onnx -f {output_path} < {input_path}" + output_format: wav ``` #### Placeholders