def _install_kittentts_deps() -> bool:
    """Install the KittenTTS wheel and ``soundfile`` with pip.

    Runs ``<sys.executable> -m pip install -U <wheel> soundfile --quiet`` with
    a 300-second timeout and reports the outcome through the ``print_*``
    helpers. This function does not prompt; the caller is expected to have
    asked the user for approval before calling it.

    Returns:
        True when pip exits with status 0. False when pip exits non-zero,
        exceeds the timeout, or the interpreter cannot be launched at all;
        in those cases a manual install hint is printed.
    """
    import subprocess
    import sys

    wheel_url = (
        "https://github.com/KittenML/KittenTTS/releases/download/"
        "0.8.1/kittentts-0.8.1-py3-none-any.whl"
    )
    print()
    print_info("Installing kittentts Python package (~25-80MB model downloaded on first use)...")
    print()
    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"],
            check=True, timeout=300,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
        # OSError covers the case where the interpreter itself cannot be
        # started (missing or non-executable path). Without it the exception
        # would escape and abort the setup flow instead of letting the caller
        # act on a False result.
        print_error(f"Failed to install kittentts: {e}")
        print_info(f"Try manually: python -m pip install -U '{wheel_url}' soundfile")
        return False

    print_success("kittentts installed successfully")
    return True
"neutts": "NeuTTS", + "kittentts": "KittenTTS", } current_label = provider_labels.get(current_provider, current_provider) @@ -939,9 +975,10 @@ def _setup_tts_provider(config: dict): "Mistral Voxtral TTS (multilingual, native Opus, needs API key)", "Google Gemini TTS (30 prebuilt voices, prompt-controllable, needs API key)", "NeuTTS (local on-device, free, ~300MB model download)", + "KittenTTS (local on-device, free, lightweight ~25-80MB ONNX)", ] ) - providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "gemini", "neutts"]) + providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "gemini", "neutts", "kittentts"]) choices.append(f"Keep current ({current_label})") keep_current_idx = len(choices) - 1 idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) @@ -1060,6 +1097,29 @@ def _setup_tts_provider(config: dict): print_warning("No API key provided. Falling back to Edge TTS.") selected = "edge" + elif selected == "kittentts": + # Check if already installed + try: + import importlib.util + already_installed = importlib.util.find_spec("kittentts") is not None + except Exception: + already_installed = False + + if already_installed: + print_success("KittenTTS is already installed") + else: + print() + print_info("KittenTTS is lightweight (~25-80MB, CPU-only, no API key required).") + print_info("Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo") + print() + if prompt_yes_no("Install KittenTTS now?", True): + if not _install_kittentts_deps(): + print_warning("KittenTTS installation incomplete. Falling back to Edge TTS.") + selected = "edge" + else: + print_info("Skipping install. 
Set tts.provider to 'kittentts' after installing manually.") + selected = "edge" + # Save the selection if "tts" not in config: config["tts"] = {} diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index cb1f39371..24c5fde5f 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -182,6 +182,14 @@ TOOL_CATEGORIES = { ], "tts_provider": "gemini", }, + { + "name": "KittenTTS", + "badge": "local · free", + "tag": "Lightweight local ONNX TTS (~25MB), no API key", + "env_vars": [], + "tts_provider": "kittentts", + "post_setup": "kittentts", + }, ], }, "web": { @@ -423,6 +431,36 @@ def _run_post_setup(post_setup_key: str): _print_warning(" Node.js not found. Install Camofox via Docker:") _print_info(" docker run -p 9377:9377 -e CAMOFOX_PORT=9377 jo-inc/camofox-browser") + elif post_setup_key == "kittentts": + try: + __import__("kittentts") + _print_success(" kittentts is already installed") + return + except ImportError: + pass + import subprocess + _print_info(" Installing kittentts (~25-80MB model, CPU-only)...") + wheel_url = ( + "https://github.com/KittenML/KittenTTS/releases/download/" + "0.8.1/kittentts-0.8.1-py3-none-any.whl" + ) + try: + result = subprocess.run( + [sys.executable, "-m", "pip", "install", "-U", wheel_url, "soundfile", "--quiet"], + capture_output=True, text=True, timeout=300, + ) + if result.returncode == 0: + _print_success(" kittentts installed") + _print_info(" Voices: Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo") + _print_info(" Models: KittenML/kitten-tts-nano-0.8-int8 (25MB), micro (41MB), mini (80MB)") + else: + _print_warning(" kittentts install failed:") + _print_info(f" {result.stderr.strip()[:300]}") + _print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile") + except subprocess.TimeoutExpired: + _print_warning(" kittentts install timed out (>5min)") + _print_info(f" Run manually: python -m pip install -U '{wheel_url}' soundfile") + elif post_setup_key == "rl_training": 
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove ``HERMES_SESSION_PLATFORM`` from the environment for every test."""
    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)


@pytest.fixture(autouse=True)
def clear_kittentts_cache():
    """Empty ``tts_tool._kittentts_model_cache`` before and after each test."""
    import tools.tts_tool as tts_module

    tts_module._kittentts_model_cache.clear()
    yield
    tts_module._kittentts_model_cache.clear()
class TestGenerateKittenTts:
    """Exercise ``tools.tts_tool._generate_kittentts`` against stubbed modules."""

    def test_successful_wav_generation(self, tmp_path, mock_kittentts_module):
        """A ``.wav`` target is returned, exists, and needs one model build and one generate call."""
        from tools.tts_tool import _generate_kittentts

        model_stub, ctor_stub = mock_kittentts_module
        wav_file = tmp_path / "test.wav"
        target = str(wav_file)

        returned = _generate_kittentts("Hello world", target, {})

        assert returned == target
        assert wav_file.exists()
        ctor_stub.assert_called_once()
        model_stub.generate.assert_called_once()

    def test_config_passes_voice_speed_cleantext(self, tmp_path, mock_kittentts_module):
        """Voice, speed and clean_text from the config reach ``generate`` as keyword arguments."""
        from tools.tts_tool import _generate_kittentts

        model_stub = mock_kittentts_module[0]
        provider_settings = {
            "model": "KittenML/kitten-tts-mini-0.8",
            "voice": "Luna",
            "speed": 1.25,
            "clean_text": False,
        }
        _generate_kittentts(
            "Hi there",
            str(tmp_path / "out.wav"),
            {"kittentts": provider_settings},
        )

        forwarded = model_stub.generate.call_args.kwargs
        assert forwarded["voice"] == "Luna"
        assert forwarded["speed"] == 1.25
        assert forwarded["clean_text"] is False

    def test_default_model_and_voice(self, tmp_path, mock_kittentts_module):
        """An empty config falls back to the module's default model and voice."""
        from tools.tts_tool import (
            DEFAULT_KITTENTTS_MODEL,
            DEFAULT_KITTENTTS_VOICE,
            _generate_kittentts,
        )

        model_stub, ctor_stub = mock_kittentts_module
        _generate_kittentts("Hi", str(tmp_path / "out.wav"), {})

        ctor_stub.assert_called_once_with(DEFAULT_KITTENTTS_MODEL)
        used_voice = model_stub.generate.call_args.kwargs["voice"]
        assert used_voice == DEFAULT_KITTENTTS_VOICE

    def test_model_is_cached_across_calls(self, tmp_path, mock_kittentts_module):
        """Two calls with the same (default) model construct the model class once."""
        from tools.tts_tool import _generate_kittentts

        ctor_stub = mock_kittentts_module[1]
        for phrase, filename in (("One", "a.wav"), ("Two", "b.wav")):
            _generate_kittentts(phrase, str(tmp_path / filename), {})

        # Same model name → class instantiated exactly once
        assert ctor_stub.call_count == 1

    def test_different_models_are_cached_separately(self, tmp_path, mock_kittentts_module):
        """Two distinct model names each trigger their own construction."""
        from tools.tts_tool import _generate_kittentts

        ctor_stub = mock_kittentts_module[1]
        requests = (
            ("A", "a.wav", "KittenML/kitten-tts-nano-0.8-int8"),
            ("B", "b.wav", "KittenML/kitten-tts-mini-0.8"),
        )
        for phrase, filename, model_name in requests:
            _generate_kittentts(
                phrase,
                str(tmp_path / filename),
                {"kittentts": {"model": model_name}},
            )

        assert ctor_stub.call_count == 2

    def test_non_wav_extension_triggers_ffmpeg_conversion(
        self, tmp_path, mock_kittentts_module, monkeypatch
    ):
        """Non-.wav output path causes WAV → target ffmpeg conversion."""
        from tools import tts_tool as _tt

        recorded_cmds = []

        def fake_run(cmd, check=False, timeout=None, **kw):
            recorded_cmds.append(cmd)
            # Emulate ffmpeg writing the output file (last argv element).
            import pathlib
            pathlib.Path(cmd[-1]).write_bytes(b"fake-mp3-data")
            return MagicMock(returncode=0)

        monkeypatch.setattr(
            _tt.shutil,
            "which",
            lambda cmd: "/usr/bin/ffmpeg" if cmd == "ffmpeg" else None,
        )
        monkeypatch.setattr(_tt.subprocess, "run", fake_run)

        mp3_target = str(tmp_path / "test.mp3")
        returned = _tt._generate_kittentts("Hi", mp3_target, {})

        assert returned == mp3_target
        assert len(recorded_cmds) == 1
        assert recorded_cmds[0][0] == "/usr/bin/ffmpeg"

    def test_missing_kittentts_raises_import_error(self, tmp_path, monkeypatch):
        """A ``None`` entry for kittentts in ``sys.modules`` makes generation raise."""
        import sys

        # A None entry in sys.modules makes `import kittentts` fail.
        monkeypatch.setitem(sys.modules, "kittentts", None)
        from tools.tts_tool import _generate_kittentts

        expected_errors = (ImportError, TypeError)
        with pytest.raises(expected_errors):
            _generate_kittentts("Hi", str(tmp_path / "out.wav"), {})
_check_kittentts_available + + fake_spec = MagicMock() + monkeypatch.setattr( + importlib.util, "find_spec", + lambda name: fake_spec if name == "kittentts" else None, + ) + assert _check_kittentts_available() is True + + def test_reports_unavailable_when_package_missing(self, monkeypatch): + import importlib.util + from tools.tts_tool import _check_kittentts_available + + monkeypatch.setattr(importlib.util, "find_spec", lambda name: None) + assert _check_kittentts_available() is False + + +class TestDispatcherBranch: + def test_kittentts_not_installed_returns_helpful_error(self, monkeypatch, tmp_path): + """When provider=kittentts but package missing, return JSON error with setup hint.""" + import sys + monkeypatch.setitem(sys.modules, "kittentts", None) + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + + from tools.tts_tool import text_to_speech_tool + + # Write a config telling it to use kittentts + import yaml + (tmp_path / "config.yaml").write_text( + yaml.safe_dump({"tts": {"provider": "kittentts"}}) + ) + + result = json.loads(text_to_speech_tool(text="Hello")) + assert result["success"] is False + assert "kittentts" in result["error"].lower() + assert "hermes setup tts" in result["error"].lower() diff --git a/tools/tts_tool.py b/tools/tts_tool.py index fa5a8159c..b83fa4d73 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -703,6 +703,15 @@ def _check_neutts_available() -> bool: return False +def _check_kittentts_available() -> bool: + """Check if the kittentts engine is importable (installed locally).""" + try: + import importlib.util + return importlib.util.find_spec("kittentts") is not None + except Exception: + return False + + def _default_neutts_ref_audio() -> str: """Return path to the bundled default voice reference audio.""" return str(Path(__file__).parent / "neutts_samples" / "jo.wav") @@ -955,7 +964,8 @@ def text_to_speech_tool( return json.dumps({ "success": False, "error": "KittenTTS provider selected but 'kittentts' package not 
installed. " - "Run: pip install https://github.com/KittenML/KittenTTS/releases/download/0.8.1/kittentts-0.8.1-py3-none-any.whl" + "Run 'hermes setup tts' and choose KittenTTS, or install manually: " + "pip install https://github.com/KittenML/KittenTTS/releases/download/0.8.1/kittentts-0.8.1-py3-none-any.whl" }, ensure_ascii=False) logger.info("Generating speech with KittenTTS (local, ~25MB)...") _generate_kittentts(text, file_str, tts_config) @@ -1084,6 +1094,8 @@ def check_tts_requirements() -> bool: pass if _check_neutts_available(): return True + if _check_kittentts_available(): + return True return False diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 6f7fc8950..2bf6430ff 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -14,7 +14,7 @@ If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription, ## Text-to-Speech -Convert text to speech with eight providers: +Convert text to speech with nine providers: | Provider | Quality | Cost | API Key | |----------|---------|------|---------| @@ -25,7 +25,8 @@ Convert text to speech with eight providers: | **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` | | **Google Gemini TTS** | Excellent | Free tier | `GEMINI_API_KEY` | | **xAI TTS** | Excellent | Paid | `XAI_API_KEY` | -| **NeuTTS** | Good | Free | None needed | +| **NeuTTS** | Good | Free (local) | None needed | +| **KittenTTS** | Good | Free (local) | None needed | ### Platform Delivery @@ -41,7 +42,7 @@ Convert text to speech with eight providers: ```yaml # In ~/.hermes/config.yaml tts: - provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "gemini" | "xai" | "neutts" + provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "gemini" | "xai" | "neutts" | "kittentts" speed: 1.0 # Global speed multiplier (provider-specific settings override this) edge: voice: "en-US-AriaNeural" # 
322 voices, 74 languages @@ -77,6 +78,11 @@ tts: ref_text: '' model: neuphonic/neutts-air-q4-gguf device: cpu + kittentts: + model: KittenML/kitten-tts-nano-0.8-int8 # 25MB int8; also: kitten-tts-micro-0.8 (41MB), kitten-tts-mini-0.8 (80MB) + voice: Jasper # Jasper, Bella, Luna, Bruno, Rosie, Hugo, Kiki, Leo + speed: 1.0 # 0.5 - 2.0 + clean_text: true # Expand numbers, currencies, units ``` **Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed). @@ -91,6 +97,7 @@ Telegram voice bubbles require Opus/OGG audio format: - **Google Gemini TTS** outputs raw PCM and uses **ffmpeg** to encode Opus directly for Telegram voice bubbles - **xAI TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles +- **KittenTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles ```bash # Ubuntu/Debian @@ -103,7 +110,7 @@ brew install ffmpeg sudo dnf install ffmpeg ``` -Without ffmpeg, Edge TTS, MiniMax TTS, and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). +Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, and KittenTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble). :::tip If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider.