mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
feat: add NeuTTS optional skill + local TTS provider backend
* feat(skills): add bundled neutts optional skill Add NeuTTS optional skill with CLI scaffold, bootstrap helper, and sample voice profile. Also fixes skills_hub.py to handle binary assets (WAV files) during skill installation. Changes: - optional-skills/mlops/models/neutts/ — skill + CLI scaffold - tools/skills_hub.py — binary asset support (read_bytes, write_bytes) - tests/tools/test_skills_hub.py — regression tests for binary assets * feat(tts): add NeuTTS as local TTS provider backend Add NeuTTS as a fourth TTS provider option alongside Edge, ElevenLabs, and OpenAI. NeuTTS runs fully on-device via neutts_cli — no API key needed. Provider behavior: - Explicit: set tts.provider to 'neutts' in config.yaml - Fallback: when Edge TTS is unavailable and neutts_cli is installed, automatically falls back to NeuTTS instead of failing - check_tts_requirements() now includes NeuTTS in availability checks NeuTTS outputs WAV natively. For Telegram voice bubbles, ffmpeg converts to Opus (same pattern as Edge TTS). Changes: - tools/tts_tool.py — _generate_neutts(), _check_neutts_available(), provider dispatch, fallback logic, Opus conversion - hermes_cli/config.py — tts.neutts config defaults --------- Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>
This commit is contained in:
parent
766f4aae2b
commit
cb0deb5f9d
15 changed files with 1359 additions and 24 deletions
|
|
@ -0,0 +1,55 @@
|
|||
# NeuTTS CLI
|
||||
|
||||
Small standalone CLI for installing, checking, and running [NeuTTS](https://github.com/neuphonic/neutts) locally.
|
||||
|
||||
This scaffold is designed to be a good fit for a Hermes optional skill:
|
||||
|
||||
- predictable commands
|
||||
- machine-friendly output for inspection
|
||||
- local voice profile management
|
||||
- direct local synthesis
|
||||
|
||||
## Commands
|
||||
|
||||
```bash
|
||||
neutts install --all
|
||||
neutts doctor
|
||||
neutts list-models
|
||||
neutts add-voice demo --ref-audio ./samples/jo.wav --ref-text-file ./samples/jo.txt
|
||||
neutts list-voices
|
||||
neutts synth --voice demo --text Hello from NeuTTS --out ./out.wav
|
||||
neutts synth --voice demo --text Quick smoke test
|
||||
```
|
||||
|
||||
## Install the bundled scaffold
|
||||
|
||||
```bash
|
||||
cd optional-skills/mlops/models/neutts/assets/neutts-cli
|
||||
python -m pip install -e .
|
||||
```
|
||||
|
||||
## Add the bundled sample profile
|
||||
|
||||
This skill bundles an upstream NeuTTS sample reference in `samples/`.
|
||||
|
||||
```bash
|
||||
cd optional-skills/mlops/models/neutts/assets/neutts-cli
|
||||
PYTHONPATH=src python -m neutts_cli.cli add-voice jo-demo \
|
||||
--ref-audio ./samples/jo.wav \
|
||||
--ref-text-file ./samples/jo.txt \
|
||||
--language en
|
||||
```
|
||||
|
||||
Then inspect it with:
|
||||
|
||||
```bash
|
||||
PYTHONPATH=src python -m neutts_cli.cli list-voices
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- `install` installs the upstream `neutts` package into the current Python environment.
|
||||
- `list-voices` shows local voice profiles created with `add-voice`.
|
||||
- `synth` uses NeuTTS reference cloning. A voice profile is just a saved reference audio/text pair.
|
||||
- `synth` accepts quoted or unquoted text and defaults to `./out.wav` when `--out` is omitted.
|
||||
- GGUF / `llama-cpp-python` acceleration can vary by platform, so the CLI prints follow-up guidance instead of forcing one build recipe.
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=68", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "neutts-cli"
|
||||
version = "0.1.0"
|
||||
description = "Standalone CLI for installing and running NeuTTS locally"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
{name = "Hermes Agent Contributors"}
|
||||
]
|
||||
dependencies = []
|
||||
|
||||
[project.scripts]
|
||||
neutts = "neutts_cli.cli:main"
|
||||
|
||||
[tool.setuptools]
|
||||
package-dir = {"" = "src"}
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
|
|
@ -0,0 +1 @@
|
|||
So I just tried Neuphonic and I’m genuinely impressed. It's super responsive, it sounds clean, supports voice cloning, and the agent feature is fun to play with too. Highly recommend it for podcasts, conversations, or even just messing around with voiceovers.
|
||||
Binary file not shown.
|
|
@ -0,0 +1,3 @@
|
|||
__all__ = ["__version__"]
|
||||
|
||||
__version__ = "0.1.0"
|
||||
|
|
@ -0,0 +1,26 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import wave
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def write_wav(path: str | Path, samples, sample_rate: int) -> Path:
    """Write ``samples`` to ``path`` as a mono 16-bit PCM WAV file.

    Args:
        path: Destination file; ``~`` is expanded and the path is resolved.
            Missing parent directories are created.
        samples: Array-like audio data. It is converted to float32, flattened,
            clipped to [-1.0, 1.0] and scaled by 32767 before being cast to
            int16.
        sample_rate: Frame rate recorded in the WAV header.

    Returns:
        The resolved output path.

    Raises:
        RuntimeError: If numpy cannot be imported.
    """
    target = Path(path).expanduser().resolve()
    target.parent.mkdir(parents=True, exist_ok=True)

    # numpy is imported lazily so the rest of the CLI works without it.
    try:
        import numpy as np
    except ImportError as exc:
        raise RuntimeError("numpy is required to write NeuTTS audio output") from exc

    waveform = np.asarray(samples, dtype=np.float32).flatten()
    frames = (np.clip(waveform, -1.0, 1.0) * 32767.0).astype(np.int16).tobytes()

    with wave.open(str(target), "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname)
        writer.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(frames)

    return target
|
||||
|
|
@ -0,0 +1,204 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
from .config import AppConfig
|
||||
from .core import (
|
||||
KNOWN_MODELS,
|
||||
doctor_report,
|
||||
list_voices,
|
||||
load_voice,
|
||||
platform_notes,
|
||||
run_install,
|
||||
save_voice,
|
||||
synthesize,
|
||||
)
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level argument parser with every CLI subcommand.

    The chosen subcommand is stored on the namespace as ``command`` and is
    required. Subcommands are registered in this order: ``install``,
    ``doctor``, ``list-models``, ``list-voices``, ``add-voice``, ``synth``,
    ``config``.
    """
    root = argparse.ArgumentParser(
        description="Standalone CLI for local NeuTTS workflows"
    )
    commands = root.add_subparsers(dest="command", required=True)

    # install: boolean switches only.
    install = commands.add_parser(
        "install", help="Install NeuTTS into the current Python environment"
    )
    for switch, description in (
        ("--llama", "Install llama-cpp-python support via neutts[llama]"),
        ("--onnx", "Install ONNX decoder support via neutts[onnx]"),
        ("--all", "Install all upstream NeuTTS extras"),
        ("--dry-run", "Print the install command without running it"),
    ):
        install.add_argument(switch, action="store_true", help=description)

    # Subcommands that take no arguments of their own.
    for command_name, description in (
        ("doctor", "Inspect NeuTTS CLI environment"),
        ("list-models", "Show known official NeuTTS model repositories"),
        ("list-voices", "Show local voice profiles"),
    ):
        commands.add_parser(command_name, help=description)

    # add-voice: positional profile name plus reference audio/transcript.
    add_voice = commands.add_parser(
        "add-voice", help="Save a local voice profile from a reference sample"
    )
    for flag, options in (
        ("name", {"help": "Voice profile name"}),
        ("--ref-audio", {"required": True, "help": "Reference WAV file"}),
        ("--ref-text", {"help": "Transcript text for the reference audio"}),
        (
            "--ref-text-file",
            {"help": "Path to a text file containing the reference transcript"},
        ),
        ("--language", {"default": "unknown", "help": "Optional language tag"}),
    ):
        add_voice.add_argument(flag, **options)

    # synth: --text takes one or more words so unquoted text still works.
    synth = commands.add_parser("synth", help="Synthesize speech to a WAV file")
    for flag, options in (
        ("--text", {"nargs": "+", "required": True, "help": "Text to synthesize"}),
        ("--voice", {"help": "Saved voice profile name"}),
        ("--ref-audio", {"help": "Reference audio path when not using --voice"}),
        ("--ref-text", {"help": "Reference transcript when not using --voice"}),
        ("--out", {"default": "out.wav", "help": "Output WAV file path"}),
    ):
        synth.add_argument(flag, **options)

    # config: every option defaults to None, meaning "leave unchanged".
    config = commands.add_parser(
        "config", help="View or update default synthesis settings"
    )
    for flag in (
        "--backbone-repo",
        "--backbone-device",
        "--codec-repo",
        "--codec-device",
    ):
        config.add_argument(flag)
    config.add_argument("--sample-rate", type=int)
    config.add_argument(
        "--default-voice",
        help="Voice profile name to use when --voice is omitted from synth",
    )

    return root
|
||||
|
||||
|
||||
def _read_ref_text(args: argparse.Namespace) -> str:
|
||||
if args.ref_text:
|
||||
return args.ref_text.strip()
|
||||
if args.ref_text_file:
|
||||
with open(args.ref_text_file, "r", encoding="utf-8") as handle:
|
||||
return handle.read().strip()
|
||||
raise ValueError("Provide either --ref-text or --ref-text-file")
|
||||
|
||||
|
||||
def _normalize_text_arg(value: str | list[str]) -> str:
|
||||
if isinstance(value, list):
|
||||
return " ".join(value).strip()
|
||||
return value.strip()
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return a process exit code.

    Each subcommand prints its result to stdout as JSON indented by two
    spaces.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 on success, 1 when the selected command raised an exception (the
        message is printed to stderr as ``error: ...``).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    def emit(payload) -> None:
        # All commands share the same output format.
        print(json.dumps(payload, indent=2))

    command = args.command
    try:
        if command == "install":
            executed = run_install(args.llama, args.onnx, args.all, args.dry_run)
            emit(
                {
                    "commands": executed,
                    "notes": platform_notes(),
                    "dry_run": args.dry_run,
                }
            )
        elif command == "doctor":
            emit(doctor_report())
        elif command == "list-models":
            emit(KNOWN_MODELS)
        elif command == "list-voices":
            emit([voice.__dict__ for voice in list_voices()])
        elif command == "add-voice":
            saved_to = save_voice(
                name=args.name,
                ref_audio=args.ref_audio,
                ref_text=_read_ref_text(args),
                language=args.language,
            )
            # Reload from disk so the output reflects what was persisted.
            stored = load_voice(args.name)
            emit({"saved": str(saved_to), "voice": stored.__dict__})
        elif command == "synth":
            written = synthesize(
                text=_normalize_text_arg(args.text),
                out=args.out,
                voice=args.voice,
                ref_audio=args.ref_audio,
                ref_text=args.ref_text,
            )
            emit({"output": str(written)})
        elif command == "config":
            settings = AppConfig.load()
            requested = {
                field: getattr(args, field, None)
                for field in (
                    "backbone_repo",
                    "backbone_device",
                    "codec_repo",
                    "codec_device",
                    "sample_rate",
                    "default_voice",
                )
            }
            # Only options given on the command line override stored values.
            overrides = {
                field: value for field, value in requested.items() if value is not None
            }
            for field, value in overrides.items():
                setattr(settings, field, value)
            if overrides:
                settings.save()
            emit(settings.__dict__)
        else:
            # parser.error() exits the process; the return is a safeguard.
            parser.error(f"Unknown command: {args.command}")
            return 2
        return 0
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
DEFAULT_BACKBONE = "neuphonic/neutts-nano"
|
||||
DEFAULT_CODEC = "neuphonic/neucodec"
|
||||
DEFAULT_SAMPLE_RATE = 24000
|
||||
|
||||
|
||||
def app_home() -> Path:
    """Return the directory that holds the CLI's config and voice profiles.

    A non-empty ``NEUTTS_CLI_HOME`` environment variable wins (with ``~``
    expanded); otherwise ``~/.neutts-cli`` is used.
    """
    custom_root = os.environ.get("NEUTTS_CLI_HOME")
    if not custom_root:
        return Path.home() / ".neutts-cli"
    return Path(custom_root).expanduser()
|
||||
|
||||
|
||||
def config_path() -> Path:
    """Return the location of ``config.json`` inside the app home directory."""
    return app_home().joinpath("config.json")
|
||||
|
||||
|
||||
def voices_dir() -> Path:
    """Return the ``voices`` directory inside the app home directory."""
    return app_home().joinpath("voices")
|
||||
|
||||
|
||||
@dataclass
class AppConfig:
    """Default synthesis settings persisted as JSON at ``config_path()``."""

    # Model repository and device passed to NeuTTS for the backbone.
    backbone_repo: str = DEFAULT_BACKBONE
    backbone_device: str = "cpu"
    # Model repository and device passed to NeuTTS for the codec.
    codec_repo: str = DEFAULT_CODEC
    codec_device: str = "cpu"
    # Frame rate used when writing synthesized audio.
    sample_rate: int = DEFAULT_SAMPLE_RATE
    # Voice profile used by synth when neither a voice nor reference audio is given.
    default_voice: str | None = None

    @classmethod
    def load(cls) -> "AppConfig":
        """Read the config file, falling back to defaults for missing keys.

        Returns a default instance when the file does not exist. An empty or
        otherwise falsy ``default_voice`` is normalized to ``None``.
        """
        source = config_path()
        if not source.exists():
            return cls()

        stored = json.loads(source.read_text(encoding="utf-8"))
        fallbacks = {
            "backbone_repo": DEFAULT_BACKBONE,
            "backbone_device": "cpu",
            "codec_repo": DEFAULT_CODEC,
            "codec_device": "cpu",
        }
        values = {key: stored.get(key, fallback) for key, fallback in fallbacks.items()}
        values["sample_rate"] = int(stored.get("sample_rate", DEFAULT_SAMPLE_RATE))
        values["default_voice"] = stored.get("default_voice") or None
        return cls(**values)

    def save(self) -> Path:
        """Write the settings to the config file and return its path.

        The app home directory is created first if it does not exist.
        """
        app_home().mkdir(parents=True, exist_ok=True)
        destination = config_path()
        field_names = (
            "backbone_repo",
            "backbone_device",
            "codec_repo",
            "codec_device",
            "sample_rate",
            "default_voice",
        )
        serialized = json.dumps(
            {field: getattr(self, field) for field in field_names}, indent=2
        )
        destination.write_text(serialized + "\n", encoding="utf-8")
        return destination
|
||||
|
|
@ -0,0 +1,197 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import importlib.util
|
||||
import json
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from .audio import write_wav
|
||||
from .config import AppConfig, app_home, voices_dir
|
||||
|
||||
|
||||
KNOWN_MODELS = [
|
||||
"neuphonic/neutts-air",
|
||||
"neuphonic/neutts-air-q8-gguf",
|
||||
"neuphonic/neutts-air-q4-gguf",
|
||||
"neuphonic/neutts-nano",
|
||||
"neuphonic/neutts-nano-q8-gguf",
|
||||
"neuphonic/neutts-nano-q4-gguf",
|
||||
"neuphonic/neutts-nano-french",
|
||||
"neuphonic/neutts-nano-german",
|
||||
"neuphonic/neutts-nano-spanish",
|
||||
]
|
||||
|
||||
|
||||
@dataclass
class VoiceProfile:
    """A saved reference audio/transcript pair used for voice cloning."""

    # Profile identifier; save_voice uses it as the directory name.
    name: str
    # Path to the stored reference audio file.
    ref_audio: str
    # Transcript of the reference audio.
    ref_text: str
    # Free-form language tag.
    language: str = "unknown"
|
||||
|
||||
|
||||
def is_module_available(module_name: str) -> bool:
    """Report whether ``module_name`` can be located without importing it.

    Args:
        module_name: Absolute module name, e.g. ``"numpy"``.

    Returns:
        True when an import spec is found, False otherwise. Lookup failures
        are also reported as False rather than raised.
    """
    try:
        return importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # find_spec imports the parent package of a dotted name and raises
        # ModuleNotFoundError when that parent is missing; it raises
        # ValueError for an already-imported module whose __spec__ is None.
        # Either way the module is not usable through a normal lookup.
        return False
|
||||
|
||||
|
||||
def run_install(
    include_llama: bool, include_onnx: bool, include_all: bool, dry_run: bool
) -> list[str]:
    """Install the upstream ``neutts`` package with pip, or just describe it.

    Args:
        include_llama: Request the ``llama`` extra.
        include_onnx: Request the ``onnx`` extra.
        include_all: Request the ``all`` extra; overrides the two flags above.
        dry_run: When true, pip is not executed.

    Returns:
        A one-element list holding the pip command rendered as a shell
        command line.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    import shlex

    if include_all:
        extras = ["all"]
    else:
        extras = [
            extra
            for extra, wanted in (("llama", include_llama), ("onnx", include_onnx))
            if wanted
        ]

    requirement = f"neutts[{','.join(extras)}]" if extras else "neutts"
    command = [sys.executable, "-m", "pip", "install", "-U", requirement]

    # Quote the rendered command so it can be pasted into a shell: an
    # unquoted "neutts[all]" is treated as a glob by zsh, and an interpreter
    # path containing spaces would otherwise split into several words.
    if sys.platform.startswith("win"):
        rendered = subprocess.list2cmdline(command)
    else:
        rendered = shlex.join(command)

    if not dry_run:
        subprocess.run(command, check=True)
    return [rendered]
|
||||
|
||||
|
||||
def platform_notes() -> list[str]:
    """Return GGUF acceleration hints for the current operating system.

    The lookup key is ``platform.system()``; systems other than Darwin,
    Linux and Windows get an empty list.
    """
    notes_by_system = {
        "Darwin": [
            "For Apple Silicon GGUF acceleration, install the llama extra with BLAS/Accelerate flags.",
            "See the upstream NeuTTS README for the recommended CMAKE_ARGS invocation.",
        ],
        "Linux": [
            "For GGUF acceleration on Linux, install OpenBLAS and then reinstall the llama extra with matching CMAKE_ARGS.",
        ],
        "Windows": [
            "For GGUF acceleration on Windows, install OpenBLAS first and then install the llama extra from PowerShell with CMAKE_ARGS set.",
        ],
    }
    return notes_by_system.get(platform.system(), [])
|
||||
|
||||
|
||||
def doctor_report() -> dict:
    """Collect a JSON-serializable snapshot of the local NeuTTS environment.

    The report covers the Python version and platform, the app home
    directory, the loaded configuration, whether the optional modules can be
    found, whether ``ffmpeg`` is on PATH, and how many voice profiles exist.
    """
    profiles_root = voices_dir()
    if profiles_root.exists():
        # One voice.json per profile directory.
        profile_total = sum(1 for _ in profiles_root.glob("*/voice.json"))
    else:
        profile_total = 0

    settings = AppConfig.load()
    ffmpeg_found = shutil.which("ffmpeg") is not None

    return {
        "python": sys.version.split()[0],
        "platform": platform.platform(),
        "app_home": str(app_home()),
        "config": asdict(settings),
        "neutts_installed": is_module_available("neutts"),
        "numpy_installed": is_module_available("numpy"),
        "onnxruntime_installed": is_module_available("onnxruntime"),
        "llama_cpp_installed": is_module_available("llama_cpp"),
        "ffmpeg_in_path": ffmpeg_found,
        "voice_profiles": profile_total,
        "default_voice": settings.default_voice,
    }
|
||||
|
||||
|
||||
def save_voice(
    name: str, ref_audio: str, ref_text: str, language: str = "unknown"
) -> Path:
    """Store a voice profile under ``voices_dir() / name``.

    The reference audio is copied into the profile directory, the transcript
    is written to ``reference.txt``, and the profile metadata is written to
    ``voice.json``. Existing files with the same names are replaced.

    Args:
        name: Profile name; must be a single, non-empty path component.
        ref_audio: Path to the reference audio file (``~`` is expanded).
        ref_text: Transcript of the reference audio; must not be blank.
        language: Free-form language tag stored with the profile.

    Returns:
        Path to the written ``voice.json``.

    Raises:
        ValueError: If ``name`` is not a plain directory name or ``ref_text``
            is blank.
        FileNotFoundError: If ``ref_audio`` is not an existing file.
    """
    # The name becomes a directory directly below voices_dir(). Anything else
    # ("", "..", "a/b") would escape that directory or create a profile that
    # list_voices(), which globs "*/voice.json", cannot see.
    if not name or not name.strip() or name in {".", ".."} or Path(name).name != name:
        raise ValueError(f"Invalid voice profile name: {name!r}")

    transcript = ref_text.strip()
    if not transcript:
        # synthesize() rejects profiles with an empty transcript, so refuse
        # to create one here.
        raise ValueError("Reference text must not be empty")

    source_audio = Path(ref_audio).expanduser().resolve()
    if not source_audio.is_file():
        raise FileNotFoundError(f"Reference audio not found: {source_audio}")

    destination = voices_dir() / name
    destination.mkdir(parents=True, exist_ok=True)
    audio_target = destination / source_audio.name
    text_target = destination / "reference.txt"
    metadata_target = destination / "voice.json"

    # Skip the copy when the profile is re-saved from its own stored audio.
    if audio_target.resolve() != source_audio:
        # Remove first so a link at the target is replaced, not written through.
        audio_target.unlink(missing_ok=True)
        shutil.copyfile(source_audio, audio_target)

    text_target.unlink(missing_ok=True)
    text_target.write_text(transcript + "\n", encoding="utf-8")

    profile = VoiceProfile(
        name=name,
        ref_audio=str(audio_target),
        ref_text=transcript,
        language=language,
    )
    metadata_target.write_text(
        json.dumps(asdict(profile), indent=2) + "\n", encoding="utf-8"
    )
    return metadata_target
|
||||
|
||||
|
||||
def load_voice(name: str) -> VoiceProfile:
    """Load the voice profile stored under ``voices_dir() / name``.

    Raises:
        FileNotFoundError: If the profile has no ``voice.json``.
    """
    profile_file = voices_dir().joinpath(name, "voice.json")
    if profile_file.exists():
        fields = json.loads(profile_file.read_text(encoding="utf-8"))
        return VoiceProfile(**fields)
    raise FileNotFoundError(f"Voice profile not found: {name}")
|
||||
|
||||
|
||||
def list_voices() -> list[VoiceProfile]:
    """Return every stored voice profile, ordered by metadata file path.

    Returns an empty list when the voices directory does not exist.
    """
    root = voices_dir()
    if not root.exists():
        return []

    return [
        VoiceProfile(**json.loads(metadata_file.read_text(encoding="utf-8")))
        for metadata_file in sorted(root.glob("*/voice.json"))
    ]
|
||||
|
||||
|
||||
def synthesize(
    text: str,
    out: str,
    voice: str | None = None,
    ref_audio: str | None = None,
    ref_text: str | None = None,
) -> Path:
    """Synthesize ``text`` with NeuTTS reference cloning and write a WAV file.

    Args:
        text: Text to speak; must not be blank.
        out: Output WAV path.
        voice: Name of a saved voice profile. When given, the profile's
            audio and transcript replace ``ref_audio`` and ``ref_text``.
        ref_audio: Reference audio path, used when no voice is selected.
        ref_text: Transcript of ``ref_audio``.

    Returns:
        The path of the written WAV file, as returned by ``write_wav``.

    Raises:
        ValueError: If ``text`` is blank, or no complete reference
            (voice, or audio plus transcript) is available.
        FileNotFoundError: If the voice profile or reference audio is missing.
        RuntimeError: If the ``neutts`` package is not installed.
    """
    if not text.strip():
        raise ValueError("Input text is required")

    # Load the configuration once; it supplies both the default voice and
    # the model settings used below.
    config = AppConfig.load()

    # Fall back to the configured default voice when no voice is specified.
    if not voice and not ref_audio and config.default_voice:
        voice = config.default_voice

    if voice:
        profile = load_voice(voice)
        ref_audio = profile.ref_audio
        ref_text = profile.ref_text

    if not ref_audio or not ref_text:
        raise ValueError("Provide either --voice or both --ref-audio and --ref-text")

    # Fail before the model is constructed rather than after.
    # NOTE(review): assumes encode_reference takes a local file path, as the
    # --ref-audio help text describes — confirm against the NeuTTS API.
    reference_path = Path(ref_audio).expanduser()
    if not reference_path.is_file():
        raise FileNotFoundError(f"Reference audio not found: {reference_path}")

    if not is_module_available("neutts"):
        raise RuntimeError("NeuTTS is not installed. Run 'neutts install' first.")

    # Imported dynamically because neutts is an optional, separately
    # installed dependency.
    neu_module = importlib.import_module("neutts")
    NeuTTS = neu_module.NeuTTS

    tts = NeuTTS(
        backbone_repo=config.backbone_repo,
        backbone_device=config.backbone_device,
        codec_repo=config.codec_repo,
        codec_device=config.codec_device,
    )
    ref_codes = tts.encode_reference(str(reference_path))
    wav = tts.infer(text, ref_codes, ref_text)
    return write_wav(out, wav, config.sample_rate)
|
||||
Loading…
Add table
Add a link
Reference in a new issue