mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(tts): document NeuTTS provider and align install guidance (#1903)
Co-authored-by: charles-édouard <59705750+ccbbccbb@users.noreply.github.com>
This commit is contained in:
parent
fb923d5efc
commit
11f029c311
7 changed files with 79 additions and 17 deletions
|
|
@ -1710,7 +1710,7 @@ def _install_neutts_deps() -> bool:
|
||||||
return True
|
return True
|
||||||
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
|
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
|
||||||
print_error(f"Failed to install neutts: {e}")
|
print_error(f"Failed to install neutts: {e}")
|
||||||
print_info("Try manually: pip install neutts[all]")
|
print_info("Try manually: python -m pip install -U neutts[all]")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ Usage:
|
||||||
python -m tools.neutts_synth --text "Hello" --out output.wav \
|
python -m tools.neutts_synth --text "Hello" --out output.wav \
|
||||||
--ref-audio samples/jo.wav --ref-text samples/jo.txt
|
--ref-audio samples/jo.wav --ref-text samples/jo.txt
|
||||||
|
|
||||||
Requires: pip install neutts[all]
|
Requires: python -m pip install -U neutts[all]
|
||||||
System: apt install espeak-ng (or brew install espeak-ng)
|
System: apt install espeak-ng (or brew install espeak-ng)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -75,7 +75,7 @@ def main():
|
||||||
try:
|
try:
|
||||||
from neutts import NeuTTS
|
from neutts import NeuTTS
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr)
|
print("Error: neutts not installed. Run: python -m pip install -U neutts[all]", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
tts = NeuTTS(
|
tts = NeuTTS(
|
||||||
|
|
|
||||||
|
|
@ -423,8 +423,8 @@ def text_to_speech_tool(
|
||||||
if not _check_neutts_available():
|
if not _check_neutts_available():
|
||||||
return json.dumps({
|
return json.dumps({
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": "NeuTTS provider selected but neutts_cli is not installed. "
|
"error": "NeuTTS provider selected but neutts is not installed. "
|
||||||
"Install the NeuTTS skill and run the bootstrap helper first."
|
"Run hermes setup and choose NeuTTS, or install espeak-ng and run python -m pip install -U neutts[all]."
|
||||||
}, ensure_ascii=False)
|
}, ensure_ascii=False)
|
||||||
logger.info("Generating speech with NeuTTS (local)...")
|
logger.info("Generating speech with NeuTTS (local)...")
|
||||||
_generate_neutts(text, file_str, tts_config)
|
_generate_neutts(text, file_str, tts_config)
|
||||||
|
|
|
||||||
|
|
@ -72,6 +72,12 @@ pip install hermes-agent[messaging]
|
||||||
pip install hermes-agent[tts-premium]
|
pip install hermes-agent[tts-premium]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Local NeuTTS (optional)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pip install -U neutts[all]
|
||||||
|
```
|
||||||
|
|
||||||
### Everything
|
### Everything
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|
@ -84,18 +90,21 @@ pip install hermes-agent[all]
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
brew install portaudio ffmpeg opus
|
brew install portaudio ffmpeg opus
|
||||||
|
brew install espeak-ng
|
||||||
```
|
```
|
||||||
|
|
||||||
### Ubuntu / Debian
|
### Ubuntu / Debian
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo apt install portaudio19-dev ffmpeg libopus0
|
sudo apt install portaudio19-dev ffmpeg libopus0
|
||||||
|
sudo apt install espeak-ng
|
||||||
```
|
```
|
||||||
|
|
||||||
Why these matter:
|
Why these matter:
|
||||||
- `portaudio` → microphone input / playback for CLI voice mode
|
- `portaudio` → microphone input / playback for CLI voice mode
|
||||||
- `ffmpeg` → audio conversion for TTS and messaging delivery
|
- `ffmpeg` → audio conversion for TTS and messaging delivery
|
||||||
- `opus` → Discord voice codec support
|
- `opus` → Discord voice codec support
|
||||||
|
- `espeak-ng` → phonemizer backend for NeuTTS
|
||||||
|
|
||||||
## Step 4: choose STT and TTS providers
|
## Step 4: choose STT and TTS providers
|
||||||
|
|
||||||
|
|
@ -133,9 +142,20 @@ ELEVENLABS_API_KEY=***
|
||||||
#### Text-to-speech
|
#### Text-to-speech
|
||||||
|
|
||||||
- `edge` → free and good enough for most users
|
- `edge` → free and good enough for most users
|
||||||
|
- `neutts` → free local/on-device TTS
|
||||||
- `elevenlabs` → best quality
|
- `elevenlabs` → best quality
|
||||||
- `openai` → good middle ground
|
- `openai` → good middle ground
|
||||||
|
|
||||||
|
### If you use `hermes setup`
|
||||||
|
|
||||||
|
If you choose NeuTTS in the setup wizard, Hermes checks whether `neutts` is already installed. If it is missing, the wizard explains that NeuTTS needs the Python package `neutts` and the system package `espeak-ng`, and offers to install both for you. If you accept, it installs `espeak-ng` with your platform's package manager and then runs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m pip install -U neutts[all]
|
||||||
|
```
|
||||||
|
|
||||||
|
If you skip that install or it fails, the wizard falls back to Edge TTS.
|
||||||
|
|
||||||
## Step 5: recommended config
|
## Step 5: recommended config
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
|
|
@ -159,6 +179,18 @@ tts:
|
||||||
|
|
||||||
This is a good conservative default for most people.
|
This is a good conservative default for most people.
|
||||||
|
|
||||||
|
If you want local TTS instead, switch the `tts` block to:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
tts:
|
||||||
|
provider: "neutts"
|
||||||
|
neutts:
|
||||||
|
ref_audio: ''
|
||||||
|
ref_text: ''
|
||||||
|
model: neuphonic/neutts-air-q4-gguf
|
||||||
|
device: cpu
|
||||||
|
```
|
||||||
|
|
||||||
## Use case 1: CLI voice mode
|
## Use case 1: CLI voice mode
|
||||||
|
|
||||||
## Turn it on
|
## Turn it on
|
||||||
|
|
|
||||||
|
|
@ -929,7 +929,7 @@ You can also change the reasoning effort at runtime with the `/reasoning` comman
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
tts:
|
tts:
|
||||||
provider: "edge" # "edge" | "elevenlabs" | "openai"
|
provider: "edge" # "edge" | "elevenlabs" | "openai" | "neutts"
|
||||||
edge:
|
edge:
|
||||||
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
||||||
elevenlabs:
|
elevenlabs:
|
||||||
|
|
@ -938,6 +938,11 @@ tts:
|
||||||
openai:
|
openai:
|
||||||
model: "gpt-4o-mini-tts"
|
model: "gpt-4o-mini-tts"
|
||||||
voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer
|
voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer
|
||||||
|
neutts:
|
||||||
|
ref_audio: ''
|
||||||
|
ref_text: ''
|
||||||
|
model: neuphonic/neutts-air-q4-gguf
|
||||||
|
device: cpu
|
||||||
```
|
```
|
||||||
|
|
||||||
This controls both the `text_to_speech` tool and spoken replies in voice mode (`/voice tts` in the CLI or messaging gateway).
|
This controls both the `text_to_speech` tool and spoken replies in voice mode (`/voice tts` in the CLI or messaging gateway).
|
||||||
|
|
|
||||||
|
|
@ -10,13 +10,14 @@ Hermes Agent supports both text-to-speech output and voice message transcription
|
||||||
|
|
||||||
## Text-to-Speech
|
## Text-to-Speech
|
||||||
|
|
||||||
Convert text to speech with three providers:
|
Convert text to speech with four providers:
|
||||||
|
|
||||||
| Provider | Quality | Cost | API Key |
|
| Provider | Quality | Cost | API Key |
|
||||||
|----------|---------|------|---------|
|
|----------|---------|------|---------|
|
||||||
| **Edge TTS** (default) | Good | Free | None needed |
|
| **Edge TTS** (default) | Good | Free | None needed |
|
||||||
| **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` |
|
| **ElevenLabs** | Excellent | Paid | `ELEVENLABS_API_KEY` |
|
||||||
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
|
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
|
||||||
|
| **NeuTTS** | Good | Free | None needed |
|
||||||
|
|
||||||
### Platform Delivery
|
### Platform Delivery
|
||||||
|
|
||||||
|
|
@ -32,7 +33,7 @@ Convert text to speech with three providers:
|
||||||
```yaml
|
```yaml
|
||||||
# In ~/.hermes/config.yaml
|
# In ~/.hermes/config.yaml
|
||||||
tts:
|
tts:
|
||||||
provider: "edge" # "edge" | "elevenlabs" | "openai"
|
provider: "edge" # "edge" | "elevenlabs" | "openai" | "neutts"
|
||||||
edge:
|
edge:
|
||||||
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
||||||
elevenlabs:
|
elevenlabs:
|
||||||
|
|
@ -41,6 +42,11 @@ tts:
|
||||||
openai:
|
openai:
|
||||||
model: "gpt-4o-mini-tts"
|
model: "gpt-4o-mini-tts"
|
||||||
voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer
|
voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer
|
||||||
|
neutts:
|
||||||
|
ref_audio: ''
|
||||||
|
ref_text: ''
|
||||||
|
model: neuphonic/neutts-air-q4-gguf
|
||||||
|
device: cpu
|
||||||
```
|
```
|
||||||
|
|
||||||
### Telegram Voice Bubbles & ffmpeg
|
### Telegram Voice Bubbles & ffmpeg
|
||||||
|
|
@ -49,6 +55,7 @@ Telegram voice bubbles require Opus/OGG audio format:
|
||||||
|
|
||||||
- **OpenAI and ElevenLabs** produce Opus natively — no extra setup
|
- **OpenAI and ElevenLabs** produce Opus natively — no extra setup
|
||||||
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
|
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
|
||||||
|
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert it to Opus for Telegram voice bubbles
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Ubuntu/Debian
|
# Ubuntu/Debian
|
||||||
|
|
@ -61,7 +68,7 @@ brew install ffmpeg
|
||||||
sudo dnf install ffmpeg
|
sudo dnf install ffmpeg
|
||||||
```
|
```
|
||||||
|
|
||||||
Without ffmpeg, Edge TTS audio is sent as a regular audio file (playable, but shows as a rectangular player instead of a voice bubble).
|
Without ffmpeg, Edge TTS and NeuTTS audio are sent as regular audio files (playable, but shown as a rectangular player instead of a voice bubble).
|
||||||
|
|
||||||
:::tip
|
:::tip
|
||||||
If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider.
|
If you want voice bubbles without installing ffmpeg, switch to the OpenAI or ElevenLabs provider.
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,9 @@ pip install hermes-agent[messaging]
|
||||||
# Premium TTS (ElevenLabs)
|
# Premium TTS (ElevenLabs)
|
||||||
pip install hermes-agent[tts-premium]
|
pip install hermes-agent[tts-premium]
|
||||||
|
|
||||||
|
# Local TTS (NeuTTS, optional)
|
||||||
|
python -m pip install -U neutts[all]
|
||||||
|
|
||||||
# Everything at once
|
# Everything at once
|
||||||
pip install hermes-agent[all]
|
pip install hermes-agent[all]
|
||||||
```
|
```
|
||||||
|
|
@ -54,6 +57,8 @@ pip install hermes-agent[all]
|
||||||
| `messaging` | `discord.py[voice]`, `python-telegram-bot`, `aiohttp` | Discord & Telegram bots |
|
| `messaging` | `discord.py[voice]`, `python-telegram-bot`, `aiohttp` | Discord & Telegram bots |
|
||||||
| `tts-premium` | `elevenlabs` | ElevenLabs TTS provider |
|
| `tts-premium` | `elevenlabs` | ElevenLabs TTS provider |
|
||||||
|
|
||||||
|
Optional local TTS provider: install `neutts` separately with `python -m pip install -U neutts[all]`. On first use it downloads the model automatically.
|
||||||
|
|
||||||
:::info
|
:::info
|
||||||
`discord.py[voice]` installs **PyNaCl** (for voice encryption) and **opus bindings** automatically. This is required for Discord voice channel support.
|
`discord.py[voice]` installs **PyNaCl** (for voice encryption) and **opus bindings** automatically. This is required for Discord voice channel support.
|
||||||
:::
|
:::
|
||||||
|
|
@ -63,9 +68,11 @@ pip install hermes-agent[all]
|
||||||
```bash
|
```bash
|
||||||
# macOS
|
# macOS
|
||||||
brew install portaudio ffmpeg opus
|
brew install portaudio ffmpeg opus
|
||||||
|
brew install espeak-ng # for NeuTTS
|
||||||
|
|
||||||
# Ubuntu/Debian
|
# Ubuntu/Debian
|
||||||
sudo apt install portaudio19-dev ffmpeg libopus0
|
sudo apt install portaudio19-dev ffmpeg libopus0
|
||||||
|
sudo apt install espeak-ng # for NeuTTS
|
||||||
```
|
```
|
||||||
|
|
||||||
| Dependency | Purpose | Required For |
|
| Dependency | Purpose | Required For |
|
||||||
|
|
@ -73,6 +80,7 @@ sudo apt install portaudio19-dev ffmpeg libopus0
|
||||||
| **PortAudio** | Microphone input and audio playback | CLI voice mode |
|
| **PortAudio** | Microphone input and audio playback | CLI voice mode |
|
||||||
| **ffmpeg** | Audio format conversion (MP3 → Opus, PCM → WAV) | All platforms |
|
| **ffmpeg** | Audio format conversion (MP3 → Opus, PCM → WAV) | All platforms |
|
||||||
| **Opus** | Discord voice codec | Discord voice channels |
|
| **Opus** | Discord voice codec | Discord voice channels |
|
||||||
|
| **espeak-ng** | Phonemizer backend | Local NeuTTS provider |
|
||||||
|
|
||||||
### API Keys
|
### API Keys
|
||||||
|
|
||||||
|
|
@ -84,8 +92,9 @@ Add to `~/.hermes/.env`:
|
||||||
GROQ_API_KEY=your-key # Groq Whisper — fast, free tier (cloud)
|
GROQ_API_KEY=your-key # Groq Whisper — fast, free tier (cloud)
|
||||||
VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — paid (cloud)
|
VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — paid (cloud)
|
||||||
|
|
||||||
# Text-to-Speech (optional — Edge TTS works without any key)
|
# Text-to-Speech (optional — Edge TTS and NeuTTS work without any key)
|
||||||
ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality
|
ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality
|
||||||
|
# VOICE_TOOLS_OPENAI_KEY above also enables OpenAI TTS
|
||||||
```
|
```
|
||||||
|
|
||||||
:::tip
|
:::tip
|
||||||
|
|
@ -303,8 +312,9 @@ DISCORD_ALLOWED_USERS=your-user-id
|
||||||
# STT — local provider needs no key (pip install faster-whisper)
|
# STT — local provider needs no key (pip install faster-whisper)
|
||||||
# GROQ_API_KEY=your-key # Alternative: cloud-based, fast, free tier
|
# GROQ_API_KEY=your-key # Alternative: cloud-based, fast, free tier
|
||||||
|
|
||||||
# TTS — optional, Edge TTS (free) is the default
|
# TTS — optional. Edge TTS and NeuTTS need no key.
|
||||||
# ELEVENLABS_API_KEY=your-key # Premium quality
|
# ELEVENLABS_API_KEY=your-key # Premium quality
|
||||||
|
# VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI TTS / Whisper
|
||||||
```
|
```
|
||||||
|
|
||||||
### Start the Gateway
|
### Start the Gateway
|
||||||
|
|
@ -385,7 +395,7 @@ stt:
|
||||||
|
|
||||||
# Text-to-Speech
|
# Text-to-Speech
|
||||||
tts:
|
tts:
|
||||||
provider: "edge" # "edge" (free) | "elevenlabs" | "openai"
|
provider: "edge" # "edge" (free) | "elevenlabs" | "openai" | "neutts"
|
||||||
edge:
|
edge:
|
||||||
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
||||||
elevenlabs:
|
elevenlabs:
|
||||||
|
|
@ -394,6 +404,11 @@ tts:
|
||||||
openai:
|
openai:
|
||||||
model: "gpt-4o-mini-tts"
|
model: "gpt-4o-mini-tts"
|
||||||
voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer
|
voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer
|
||||||
|
neutts:
|
||||||
|
ref_audio: ''
|
||||||
|
ref_text: ''
|
||||||
|
model: neuphonic/neutts-air-q4-gguf
|
||||||
|
device: cpu
|
||||||
```
|
```
|
||||||
|
|
||||||
### Environment Variables
|
### Environment Variables
|
||||||
|
|
@ -410,9 +425,9 @@ STT_OPENAI_MODEL=whisper-1 # Override default OpenAI STT model
|
||||||
GROQ_BASE_URL=https://api.groq.com/openai/v1 # Custom Groq endpoint
|
GROQ_BASE_URL=https://api.groq.com/openai/v1 # Custom Groq endpoint
|
||||||
STT_OPENAI_BASE_URL=https://api.openai.com/v1 # Custom OpenAI STT endpoint
|
STT_OPENAI_BASE_URL=https://api.openai.com/v1 # Custom OpenAI STT endpoint
|
||||||
|
|
||||||
# Text-to-Speech providers (Edge TTS needs no key)
|
# Text-to-Speech providers (Edge TTS and NeuTTS need no key)
|
||||||
ELEVENLABS_API_KEY=... # ElevenLabs (premium quality)
|
ELEVENLABS_API_KEY=... # ElevenLabs (premium quality)
|
||||||
# OpenAI TTS uses VOICE_TOOLS_OPENAI_KEY
|
# VOICE_TOOLS_OPENAI_KEY above also enables OpenAI TTS
|
||||||
|
|
||||||
# Discord voice channel
|
# Discord voice channel
|
||||||
DISCORD_BOT_TOKEN=...
|
DISCORD_BOT_TOKEN=...
|
||||||
|
|
@ -440,6 +455,9 @@ Provider priority (automatic fallback): **local** > **groq** > **openai**
|
||||||
| **Edge TTS** | Good | Free | ~1s | No |
|
| **Edge TTS** | Good | Free | ~1s | No |
|
||||||
| **ElevenLabs** | Excellent | Paid | ~2s | Yes |
|
| **ElevenLabs** | Excellent | Paid | ~2s | Yes |
|
||||||
| **OpenAI TTS** | Good | Paid | ~1.5s | Yes |
|
| **OpenAI TTS** | Good | Paid | ~1.5s | Yes |
|
||||||
|
| **NeuTTS** | Good | Free | Depends on CPU/GPU | No |
|
||||||
|
|
||||||
|
NeuTTS uses the `tts.neutts` config block above.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue