feat(tts): add Google Gemini TTS provider (#11229)

Adds Google Gemini TTS as the seventh voice provider, with 30 prebuilt
voices (Zephyr, Puck, Kore, Enceladus, Gacrux, etc.) and natural-language
prompt control. Integrates through the existing provider chain:

- tools/tts_tool.py: new _generate_gemini_tts() calls the
  generativelanguage REST endpoint with responseModalities=[AUDIO],
  wraps the returned 24kHz mono 16-bit PCM (L16) in a WAV RIFF header,
  then ffmpeg-converts to MP3 or Opus depending on output extension.
  For .ogg output, libopus is forced explicitly so Telegram voice
  bubbles get Opus (ffmpeg defaults to Vorbis for .ogg).
- hermes_cli/tools_config.py: exposes 'Google Gemini TTS' as a provider
  option in the curses-based 'hermes tools' UI.
- hermes_cli/setup.py: adds gemini to the setup wizard picker, tool
  status display, and API key prompt branch (accepts existing
  GEMINI_API_KEY or GOOGLE_API_KEY, falls back to Edge if neither set).
- tests/tools/test_tts_gemini.py: 15 unit tests covering WAV header
  wrap correctness, env var fallback (GEMINI/GOOGLE), voice/model
  overrides, snake_case vs camelCase inlineData handling, HTTP error
  surfacing, and empty-audio edge cases.
- docs: TTS features page updated to list seven providers with the new
  gemini config block and ffmpeg notes.

Live-tested with a real API key against gemini-2.5-flash-preview-tts: .wav,
.mp3, and Telegram-compatible .ogg (Opus codec) output all produce valid,
playable audio.
This commit is contained in:
Teknium 2026-04-16 14:23:16 -07:00 committed by GitHub
parent 80855f964e
commit fce6c3cdf6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 506 additions and 6 deletions

View file

@ -430,6 +430,8 @@ def _print_setup_summary(config: dict, hermes_home):
tool_status.append(("Text-to-Speech (MiniMax)", True, None)) tool_status.append(("Text-to-Speech (MiniMax)", True, None))
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"): elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None)) tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
elif tts_provider == "gemini" and (get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")):
tool_status.append(("Text-to-Speech (Google Gemini)", True, None))
elif tts_provider == "neutts": elif tts_provider == "neutts":
try: try:
import importlib.util import importlib.util
@ -913,6 +915,7 @@ def _setup_tts_provider(config: dict):
"xai": "xAI TTS", "xai": "xAI TTS",
"minimax": "MiniMax TTS", "minimax": "MiniMax TTS",
"mistral": "Mistral Voxtral TTS", "mistral": "Mistral Voxtral TTS",
"gemini": "Google Gemini TTS",
"neutts": "NeuTTS", "neutts": "NeuTTS",
} }
current_label = provider_labels.get(current_provider, current_provider) current_label = provider_labels.get(current_provider, current_provider)
@ -935,10 +938,11 @@ def _setup_tts_provider(config: dict):
"xAI TTS (Grok voices, needs API key)", "xAI TTS (Grok voices, needs API key)",
"MiniMax TTS (high quality with voice cloning, needs API key)", "MiniMax TTS (high quality with voice cloning, needs API key)",
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)", "Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
"Google Gemini TTS (30 prebuilt voices, prompt-controllable, needs API key)",
"NeuTTS (local on-device, free, ~300MB model download)", "NeuTTS (local on-device, free, ~300MB model download)",
] ]
) )
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"]) providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "gemini", "neutts"])
choices.append(f"Keep current ({current_label})") choices.append(f"Keep current ({current_label})")
keep_current_idx = len(choices) - 1 keep_current_idx = len(choices) - 1
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
@ -1045,6 +1049,19 @@ def _setup_tts_provider(config: dict):
print_warning("No API key provided. Falling back to Edge TTS.") print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge" selected = "edge"
elif selected == "gemini":
existing = get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")
if not existing:
print()
print_info("Get a free API key at https://aistudio.google.com/app/apikey")
api_key = prompt("Gemini API key for TTS", password=True)
if api_key:
save_env_value("GEMINI_API_KEY", api_key)
print_success("Gemini TTS API key saved")
else:
print_warning("No API key provided. Falling back to Edge TTS.")
selected = "edge"
# Save the selection # Save the selection
if "tts" not in config: if "tts" not in config:
config["tts"] = {} config["tts"] = {}

View file

@ -172,6 +172,15 @@ TOOL_CATEGORIES = {
], ],
"tts_provider": "mistral", "tts_provider": "mistral",
}, },
{
"name": "Google Gemini TTS",
"badge": "preview",
"tag": "30 prebuilt voices, controllable via prompts",
"env_vars": [
{"key": "GEMINI_API_KEY", "prompt": "Gemini API key", "url": "https://aistudio.google.com/app/apikey"},
],
"tts_provider": "gemini",
},
], ],
}, },
"web": { "web": {

View file

@ -0,0 +1,287 @@
"""Tests for the Google Gemini TTS provider in tools/tts_tool.py."""
import base64
import struct
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove Gemini-related environment variables before each test runs.

    Being autouse, this applies to every test in the module so results do not
    depend on whatever keys happen to be set in the developer's shell.
    """
    scrubbed = (
        "GEMINI_API_KEY",
        "GOOGLE_API_KEY",
        "GEMINI_BASE_URL",
        "HERMES_SESSION_PLATFORM",
    )
    for name in scrubbed:
        monkeypatch.delenv(name, raising=False)
@pytest.fixture
def fake_pcm_bytes():
    """Return 4800 zero bytes: 0.1 s of silence at 24 kHz, mono, 16-bit."""
    return bytes(4800)
@pytest.fixture
def mock_gemini_response(fake_pcm_bytes):
    """Build a mocked HTTP 200 generateContent response carrying base64 PCM."""
    inline_audio = {
        "mimeType": "audio/L16;codec=pcm;rate=24000",
        "data": base64.b64encode(fake_pcm_bytes).decode(),
    }
    body = {
        "candidates": [
            {"content": {"parts": [{"inlineData": inline_audio}]}},
        ]
    }

    response = MagicMock()
    response.status_code = 200
    response.json.return_value = body
    return response
class TestWrapPcmAsWav:
    """Checks on the WAV header produced by ``_wrap_pcm_as_wav``."""

    def test_riff_header_structure(self):
        """Each header field sits at its expected offset with the expected value."""
        from tools.tts_tool import _wrap_pcm_as_wav

        samples = b"\x01\x02\x03\x04" * 10
        wav = _wrap_pcm_as_wav(samples, sample_rate=24000, channels=1, sample_width=2)

        # Container and chunk tags.
        assert wav[:4] == b"RIFF"
        assert wav[8:12] == b"WAVE"
        assert wav[12:16] == b"fmt "

        # Bytes 20..27: audio format (PCM=1), channel count, sample rate.
        audio_format, channel_count, rate = struct.unpack_from("<HHI", wav, 20)
        assert audio_format == 1
        assert channel_count == 1
        assert rate == 24000

        # Bytes 34..35: bits per sample.
        (bits_per_sample,) = struct.unpack_from("<H", wav, 34)
        assert bits_per_sample == 16

        assert wav[36:40] == b"data"
        assert wav[44:] == samples

    def test_header_size_is_44(self):
        """With default arguments the header adds exactly 44 bytes."""
        from tools.tts_tool import _wrap_pcm_as_wav

        samples = b"\xff" * 100
        assert len(_wrap_pcm_as_wav(samples)) == 44 + len(samples)
class TestGenerateGeminiTts:
    """Tests for ``_generate_gemini_tts`` with ``requests.post`` mocked out."""

    @staticmethod
    def _voice_name(payload):
        """Pull the prebuilt voice name out of a request payload."""
        speech = payload["generationConfig"]["speechConfig"]
        return speech["voiceConfig"]["prebuiltVoiceConfig"]["voiceName"]

    def test_missing_api_key_raises_value_error(self, tmp_path):
        """With neither key set, the call fails fast with ValueError."""
        from tools.tts_tool import _generate_gemini_tts

        target = str(tmp_path / "test.wav")
        with pytest.raises(ValueError, match="GEMINI_API_KEY"):
            _generate_gemini_tts("Hello", target, {})

    def test_google_api_key_fallback(self, tmp_path, monkeypatch, mock_gemini_response):
        """GOOGLE_API_KEY is used when GEMINI_API_KEY is absent."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GOOGLE_API_KEY", "from-google-env")
        target = str(tmp_path / "test.wav")
        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", target, {})

        # Confirm it used the GOOGLE_API_KEY as the query parameter
        _, call_kwargs = mock_post.call_args
        assert call_kwargs["params"]["key"] == "from-google-env"

    def test_wav_output_fast_path(self, tmp_path, monkeypatch, mock_gemini_response, fake_pcm_bytes):
        """A .wav target is written directly: WAV header followed by the PCM."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        target = str(tmp_path / "test.wav")
        with patch("requests.post", return_value=mock_gemini_response):
            returned = _generate_gemini_tts("Hi", target, {})
        assert returned == target

        with open(target, "rb") as fh:
            written = fh.read()
        assert written[:4] == b"RIFF"
        assert written[8:12] == b"WAVE"
        # Audio payload should match the PCM we put in
        assert written[44:] == fake_pcm_bytes

    def test_default_voice_and_model(self, tmp_path, monkeypatch, mock_gemini_response):
        """An empty config falls back to the module-level default model and voice."""
        from tools.tts_tool import (
            DEFAULT_GEMINI_TTS_MODEL,
            DEFAULT_GEMINI_TTS_VOICE,
            _generate_gemini_tts,
        )

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

        call_args, call_kwargs = mock_post.call_args
        assert DEFAULT_GEMINI_TTS_MODEL in call_args[0]
        assert self._voice_name(call_kwargs["json"]) == DEFAULT_GEMINI_TTS_VOICE

    def test_custom_voice(self, tmp_path, monkeypatch, mock_gemini_response):
        """A configured voice is forwarded in the request payload."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        config = {"gemini": {"voice": "Puck"}}
        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), config)

        _, call_kwargs = mock_post.call_args
        assert self._voice_name(call_kwargs["json"]) == "Puck"

    def test_custom_model(self, tmp_path, monkeypatch, mock_gemini_response):
        """A configured model name ends up in the endpoint URL."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        config = {"gemini": {"model": "gemini-2.5-pro-preview-tts"}}
        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), config)

        call_args, _ = mock_post.call_args
        assert "gemini-2.5-pro-preview-tts" in call_args[0]

    def test_response_modality_is_audio(self, tmp_path, monkeypatch, mock_gemini_response):
        """The request asks for AUDIO as its only response modality."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

        _, call_kwargs = mock_post.call_args
        assert call_kwargs["json"]["generationConfig"]["responseModalities"] == ["AUDIO"]

    def test_http_error_raises_runtime_error(self, tmp_path, monkeypatch):
        """A non-200 status surfaces the API error message in a RuntimeError."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        failure = MagicMock()
        failure.status_code = 400
        failure.json.return_value = {"error": {"message": "Invalid voice"}}

        with patch("requests.post", return_value=failure), pytest.raises(
            RuntimeError, match="HTTP 400.*Invalid voice"
        ):
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

    def test_empty_audio_raises(self, tmp_path, monkeypatch):
        """An inlineData part with an empty data string is rejected."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        empty = MagicMock()
        empty.status_code = 200
        empty.json.return_value = {
            "candidates": [
                {"content": {"parts": [{"inlineData": {"data": ""}}]}}
            ]
        }

        with patch("requests.post", return_value=empty), pytest.raises(
            RuntimeError, match="empty audio"
        ):
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

    def test_malformed_response_raises(self, tmp_path, monkeypatch):
        """A response with no candidates is reported as malformed."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        broken = MagicMock()
        broken.status_code = 200
        broken.json.return_value = {"candidates": []}  # no content

        with patch("requests.post", return_value=broken), pytest.raises(
            RuntimeError, match="malformed"
        ):
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

    def test_snake_case_inline_data_accepted(self, tmp_path, monkeypatch, fake_pcm_bytes):
        """Some Gemini SDK versions return inline_data instead of inlineData."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        encoded = base64.b64encode(fake_pcm_bytes).decode()
        snake = MagicMock()
        snake.status_code = 200
        snake.json.return_value = {
            "candidates": [
                {"content": {"parts": [{"inline_data": {"data": encoded}}]}}
            ]
        }

        target = str(tmp_path / "test.wav")
        with patch("requests.post", return_value=snake):
            _generate_gemini_tts("Hi", target, {})

        with open(target, "rb") as fh:
            written = fh.read()
        assert written[:4] == b"RIFF"

    def test_custom_base_url_env(self, tmp_path, monkeypatch, mock_gemini_response):
        """GEMINI_BASE_URL overrides the default endpoint prefix."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        monkeypatch.setenv("GEMINI_BASE_URL", "https://custom-gemini.example.com/v1beta")
        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

        call_args, _ = mock_post.call_args
        assert call_args[0].startswith("https://custom-gemini.example.com/v1beta/")
class TestGeminiInCheckRequirements:
    """Checks that a Gemini key is enough for ``check_tts_requirements``."""

    def test_gemini_api_key_satisfies_requirements(self, monkeypatch):
        """With only GEMINI_API_KEY set and edge_tts unimportable, the check passes."""
        import builtins

        from tools.tts_tool import check_tts_requirements

        # Strip everything else
        other_provider_keys = (
            "ELEVENLABS_API_KEY",
            "OPENAI_API_KEY",
            "VOICE_TOOLS_OPENAI_KEY",
            "MINIMAX_API_KEY",
            "XAI_API_KEY",
            "MISTRAL_API_KEY",
            "GOOGLE_API_KEY",
        )
        for name in other_provider_keys:
            monkeypatch.delenv(name, raising=False)
        monkeypatch.setenv("GEMINI_API_KEY", "k")

        # Force edge_tts import to fail so we actually hit the gemini check
        original_import = builtins.__import__

        def import_without_edge_tts(name, *args, **kwargs):
            if name == "edge_tts":
                raise ImportError("simulated")
            return original_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=import_without_edge_tts):
            assert check_tts_requirements() is True

View file

@ -2,12 +2,13 @@
""" """
Text-to-Speech Tool Module Text-to-Speech Tool Module
Supports six TTS providers: Supports seven TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices - Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY - ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY - OpenAI TTS: Good quality, needs OPENAI_API_KEY
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY - MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY - Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed - NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
Output formats: Output formats:
@ -99,6 +100,13 @@ DEFAULT_XAI_LANGUAGE = "en"
DEFAULT_XAI_SAMPLE_RATE = 24000 DEFAULT_XAI_SAMPLE_RATE = 24000
DEFAULT_XAI_BIT_RATE = 128000 DEFAULT_XAI_BIT_RATE = 128000
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1" DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
# Gemini TTS defaults. _generate_gemini_tts uses these when the "gemini"
# section of the TTS config does not supply model / voice / base_url (the base
# URL can also come from the GEMINI_BASE_URL environment variable).
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_GEMINI_TTS_VOICE = "Kore"
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
# PCM output specs for Gemini TTS (fixed by the API); these are the default
# header values _wrap_pcm_as_wav writes.
GEMINI_TTS_SAMPLE_RATE = 24000  # samples per second (Hz)
GEMINI_TTS_CHANNELS = 1  # mono
GEMINI_TTS_SAMPLE_WIDTH = 2 # bytes per sample: 16-bit PCM (L16)
def _get_default_output_dir() -> str: def _get_default_output_dir() -> str:
from hermes_constants import get_hermes_dir from hermes_constants import get_hermes_dir
@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any
return output_path return output_path
# ===========================================================================
# Provider: Google Gemini TTS
# ===========================================================================
def _wrap_pcm_as_wav(
    pcm_bytes: bytes,
    sample_rate: int = GEMINI_TTS_SAMPLE_RATE,
    channels: int = GEMINI_TTS_CHANNELS,
    sample_width: int = GEMINI_TTS_SAMPLE_WIDTH,
) -> bytes:
    """Prefix raw little-endian PCM samples with a 44-byte WAV (RIFF) header.

    Gemini TTS returns audio/L16;codec=pcm;rate=24000 -- bare PCM samples with
    no container. Adding the header makes the data a playable file that
    ffmpeg can re-encode to MP3/Opus downstream.

    Args:
        pcm_bytes: Raw PCM sample data; copied verbatim after the header.
        sample_rate: Samples per second written to the header.
        channels: Channel count written to the header.
        sample_width: Bytes per sample (2 means 16-bit).

    Returns:
        The header followed by ``pcm_bytes``.
    """
    import struct

    payload_len = len(pcm_bytes)
    frame_bytes = channels * sample_width  # block align: one sample per channel

    # Layout: RIFF descriptor (12 bytes) + "fmt " chunk (24) + "data" chunk
    # header (8). The RIFF size field counts everything after itself, i.e.
    # the remaining 36 header bytes plus the payload.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + payload_len,
        b"WAVE",
        b"fmt ",
        16,  # fmt chunk size (PCM)
        1,  # audio format (PCM)
        channels,
        sample_rate,
        sample_rate * frame_bytes,  # byte rate
        frame_bytes,
        sample_width * 8,  # bits per sample
        b"data",
        payload_len,
    )
    return header + pcm_bytes
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using Google Gemini TTS.

    Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns
    raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF
    header to produce a playable file, then ffmpeg-convert to MP3 / Opus if
    the caller requested those formats (same pattern as NeuTTS).

    Args:
        text: Text to convert (prompt-style; supports inline direction like
            "Say cheerfully:" and audio tags like [whispers]).
        output_path: Where to save the audio file (.wav, .mp3, or .ogg).
        tts_config: TTS config dict. The optional "gemini" section may set
            "model", "voice" and "base_url"; missing or null values fall back
            to the module defaults (and GEMINI_BASE_URL for the base URL).

    Returns:
        Path to the saved audio file (``output_path``).

    Raises:
        ValueError: Neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.
        RuntimeError: The API returned a non-200 status, the response body was
            malformed / not JSON, the audio payload was missing, empty or not
            valid base64, or ffmpeg exited with a non-zero status.
    """
    import requests

    api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip()
    if not api_key:
        raise ValueError(
            "GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
        )

    # A YAML config may contain ``gemini:`` (or ``model:`` / ``voice:``) with no
    # value, which loads as None. Use ``or`` fallbacks so a null section does
    # not crash and a null field does not become the literal string "None".
    gemini_config = tts_config.get("gemini") or {}
    if not isinstance(gemini_config, dict):
        gemini_config = {}
    model = (
        str(gemini_config.get("model") or DEFAULT_GEMINI_TTS_MODEL).strip()
        or DEFAULT_GEMINI_TTS_MODEL
    )
    voice = (
        str(gemini_config.get("voice") or DEFAULT_GEMINI_TTS_VOICE).strip()
        or DEFAULT_GEMINI_TTS_VOICE
    )
    base_url = str(
        gemini_config.get("base_url")
        or os.getenv("GEMINI_BASE_URL")
        or DEFAULT_GEMINI_TTS_BASE_URL
    ).strip().rstrip("/")

    payload: Dict[str, Any] = {
        "contents": [{"parts": [{"text": text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {"voiceName": voice},
                },
            },
        },
    }

    endpoint = f"{base_url}/models/{model}:generateContent"
    response = requests.post(
        endpoint,
        params={"key": api_key},
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=60,
    )

    if response.status_code != 200:
        # Surface the API error message when present. Building the detail is
        # best-effort: any failure here must not mask the HTTP error itself.
        try:
            err = response.json().get("error", {})
            detail = err.get("message") or response.text[:300]
        except Exception:
            detail = response.text[:300]
        raise RuntimeError(
            f"Gemini TTS API error (HTTP {response.status_code}): {detail}"
        )

    try:
        data = response.json()
        parts = data["candidates"][0]["content"]["parts"]
        # Accept both the REST camelCase key and the snake_case variant.
        audio_part = next(
            (
                p
                for p in parts
                if isinstance(p, dict) and ("inlineData" in p or "inline_data" in p)
            ),
            None,
        )
        if audio_part is None:
            raise RuntimeError("Gemini TTS response contained no audio data")
        inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {}
        audio_b64 = inline.get("data", "")
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
        # ValueError covers a 200 response whose body is not valid JSON;
        # AttributeError covers an inline payload that is not a mapping.
        raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e

    if not audio_b64:
        raise RuntimeError("Gemini TTS returned empty audio data")

    try:
        pcm_bytes = base64.b64decode(audio_b64)
    except (ValueError, TypeError) as e:
        # binascii.Error (invalid base64) is a ValueError subclass.
        raise RuntimeError(f"Gemini TTS returned undecodable audio data: {e}") from e
    if not pcm_bytes:
        raise RuntimeError("Gemini TTS returned empty audio data")

    wav_bytes = _wrap_pcm_as_wav(pcm_bytes)

    # Fast path: caller wants WAV directly, just write.
    if output_path.lower().endswith(".wav"):
        with open(output_path, "wb") as f:
            f.write(wav_bytes)
        return output_path

    # Otherwise write WAV to a temp file and ffmpeg-convert to the target
    # format (.mp3 or .ogg). If ffmpeg is missing, fall back to copying the
    # WAV -- this matches the NeuTTS behavior and keeps the tool usable on
    # systems without ffmpeg (audio still plays, just with a misleading
    # extension).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(wav_bytes)
        wav_path = tmp.name

    try:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            # For .ogg output, force libopus encoding (Telegram voice bubbles
            # require Opus specifically; ffmpeg's default for .ogg is Vorbis).
            if output_path.lower().endswith(".ogg"):
                cmd = [
                    ffmpeg, "-i", wav_path,
                    "-acodec", "libopus", "-ac", "1",
                    "-b:a", "64k", "-vbr", "off",
                    "-y", "-loglevel", "error",
                    output_path,
                ]
            else:
                cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            result = subprocess.run(cmd, capture_output=True, timeout=30)
            if result.returncode != 0:
                stderr = result.stderr.decode("utf-8", errors="ignore")[:300]
                raise RuntimeError(f"ffmpeg conversion failed: {stderr}")
        else:
            logger.warning(
                "ffmpeg not found; writing raw WAV to %s (extension may be misleading)",
                output_path,
            )
            shutil.copyfile(wav_path, output_path)
    finally:
        # Best-effort cleanup of the temporary WAV; a leftover temp file is
        # not worth failing the request over.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    return output_path
# =========================================================================== # ===========================================================================
# NeuTTS (local, on-device TTS via neutts_cli) # NeuTTS (local, on-device TTS via neutts_cli)
# =========================================================================== # ===========================================================================
@ -634,7 +810,7 @@ def text_to_speech_tool(
out_dir.mkdir(parents=True, exist_ok=True) out_dir.mkdir(parents=True, exist_ok=True)
# Use .ogg for Telegram with providers that support native Opus output, # Use .ogg for Telegram with providers that support native Opus output,
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later). # otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
if want_opus and provider in ("openai", "elevenlabs", "mistral"): if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
file_path = out_dir / f"tts_{timestamp}.ogg" file_path = out_dir / f"tts_{timestamp}.ogg"
else: else:
file_path = out_dir / f"tts_{timestamp}.mp3" file_path = out_dir / f"tts_{timestamp}.mp3"
@ -687,6 +863,10 @@ def text_to_speech_tool(
logger.info("Generating speech with Mistral Voxtral TTS...") logger.info("Generating speech with Mistral Voxtral TTS...")
_generate_mistral_tts(text, file_str, tts_config) _generate_mistral_tts(text, file_str, tts_config)
elif provider == "gemini":
logger.info("Generating speech with Google Gemini TTS...")
_generate_gemini_tts(text, file_str, tts_config)
elif provider == "neutts": elif provider == "neutts":
if not _check_neutts_available(): if not _check_neutts_available():
return json.dumps({ return json.dumps({
@ -741,7 +921,7 @@ def text_to_speech_tool(
if opus_path: if opus_path:
file_str = opus_path file_str = opus_path
voice_compatible = True voice_compatible = True
elif provider in ("elevenlabs", "openai", "mistral"): elif provider in ("elevenlabs", "openai", "mistral", "gemini"):
voice_compatible = file_str.endswith(".ogg") voice_compatible = file_str.endswith(".ogg")
file_size = os.path.getsize(file_str) file_size = os.path.getsize(file_str)
@ -811,6 +991,8 @@ def check_tts_requirements() -> bool:
return True return True
if os.getenv("XAI_API_KEY"): if os.getenv("XAI_API_KEY"):
return True return True
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
return True
try: try:
_import_mistral_client() _import_mistral_client()
if os.getenv("MISTRAL_API_KEY"): if os.getenv("MISTRAL_API_KEY"):

View file

@ -14,7 +14,7 @@ If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription,
## Text-to-Speech ## Text-to-Speech
Convert text to speech with six providers: Convert text to speech with seven providers:
| Provider | Quality | Cost | API Key | | Provider | Quality | Cost | API Key |
|----------|---------|------|---------| |----------|---------|------|---------|
@ -23,6 +23,7 @@ Convert text to speech with six providers:
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` | | **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` | | **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` | | **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
| **Google Gemini TTS** | Excellent | Free tier | `GEMINI_API_KEY` |
| **NeuTTS** | Good | Free | None needed | | **NeuTTS** | Good | Free | None needed |
### Platform Delivery ### Platform Delivery
@ -39,7 +40,7 @@ Convert text to speech with six providers:
```yaml ```yaml
# In ~/.hermes/config.yaml # In ~/.hermes/config.yaml
tts: tts:
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts" provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "gemini" | "neutts"
speed: 1.0 # Global speed multiplier (provider-specific settings override this) speed: 1.0 # Global speed multiplier (provider-specific settings override this)
edge: edge:
voice: "en-US-AriaNeural" # 322 voices, 74 languages voice: "en-US-AriaNeural" # 322 voices, 74 languages
@ -61,6 +62,9 @@ tts:
mistral: mistral:
model: "voxtral-mini-tts-2603" model: "voxtral-mini-tts-2603"
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default) voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
gemini:
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc.
neutts: neutts:
ref_audio: '' ref_audio: ''
ref_text: '' ref_text: ''
@ -77,6 +81,7 @@ Telegram voice bubbles require Opus/OGG audio format:
- **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup - **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert: - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles - **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
- **Google Gemini TTS** outputs raw PCM and uses **ffmpeg** to encode Opus directly for Telegram voice bubbles
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
```bash ```bash