feat(tts): add Google Gemini TTS provider (#11229)

Adds Google Gemini TTS as the seventh voice provider, with 30 prebuilt
voices (Zephyr, Puck, Kore, Enceladus, Gacrux, etc.) and natural-language
prompt control. Integrates through the existing provider chain:

- tools/tts_tool.py: new _generate_gemini_tts() calls the
  generativelanguage REST endpoint with responseModalities=[AUDIO],
  wraps the returned 24kHz mono 16-bit PCM (L16) in a WAV RIFF header,
  then ffmpeg-converts to MP3 or Opus depending on output extension.
  For .ogg output, libopus is forced explicitly so Telegram voice
  bubbles get Opus (ffmpeg defaults to Vorbis for .ogg).
- hermes_cli/tools_config.py: exposes 'Google Gemini TTS' as a provider
  option in the curses-based 'hermes tools' UI.
- hermes_cli/setup.py: adds gemini to the setup wizard picker, tool
  status display, and API key prompt branch (accepts existing
  GEMINI_API_KEY or GOOGLE_API_KEY, falls back to Edge if neither set).
- tests/tools/test_tts_gemini.py: 15 unit tests covering WAV header
  wrap correctness, env var fallback (GEMINI/GOOGLE), voice/model
  overrides, snake_case vs camelCase inlineData handling, HTTP error
  surfacing, and empty-audio edge cases.
- docs: TTS features page updated to list seven providers with the new
  gemini config block and ffmpeg notes.

Live-tested against api key against gemini-2.5-flash-preview-tts: .wav,
.mp3, and Telegram-compatible .ogg (Opus codec) all produce valid
playable audio.
This commit is contained in:
Teknium 2026-04-16 14:23:16 -07:00 committed by GitHub
parent 80855f964e
commit fce6c3cdf6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 506 additions and 6 deletions

View file

@ -2,12 +2,13 @@
"""
Text-to-Speech Tool Module
Supports six TTS providers:
Supports seven TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
Output formats:
@ -99,6 +100,13 @@ DEFAULT_XAI_LANGUAGE = "en"
DEFAULT_XAI_SAMPLE_RATE = 24000
DEFAULT_XAI_BIT_RATE = 128000
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_GEMINI_TTS_VOICE = "Kore"
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
# PCM output specs for Gemini TTS (fixed by the API)
GEMINI_TTS_SAMPLE_RATE = 24000
GEMINI_TTS_CHANNELS = 1
GEMINI_TTS_SAMPLE_WIDTH = 2 # 16-bit PCM (L16)
def _get_default_output_dir() -> str:
from hermes_constants import get_hermes_dir
@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any
return output_path
# ===========================================================================
# Provider: Google Gemini TTS
# ===========================================================================
def _wrap_pcm_as_wav(
pcm_bytes: bytes,
sample_rate: int = GEMINI_TTS_SAMPLE_RATE,
channels: int = GEMINI_TTS_CHANNELS,
sample_width: int = GEMINI_TTS_SAMPLE_WIDTH,
) -> bytes:
"""Wrap raw signed-little-endian PCM with a standard WAV RIFF header.
Gemini TTS returns audio/L16;codec=pcm;rate=24000 -- raw PCM samples with
no container. We add a minimal WAV header so the file is playable and
ffmpeg can re-encode it to MP3/Opus downstream.
"""
import struct
byte_rate = sample_rate * channels * sample_width
block_align = channels * sample_width
data_size = len(pcm_bytes)
fmt_chunk = struct.pack(
"<4sIHHIIHH",
b"fmt ",
16, # fmt chunk size (PCM)
1, # audio format (PCM)
channels,
sample_rate,
byte_rate,
block_align,
sample_width * 8,
)
data_chunk_header = struct.pack("<4sI", b"data", data_size)
riff_size = 4 + len(fmt_chunk) + len(data_chunk_header) + data_size
riff_header = struct.pack("<4sI4s", b"RIFF", riff_size, b"WAVE")
return riff_header + fmt_chunk + data_chunk_header + pcm_bytes
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate audio using Google Gemini TTS.
Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns
raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF
header to produce a playable file, then ffmpeg-convert to MP3 / Opus if
the caller requested those formats (same pattern as NeuTTS).
Args:
text: Text to convert (prompt-style; supports inline direction like
"Say cheerfully:" and audio tags like [whispers]).
output_path: Where to save the audio file (.wav, .mp3, or .ogg).
tts_config: TTS config dict.
Returns:
Path to the saved audio file.
"""
import requests
api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip()
if not api_key:
raise ValueError(
"GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
)
gemini_config = tts_config.get("gemini", {})
model = str(gemini_config.get("model", DEFAULT_GEMINI_TTS_MODEL)).strip() or DEFAULT_GEMINI_TTS_MODEL
voice = str(gemini_config.get("voice", DEFAULT_GEMINI_TTS_VOICE)).strip() or DEFAULT_GEMINI_TTS_VOICE
base_url = str(
gemini_config.get("base_url")
or os.getenv("GEMINI_BASE_URL")
or DEFAULT_GEMINI_TTS_BASE_URL
).strip().rstrip("/")
payload: Dict[str, Any] = {
"contents": [{"parts": [{"text": text}]}],
"generationConfig": {
"responseModalities": ["AUDIO"],
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {"voiceName": voice},
},
},
},
}
endpoint = f"{base_url}/models/{model}:generateContent"
response = requests.post(
endpoint,
params={"key": api_key},
headers={"Content-Type": "application/json"},
json=payload,
timeout=60,
)
if response.status_code != 200:
# Surface the API error message when present
try:
err = response.json().get("error", {})
detail = err.get("message") or response.text[:300]
except Exception:
detail = response.text[:300]
raise RuntimeError(
f"Gemini TTS API error (HTTP {response.status_code}): {detail}"
)
try:
data = response.json()
parts = data["candidates"][0]["content"]["parts"]
audio_part = next((p for p in parts if "inlineData" in p or "inline_data" in p), None)
if audio_part is None:
raise RuntimeError("Gemini TTS response contained no audio data")
inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {}
audio_b64 = inline.get("data", "")
except (KeyError, IndexError, TypeError) as e:
raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e
if not audio_b64:
raise RuntimeError("Gemini TTS returned empty audio data")
pcm_bytes = base64.b64decode(audio_b64)
wav_bytes = _wrap_pcm_as_wav(pcm_bytes)
# Fast path: caller wants WAV directly, just write.
if output_path.lower().endswith(".wav"):
with open(output_path, "wb") as f:
f.write(wav_bytes)
return output_path
# Otherwise write WAV to a temp file and ffmpeg-convert to the target
# format (.mp3 or .ogg). If ffmpeg is missing, fall back to renaming the
# WAV -- this matches the NeuTTS behavior and keeps the tool usable on
# systems without ffmpeg (audio still plays, just with a misleading
# extension).
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
tmp.write(wav_bytes)
wav_path = tmp.name
try:
ffmpeg = shutil.which("ffmpeg")
if ffmpeg:
# For .ogg output, force libopus encoding (Telegram voice bubbles
# require Opus specifically; ffmpeg's default for .ogg is Vorbis).
if output_path.lower().endswith(".ogg"):
cmd = [
ffmpeg, "-i", wav_path,
"-acodec", "libopus", "-ac", "1",
"-b:a", "64k", "-vbr", "off",
"-y", "-loglevel", "error",
output_path,
]
else:
cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
result = subprocess.run(cmd, capture_output=True, timeout=30)
if result.returncode != 0:
stderr = result.stderr.decode("utf-8", errors="ignore")[:300]
raise RuntimeError(f"ffmpeg conversion failed: {stderr}")
else:
logger.warning(
"ffmpeg not found; writing raw WAV to %s (extension may be misleading)",
output_path,
)
shutil.copyfile(wav_path, output_path)
finally:
try:
os.remove(wav_path)
except OSError:
pass
return output_path
# ===========================================================================
# NeuTTS (local, on-device TTS via neutts_cli)
# ===========================================================================
@ -634,7 +810,7 @@ def text_to_speech_tool(
out_dir.mkdir(parents=True, exist_ok=True)
# Use .ogg for Telegram with providers that support native Opus output,
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
if want_opus and provider in ("openai", "elevenlabs", "mistral"):
if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
file_path = out_dir / f"tts_{timestamp}.ogg"
else:
file_path = out_dir / f"tts_{timestamp}.mp3"
@ -687,6 +863,10 @@ def text_to_speech_tool(
logger.info("Generating speech with Mistral Voxtral TTS...")
_generate_mistral_tts(text, file_str, tts_config)
elif provider == "gemini":
logger.info("Generating speech with Google Gemini TTS...")
_generate_gemini_tts(text, file_str, tts_config)
elif provider == "neutts":
if not _check_neutts_available():
return json.dumps({
@ -741,7 +921,7 @@ def text_to_speech_tool(
if opus_path:
file_str = opus_path
voice_compatible = True
elif provider in ("elevenlabs", "openai", "mistral"):
elif provider in ("elevenlabs", "openai", "mistral", "gemini"):
voice_compatible = file_str.endswith(".ogg")
file_size = os.path.getsize(file_str)
@ -811,6 +991,8 @@ def check_tts_requirements() -> bool:
return True
if os.getenv("XAI_API_KEY"):
return True
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
return True
try:
_import_mistral_client()
if os.getenv("MISTRAL_API_KEY"):