mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(tts): add Google Gemini TTS provider (#11229)
Adds Google Gemini TTS as the seventh voice provider, with 30 prebuilt voices (Zephyr, Puck, Kore, Enceladus, Gacrux, etc.) and natural-language prompt control. Integrates through the existing provider chain:
- tools/tts_tool.py: new _generate_gemini_tts() calls the generativelanguage REST endpoint with responseModalities=[AUDIO], wraps the returned 24kHz mono 16-bit PCM (L16) in a WAV RIFF header, then ffmpeg-converts to MP3 or Opus depending on the output extension. For .ogg output, libopus is forced explicitly so Telegram voice bubbles get Opus (ffmpeg defaults to Vorbis for .ogg).
- hermes_cli/tools_config.py: exposes 'Google Gemini TTS' as a provider option in the curses-based 'hermes tools' UI.
- hermes_cli/setup.py: adds gemini to the setup wizard picker, tool status display, and API key prompt branch (accepts an existing GEMINI_API_KEY or GOOGLE_API_KEY, falls back to Edge if neither is set).
- tests/tools/test_tts_gemini.py: 15 unit tests covering WAV header wrap correctness, env var fallback (GEMINI/GOOGLE), voice/model overrides, snake_case vs camelCase inlineData handling, HTTP error surfacing, and empty-audio edge cases.
- docs: TTS features page updated to list seven providers with the new gemini config block and ffmpeg notes.
Live-tested with a real API key against gemini-2.5-flash-preview-tts: .wav, .mp3, and Telegram-compatible .ogg (Opus codec) all produce valid playable audio.
This commit is contained in:
parent
80855f964e
commit
fce6c3cdf6
5 changed files with 506 additions and 6 deletions
|
|
@ -2,12 +2,13 @@
|
|||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports six TTS providers:
|
||||
Supports seven TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
|
||||
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
|
||||
- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
|
||||
Output formats:
|
||||
|
|
@ -99,6 +100,13 @@ DEFAULT_XAI_LANGUAGE = "en"
|
|||
DEFAULT_XAI_SAMPLE_RATE = 24000
|
||||
DEFAULT_XAI_BIT_RATE = 128000
|
||||
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
|
||||
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
||||
DEFAULT_GEMINI_TTS_VOICE = "Kore"
|
||||
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
|
||||
# PCM output specs for Gemini TTS (fixed by the API)
|
||||
GEMINI_TTS_SAMPLE_RATE = 24000
|
||||
GEMINI_TTS_CHANNELS = 1
|
||||
GEMINI_TTS_SAMPLE_WIDTH = 2 # 16-bit PCM (L16)
|
||||
|
||||
def _get_default_output_dir() -> str:
|
||||
from hermes_constants import get_hermes_dir
|
||||
|
|
@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
|||
return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: Google Gemini TTS
|
||||
# ===========================================================================
|
||||
def _wrap_pcm_as_wav(
    pcm_bytes: bytes,
    sample_rate: int = GEMINI_TTS_SAMPLE_RATE,
    channels: int = GEMINI_TTS_CHANNELS,
    sample_width: int = GEMINI_TTS_SAMPLE_WIDTH,
) -> bytes:
    """Wrap raw signed-little-endian PCM with a standard WAV RIFF header.

    Gemini TTS returns audio/L16;codec=pcm;rate=24000 -- raw PCM samples with
    no container. We add a minimal 44-byte WAV header (RIFF descriptor, a
    16-byte PCM ``fmt `` chunk, and a ``data`` chunk header) so the file is
    playable and ffmpeg can re-encode it to MP3/Opus downstream.

    Args:
        pcm_bytes: Raw PCM sample data (no container).
        sample_rate: Samples per second per channel.
        channels: Number of interleaved channels.
        sample_width: Bytes per sample (2 == 16-bit).

    Returns:
        The complete WAV file contents: header, the PCM data, and -- only
        when the PCM length is odd -- a single trailing zero pad byte.
    """
    import struct

    byte_rate = sample_rate * channels * sample_width
    block_align = channels * sample_width
    data_size = len(pcm_bytes)

    # RIFF chunks must be word-aligned: an odd-length chunk is followed by a
    # zero pad byte that is NOT counted in the chunk's own size field but IS
    # counted in the enclosing RIFF size. Even-length input is unaffected.
    padding = b"\x00" if data_size % 2 else b""

    fmt_chunk = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,                # fmt chunk size (PCM)
        1,                 # audio format (PCM)
        channels,
        sample_rate,
        byte_rate,
        block_align,
        sample_width * 8,  # bits per sample
    )
    data_chunk_header = struct.pack("<4sI", b"data", data_size)

    # RIFF size covers everything after the 8-byte "RIFF<size>" prefix:
    # the "WAVE" tag (4 bytes) plus both chunks and any pad byte.
    riff_size = 4 + len(fmt_chunk) + len(data_chunk_header) + data_size + len(padding)
    riff_header = struct.pack("<4sI4s", b"RIFF", riff_size, b"WAVE")

    return b"".join((riff_header, fmt_chunk, data_chunk_header, pcm_bytes, padding))
|
||||
|
||||
|
||||
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using Google Gemini TTS.

    Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns
    raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF
    header to produce a playable file, then ffmpeg-convert to MP3 / Opus if
    the caller requested those formats (same pattern as NeuTTS).

    Args:
        text: Text to convert (prompt-style; supports inline direction like
            "Say cheerfully:" and audio tags like [whispers]).
        output_path: Where to save the audio file (.wav, .mp3, or .ogg).
        tts_config: TTS config dict. The optional ``gemini`` sub-dict may
            carry ``model``, ``voice`` and ``base_url`` overrides.

    Returns:
        Path to the saved audio file.

    Raises:
        ValueError: If neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.
        RuntimeError: If the API answers with a non-200 status, the response
            is malformed / carries no usable audio, or ffmpeg conversion fails.
    """
    import requests

    api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip()
    if not api_key:
        raise ValueError(
            "GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
        )

    # A config file may contain an explicit null (``gemini:`` with no body, or
    # ``model:`` with no value). ``.get(key, default)`` would hand back None in
    # that case, which either crashes on ``.get`` or stringifies to the bogus
    # model/voice name "None" -- so fall back on any falsy value instead.
    gemini_config = tts_config.get("gemini")
    if not isinstance(gemini_config, dict):
        gemini_config = {}
    model = str(gemini_config.get("model") or DEFAULT_GEMINI_TTS_MODEL).strip() or DEFAULT_GEMINI_TTS_MODEL
    voice = str(gemini_config.get("voice") or DEFAULT_GEMINI_TTS_VOICE).strip() or DEFAULT_GEMINI_TTS_VOICE
    base_url = str(
        gemini_config.get("base_url")
        or os.getenv("GEMINI_BASE_URL")
        or DEFAULT_GEMINI_TTS_BASE_URL
    ).strip().rstrip("/")

    payload: Dict[str, Any] = {
        "contents": [{"parts": [{"text": text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {"voiceName": voice},
                },
            },
        },
    }

    endpoint = f"{base_url}/models/{model}:generateContent"
    # NOTE(review): the key travels as a query parameter, so it is part of the
    # request URL; consider the x-goog-api-key header if URLs may get logged.
    response = requests.post(
        endpoint,
        params={"key": api_key},
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=60,
    )
    if response.status_code != 200:
        # Surface the API error message when present
        try:
            err = response.json().get("error", {})
            detail = err.get("message") or response.text[:300]
        except Exception:
            detail = response.text[:300]
        raise RuntimeError(
            f"Gemini TTS API error (HTTP {response.status_code}): {detail}"
        )

    try:
        # response.json() raises a ValueError subclass on a non-JSON body;
        # AttributeError covers parts/inline payloads that are not dicts.
        data = response.json()
        parts = data["candidates"][0]["content"]["parts"]
        audio_part = next((p for p in parts if "inlineData" in p or "inline_data" in p), None)
        if audio_part is None:
            raise RuntimeError("Gemini TTS response contained no audio data")
        inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {}
        audio_b64 = inline.get("data", "")
    except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
        raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e

    if not audio_b64:
        raise RuntimeError("Gemini TTS returned empty audio data")

    try:
        # binascii.Error (bad padding) is a ValueError subclass.
        pcm_bytes = base64.b64decode(audio_b64)
    except (ValueError, TypeError) as e:
        raise RuntimeError(f"Gemini TTS returned undecodable audio data: {e}") from e
    if not pcm_bytes:
        # Don't write a header-only WAV with zero samples.
        raise RuntimeError("Gemini TTS returned empty audio data")

    wav_bytes = _wrap_pcm_as_wav(pcm_bytes)

    # Fast path: caller wants WAV directly, just write.
    if output_path.lower().endswith(".wav"):
        with open(output_path, "wb") as f:
            f.write(wav_bytes)
        return output_path

    # Otherwise write WAV to a temp file and ffmpeg-convert to the target
    # format (.mp3 or .ogg). If ffmpeg is missing, fall back to copying the
    # WAV under the requested name -- this matches the NeuTTS behavior and
    # keeps the tool usable on systems without ffmpeg (audio still plays, just
    # with a misleading extension).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(wav_bytes)
        wav_path = tmp.name

    try:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            # For .ogg output, force libopus encoding (Telegram voice bubbles
            # require Opus specifically; ffmpeg's default for .ogg is Vorbis).
            if output_path.lower().endswith(".ogg"):
                cmd = [
                    ffmpeg, "-i", wav_path,
                    "-acodec", "libopus", "-ac", "1",
                    "-b:a", "64k", "-vbr", "off",
                    "-y", "-loglevel", "error",
                    output_path,
                ]
            else:
                cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            result = subprocess.run(cmd, capture_output=True, timeout=30)
            if result.returncode != 0:
                stderr = result.stderr.decode("utf-8", errors="ignore")[:300]
                raise RuntimeError(f"ffmpeg conversion failed: {stderr}")
        else:
            logger.warning(
                "ffmpeg not found; writing raw WAV to %s (extension may be misleading)",
                output_path,
            )
            shutil.copyfile(wav_path, output_path)
    finally:
        # Best-effort cleanup of the intermediate WAV; a failure to delete a
        # temp file must not mask the real result or error.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NeuTTS (local, on-device TTS via neutts_cli)
|
||||
# ===========================================================================
|
||||
|
|
@ -634,7 +810,7 @@ def text_to_speech_tool(
|
|||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
# Use .ogg for Telegram with providers that support native Opus output,
|
||||
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral"):
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
|
||||
file_path = out_dir / f"tts_{timestamp}.ogg"
|
||||
else:
|
||||
file_path = out_dir / f"tts_{timestamp}.mp3"
|
||||
|
|
@ -687,6 +863,10 @@ def text_to_speech_tool(
|
|||
logger.info("Generating speech with Mistral Voxtral TTS...")
|
||||
_generate_mistral_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "gemini":
|
||||
logger.info("Generating speech with Google Gemini TTS...")
|
||||
_generate_gemini_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "neutts":
|
||||
if not _check_neutts_available():
|
||||
return json.dumps({
|
||||
|
|
@ -741,7 +921,7 @@ def text_to_speech_tool(
|
|||
if opus_path:
|
||||
file_str = opus_path
|
||||
voice_compatible = True
|
||||
elif provider in ("elevenlabs", "openai", "mistral"):
|
||||
elif provider in ("elevenlabs", "openai", "mistral", "gemini"):
|
||||
voice_compatible = file_str.endswith(".ogg")
|
||||
|
||||
file_size = os.path.getsize(file_str)
|
||||
|
|
@ -811,6 +991,8 @@ def check_tts_requirements() -> bool:
|
|||
return True
|
||||
if os.getenv("XAI_API_KEY"):
|
||||
return True
|
||||
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
|
||||
return True
|
||||
try:
|
||||
_import_mistral_client()
|
||||
if os.getenv("MISTRAL_API_KEY"):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue