From fce6c3cdf66c3c25fecde950ee48e700cb832132 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 16 Apr 2026 14:23:16 -0700 Subject: [PATCH] feat(tts): add Google Gemini TTS provider (#11229) Adds Google Gemini TTS as the seventh voice provider, with 30 prebuilt voices (Zephyr, Puck, Kore, Enceladus, Gacrux, etc.) and natural-language prompt control. Integrates through the existing provider chain: - tools/tts_tool.py: new _generate_gemini_tts() calls the generativelanguage REST endpoint with responseModalities=[AUDIO], wraps the returned 24kHz mono 16-bit PCM (L16) in a WAV RIFF header, then ffmpeg-converts to MP3 or Opus depending on output extension. For .ogg output, libopus is forced explicitly so Telegram voice bubbles get Opus (ffmpeg defaults to Vorbis for .ogg). - hermes_cli/tools_config.py: exposes 'Google Gemini TTS' as a provider option in the curses-based 'hermes tools' UI. - hermes_cli/setup.py: adds gemini to the setup wizard picker, tool status display, and API key prompt branch (accepts an existing GEMINI_API_KEY or GOOGLE_API_KEY; prompts for a key when neither is set and falls back to Edge if none is entered). - tests/tools/test_tts_gemini.py: 15 unit tests covering WAV header wrap correctness, env var fallback (GEMINI/GOOGLE), voice/model overrides, snake_case vs camelCase inlineData handling, HTTP error surfacing, and empty-audio edge cases. - docs: TTS features page updated to list seven providers with the new gemini config block and ffmpeg notes. Live-tested with a real API key against gemini-2.5-flash-preview-tts: .wav, .mp3, and Telegram-compatible .ogg (Opus codec) all produce valid playable audio. 
--- hermes_cli/setup.py | 19 +- hermes_cli/tools_config.py | 9 + tests/tools/test_tts_gemini.py | 287 ++++++++++++++++++++++++ tools/tts_tool.py | 188 +++++++++++++++- website/docs/user-guide/features/tts.md | 9 +- 5 files changed, 506 insertions(+), 6 deletions(-) create mode 100644 tests/tools/test_tts_gemini.py diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 96ee77112..408fbc0f7 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -430,6 +430,8 @@ def _print_setup_summary(config: dict, hermes_home): tool_status.append(("Text-to-Speech (MiniMax)", True, None)) elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"): tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None)) + elif tts_provider == "gemini" and (get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")): + tool_status.append(("Text-to-Speech (Google Gemini)", True, None)) elif tts_provider == "neutts": try: import importlib.util @@ -913,6 +915,7 @@ def _setup_tts_provider(config: dict): "xai": "xAI TTS", "minimax": "MiniMax TTS", "mistral": "Mistral Voxtral TTS", + "gemini": "Google Gemini TTS", "neutts": "NeuTTS", } current_label = provider_labels.get(current_provider, current_provider) @@ -935,10 +938,11 @@ def _setup_tts_provider(config: dict): "xAI TTS (Grok voices, needs API key)", "MiniMax TTS (high quality with voice cloning, needs API key)", "Mistral Voxtral TTS (multilingual, native Opus, needs API key)", + "Google Gemini TTS (30 prebuilt voices, prompt-controllable, needs API key)", "NeuTTS (local on-device, free, ~300MB model download)", ] ) - providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"]) + providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "gemini", "neutts"]) choices.append(f"Keep current ({current_label})") keep_current_idx = len(choices) - 1 idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) @@ -1045,6 +1049,19 @@ def 
_setup_tts_provider(config: dict): print_warning("No API key provided. Falling back to Edge TTS.") selected = "edge" + elif selected == "gemini": + existing = get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY") + if not existing: + print() + print_info("Get a free API key at https://aistudio.google.com/app/apikey") + api_key = prompt("Gemini API key for TTS", password=True) + if api_key: + save_env_value("GEMINI_API_KEY", api_key) + print_success("Gemini TTS API key saved") + else: + print_warning("No API key provided. Falling back to Edge TTS.") + selected = "edge" + # Save the selection if "tts" not in config: config["tts"] = {} diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index fa15fe087..6d272ebcc 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -172,6 +172,15 @@ TOOL_CATEGORIES = { ], "tts_provider": "mistral", }, + { + "name": "Google Gemini TTS", + "badge": "preview", + "tag": "30 prebuilt voices, controllable via prompts", + "env_vars": [ + {"key": "GEMINI_API_KEY", "prompt": "Gemini API key", "url": "https://aistudio.google.com/app/apikey"}, + ], + "tts_provider": "gemini", + }, ], }, "web": { diff --git a/tests/tools/test_tts_gemini.py b/tests/tools/test_tts_gemini.py new file mode 100644 index 000000000..00a028674 --- /dev/null +++ b/tests/tools/test_tts_gemini.py @@ -0,0 +1,287 @@ +"""Tests for the Google Gemini TTS provider in tools/tts_tool.py.""" + +import base64 +import struct +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for key in ( + "GEMINI_API_KEY", + "GOOGLE_API_KEY", + "GEMINI_BASE_URL", + "HERMES_SESSION_PLATFORM", + ): + monkeypatch.delenv(key, raising=False) + + +@pytest.fixture +def fake_pcm_bytes(): + # 0.1s of silence at 24kHz mono 16-bit = 4800 bytes + return b"\x00" * 4800 + + +@pytest.fixture +def mock_gemini_response(fake_pcm_bytes): + """A successful Gemini generateContent response.""" + 
resp = MagicMock() + resp.status_code = 200 + resp.json.return_value = { + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/L16;codec=pcm;rate=24000", + "data": base64.b64encode(fake_pcm_bytes).decode(), + } + } + ] + } + } + ] + } + return resp + + +class TestWrapPcmAsWav: + def test_riff_header_structure(self): + from tools.tts_tool import _wrap_pcm_as_wav + + pcm = b"\x01\x02\x03\x04" * 10 + wav = _wrap_pcm_as_wav(pcm, sample_rate=24000, channels=1, sample_width=2) + + assert wav[:4] == b"RIFF" + assert wav[8:12] == b"WAVE" + assert wav[12:16] == b"fmt " + # Audio format (PCM=1) + assert struct.unpack(" str: from hermes_constants import get_hermes_dir @@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any return output_path +# =========================================================================== +# Provider: Google Gemini TTS +# =========================================================================== +def _wrap_pcm_as_wav( + pcm_bytes: bytes, + sample_rate: int = GEMINI_TTS_SAMPLE_RATE, + channels: int = GEMINI_TTS_CHANNELS, + sample_width: int = GEMINI_TTS_SAMPLE_WIDTH, +) -> bytes: + """Wrap raw signed-little-endian PCM with a standard WAV RIFF header. + + Gemini TTS returns audio/L16;codec=pcm;rate=24000 -- raw PCM samples with + no container. We add a minimal WAV header so the file is playable and + ffmpeg can re-encode it to MP3/Opus downstream. 
+ """ + import struct + + byte_rate = sample_rate * channels * sample_width + block_align = channels * sample_width + data_size = len(pcm_bytes) + fmt_chunk = struct.pack( + "<4sIHHIIHH", + b"fmt ", + 16, # fmt chunk size (PCM) + 1, # audio format (PCM) + channels, + sample_rate, + byte_rate, + block_align, + sample_width * 8, + ) + data_chunk_header = struct.pack("<4sI", b"data", data_size) + riff_size = 4 + len(fmt_chunk) + len(data_chunk_header) + data_size + riff_header = struct.pack("<4sI4s", b"RIFF", riff_size, b"WAVE") + return riff_header + fmt_chunk + data_chunk_header + pcm_bytes + + +def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: + """Generate audio using Google Gemini TTS. + + Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns + raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF + header to produce a playable file, then ffmpeg-convert to MP3 / Opus if + the caller requested those formats (same pattern as NeuTTS). + + Args: + text: Text to convert (prompt-style; supports inline direction like + "Say cheerfully:" and audio tags like [whispers]). + output_path: Where to save the audio file (.wav, .mp3, or .ogg). + tts_config: TTS config dict. + + Returns: + Path to the saved audio file. + """ + import requests + + api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip() + if not api_key: + raise ValueError( + "GEMINI_API_KEY not set. 
Get one at https://aistudio.google.com/app/apikey" + ) + + gemini_config = tts_config.get("gemini", {}) + model = str(gemini_config.get("model", DEFAULT_GEMINI_TTS_MODEL)).strip() or DEFAULT_GEMINI_TTS_MODEL + voice = str(gemini_config.get("voice", DEFAULT_GEMINI_TTS_VOICE)).strip() or DEFAULT_GEMINI_TTS_VOICE + base_url = str( + gemini_config.get("base_url") + or os.getenv("GEMINI_BASE_URL") + or DEFAULT_GEMINI_TTS_BASE_URL + ).strip().rstrip("/") + + payload: Dict[str, Any] = { + "contents": [{"parts": [{"text": text}]}], + "generationConfig": { + "responseModalities": ["AUDIO"], + "speechConfig": { + "voiceConfig": { + "prebuiltVoiceConfig": {"voiceName": voice}, + }, + }, + }, + } + + endpoint = f"{base_url}/models/{model}:generateContent" + response = requests.post( + endpoint, + params={"key": api_key}, + headers={"Content-Type": "application/json"}, + json=payload, + timeout=60, + ) + if response.status_code != 200: + # Surface the API error message when present + try: + err = response.json().get("error", {}) + detail = err.get("message") or response.text[:300] + except Exception: + detail = response.text[:300] + raise RuntimeError( + f"Gemini TTS API error (HTTP {response.status_code}): {detail}" + ) + + try: + data = response.json() + parts = data["candidates"][0]["content"]["parts"] + audio_part = next((p for p in parts if "inlineData" in p or "inline_data" in p), None) + if audio_part is None: + raise RuntimeError("Gemini TTS response contained no audio data") + inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {} + audio_b64 = inline.get("data", "") + except (KeyError, IndexError, TypeError) as e: + raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e + + if not audio_b64: + raise RuntimeError("Gemini TTS returned empty audio data") + + pcm_bytes = base64.b64decode(audio_b64) + wav_bytes = _wrap_pcm_as_wav(pcm_bytes) + + # Fast path: caller wants WAV directly, just write. 
+ if output_path.lower().endswith(".wav"): + with open(output_path, "wb") as f: + f.write(wav_bytes) + return output_path + + # Otherwise write WAV to a temp file and ffmpeg-convert to the target + # format (.mp3 or .ogg). If ffmpeg is missing, fall back to renaming the + # WAV -- this matches the NeuTTS behavior and keeps the tool usable on + # systems without ffmpeg (audio still plays, just with a misleading + # extension). + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + tmp.write(wav_bytes) + wav_path = tmp.name + + try: + ffmpeg = shutil.which("ffmpeg") + if ffmpeg: + # For .ogg output, force libopus encoding (Telegram voice bubbles + # require Opus specifically; ffmpeg's default for .ogg is Vorbis). + if output_path.lower().endswith(".ogg"): + cmd = [ + ffmpeg, "-i", wav_path, + "-acodec", "libopus", "-ac", "1", + "-b:a", "64k", "-vbr", "off", + "-y", "-loglevel", "error", + output_path, + ] + else: + cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path] + result = subprocess.run(cmd, capture_output=True, timeout=30) + if result.returncode != 0: + stderr = result.stderr.decode("utf-8", errors="ignore")[:300] + raise RuntimeError(f"ffmpeg conversion failed: {stderr}") + else: + logger.warning( + "ffmpeg not found; writing raw WAV to %s (extension may be misleading)", + output_path, + ) + shutil.copyfile(wav_path, output_path) + finally: + try: + os.remove(wav_path) + except OSError: + pass + + return output_path + + # =========================================================================== # NeuTTS (local, on-device TTS via neutts_cli) # =========================================================================== @@ -634,7 +810,7 @@ def text_to_speech_tool( out_dir.mkdir(parents=True, exist_ok=True) # Use .ogg for Telegram with providers that support native Opus output, # otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later). 
- if want_opus and provider in ("openai", "elevenlabs", "mistral"): + if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"): file_path = out_dir / f"tts_{timestamp}.ogg" else: file_path = out_dir / f"tts_{timestamp}.mp3" @@ -687,6 +863,10 @@ def text_to_speech_tool( logger.info("Generating speech with Mistral Voxtral TTS...") _generate_mistral_tts(text, file_str, tts_config) + elif provider == "gemini": + logger.info("Generating speech with Google Gemini TTS...") + _generate_gemini_tts(text, file_str, tts_config) + elif provider == "neutts": if not _check_neutts_available(): return json.dumps({ @@ -741,7 +921,7 @@ def text_to_speech_tool( if opus_path: file_str = opus_path voice_compatible = True - elif provider in ("elevenlabs", "openai", "mistral"): + elif provider in ("elevenlabs", "openai", "mistral", "gemini"): voice_compatible = file_str.endswith(".ogg") file_size = os.path.getsize(file_str) @@ -811,6 +991,8 @@ def check_tts_requirements() -> bool: return True if os.getenv("XAI_API_KEY"): return True + if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"): + return True try: _import_mistral_client() if os.getenv("MISTRAL_API_KEY"): diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 7d864eddd..9b0fe8b3a 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -14,7 +14,7 @@ If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription, ## Text-to-Speech -Convert text to speech with six providers: +Convert text to speech with seven providers: | Provider | Quality | Cost | API Key | |----------|---------|------|---------| @@ -23,6 +23,7 @@ Convert text to speech with six providers: | **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` | | **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` | | **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` | +| **Google Gemini TTS** | Excellent | Free tier | 
`GEMINI_API_KEY` | | **NeuTTS** | Good | Free | None needed | ### Platform Delivery @@ -39,7 +40,7 @@ Convert text to speech with six providers: ```yaml # In ~/.hermes/config.yaml tts: - provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts" + provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "gemini" | "neutts" speed: 1.0 # Global speed multiplier (provider-specific settings override this) edge: voice: "en-US-AriaNeural" # 322 voices, 74 languages @@ -61,6 +62,9 @@ tts: mistral: model: "voxtral-mini-tts-2603" voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default) + gemini: + model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts + voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc. neutts: ref_audio: '' ref_text: '' @@ -77,6 +81,7 @@ Telegram voice bubbles require Opus/OGG audio format: - **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert: - **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles +- **Google Gemini TTS** outputs raw PCM and uses **ffmpeg** to encode Opus directly for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles ```bash