From 0671201c05d900d9ad2c177cad8eb1915dc70270 Mon Sep 17 00:00:00 2001 From: zhonghui5207 Date: Thu, 16 Apr 2026 20:47:29 +0530 Subject: [PATCH] feat(tts): add Gemini TTS provider Add Google's Gemini speech-generation API as 8th TTS backend. Returns base64-encoded signed 16-bit PCM at 24 kHz mono, wrapped in WAV natively via stdlib wave module. Optional ffmpeg conversion to mp3/ogg for Telegram voice bubbles. Supports GEMINI_API_KEY and GOOGLE_API_KEY (fallback), 30 prebuilt voices, configurable model (flash/pro). Cherry-picked from #10922 by @zhonghui5207. Fixes #10918. --- cli-config.yaml.example | 12 +- hermes_cli/config.py | 27 ++- hermes_cli/setup.py | 21 ++- tests/tools/test_tts_gemini.py | 221 ++++++++++++++++++++++++ tests/tools/test_tts_mistral.py | 11 +- tools/tts_tool.py | 115 +++++++++++- website/docs/user-guide/features/tts.md | 7 +- 7 files changed, 388 insertions(+), 26 deletions(-) create mode 100644 tests/tools/test_tts_gemini.py diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 8c0484abd0..cac6cba6a9 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -26,7 +26,6 @@ model: # "huggingface" - Hugging Face Inference (requires: HF_TOKEN) # "xiaomi" - Xiaomi MiMo (requires: XIAOMI_API_KEY) # "arcee" - Arcee AI Trinity models (requires: ARCEEAI_API_KEY) - # "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY — https://ollama.com/settings) # "kilocode" - KiloCode gateway (requires: KILOCODE_API_KEY) # "ai-gateway" - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY) # @@ -38,6 +37,12 @@ model: # base_url: "http://localhost:1234/v1" # No API key needed — local servers typically ignore auth. # + # For Ollama Cloud (https://ollama.com/pricing): + # provider: "custom" + # base_url: "https://ollama.com/v1" + # Set OLLAMA_API_KEY in .env — automatically picked up when base_url + # points to ollama.com. + # # Can also be overridden with --provider flag or HERMES_INFERENCE_PROVIDER env var. 
provider: "auto" @@ -332,7 +337,6 @@ compression: # "openrouter" - Force OpenRouter (requires OPENROUTER_API_KEY) # "nous" - Force Nous Portal (requires: hermes login) # "gemini" - Force Google AI Studio direct (requires: GOOGLE_API_KEY or GEMINI_API_KEY) -# "ollama-cloud" - Ollama Cloud (requires: OLLAMA_API_KEY) # "codex" - Force Codex OAuth (requires: hermes model → Codex). # Uses gpt-5.3-codex which supports vision. # "main" - Use your custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY). @@ -593,7 +597,7 @@ platform_toolsets: # skills_hub - skill_hub (search/install/manage from online registries — user-driven only) # moa - mixture_of_agents (requires OPENROUTER_API_KEY) # todo - todo (in-memory task planning, no deps) -# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key) +# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/XAI/MINIMAX/MISTRAL/GEMINI key) # cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks) # rl - rl_list_environments, rl_start_training, etc. 
(requires TINKER_API_KEY) # @@ -622,7 +626,7 @@ platform_toolsets: # todo - Task planning and tracking for multi-step work # memory - Persistent memory across sessions (personal notes + user profile) # session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization) -# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral) +# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, xAI, MiniMax, Mistral, Gemini) # cronjob - Schedule and manage automated tasks (CLI-only) # rl - RL training tools (Tinker-Atropos) # diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 33bc325ee3..ba67bc961f 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -61,9 +61,7 @@ from hermes_cli.colors import Colors, color from hermes_cli.default_soul import DEFAULT_SOUL_MD -# ============================================================================= # Managed mode (NixOS declarative config) -# ============================================================================= _MANAGED_TRUE_VALUES = ("true", "1", "yes") _MANAGED_SYSTEM_NAMES = { @@ -147,9 +145,7 @@ def managed_error(action: str = "modify configuration"): print(format_managed_message(action), file=sys.stderr) -# ============================================================================= # Container-aware CLI (NixOS container mode) -# ============================================================================= def get_container_exec_info() -> Optional[dict]: """Read container mode metadata from HERMES_HOME/.container-mode. @@ -196,9 +192,7 @@ def get_container_exec_info() -> Optional[dict]: } -# ============================================================================= # Config paths -# ============================================================================= # Re-export from hermes_constants — canonical definition lives there. 
from hermes_constants import get_hermes_home # noqa: F811,E402 @@ -335,9 +329,7 @@ def _ensure_hermes_home_managed(home: Path): _ensure_default_soul_md(home) -# ============================================================================= # Config loading/saving -# ============================================================================= DEFAULT_CONFIG = { "model": "", @@ -570,7 +562,7 @@ DEFAULT_CONFIG = { # Text-to-speech configuration "tts": { - "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local) + "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local) | "gemini" "edge": { "voice": "en-US-AriaNeural", # Popular: AriaNeural, JennyNeural, AndrewNeural, BrianNeural, SoniaNeural @@ -600,6 +592,15 @@ DEFAULT_CONFIG = { "model": "neuphonic/neutts-air-q4-gguf", # HuggingFace model repo "device": "cpu", # cpu, cuda, or mps }, + "gemini": { + "model": "gemini-2.5-flash-preview-tts", + "voice": "Kore", + # 30 prebuilt voices: Zephyr, Puck, Charon, Kore, Fenrir, Leda, + # Orus, Aoede, Callirrhoe, Autonoe, Enceladus, Iapetus, Umbriel, + # Algieba, Despina, Erinome, Algenib, Rasalgethi, Laomedeia, + # Achernar, Alnilam, Schedar, Gacrux, Pulcherrima, Achird, + # Zubenelgenubi, Vindemiatrix, Sadachbia, Sadaltager, Sulafat + }, }, "stt": { @@ -781,9 +782,7 @@ DEFAULT_CONFIG = { "_config_version": 18, } -# ============================================================================= # Config Migration System -# ============================================================================= # Track which env vars were introduced in each config version. # Migration only mentions vars new since the user's previous version. 
@@ -1901,9 +1900,7 @@ def check_config_version() -> Tuple[int, int]: return current, latest -# ============================================================================= # Config structure validation -# ============================================================================= # Fields that are valid at root level of config.yaml _KNOWN_ROOT_KEYS = { @@ -3167,9 +3164,7 @@ def get_env_value(key: str) -> Optional[str]: return env_vars.get(key) -# ============================================================================= # Config display -# ============================================================================= def redact_key(key: str) -> str: """Redact an API key for display.""" @@ -3461,9 +3456,7 @@ def set_config_value(key: str, value: str): print(f"✓ Set {key} = {value} in {config_path}") -# ============================================================================= # Command handler -# ============================================================================= def config_command(args): """Handle config subcommands.""" diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index eafe3b6334..def22bb74a 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -433,6 +433,10 @@ def _print_setup_summary(config: dict, hermes_home): tool_status.append(("Text-to-Speech (MiniMax)", True, None)) elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"): tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None)) + elif tts_provider == "gemini" and ( + get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY") + ): + tool_status.append(("Text-to-Speech (Gemini)", True, None)) elif tts_provider == "neutts": try: import importlib.util @@ -924,6 +928,7 @@ def _setup_tts_provider(config: dict): "minimax": "MiniMax TTS", "mistral": "Mistral Voxtral TTS", "neutts": "NeuTTS", + "gemini": "Gemini TTS", } current_label = provider_labels.get(current_provider, current_provider) @@ -946,9 +951,10 @@ def _setup_tts_provider(config: 
dict): "MiniMax TTS (high quality with voice cloning, needs API key)", "Mistral Voxtral TTS (multilingual, native Opus, needs API key)", "NeuTTS (local on-device, free, ~300MB model download)", + "Gemini TTS (Google speech generation, 30 voices, needs GEMINI_API_KEY)", ] ) - providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"]) + providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts", "gemini"]) choices.append(f"Keep current ({current_label})") keep_current_idx = len(choices) - 1 idx = prompt_choice("Select TTS provider:", choices, keep_current_idx) @@ -1055,6 +1061,19 @@ def _setup_tts_provider(config: dict): print_warning("No API key provided. Falling back to Edge TTS.") selected = "edge" + elif selected == "gemini": + existing = get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY") + if not existing: + print() + print_info("Get a key at https://aistudio.google.com/apikey") + api_key = prompt("Gemini API key for TTS", password=True) + if api_key: + save_env_value("GEMINI_API_KEY", api_key) + print_success("Gemini API key saved") + else: + print_warning("No API key provided. 
Falling back to Edge TTS.") + selected = "edge" + # Save the selection if "tts" not in config: config["tts"] = {} diff --git a/tests/tools/test_tts_gemini.py b/tests/tools/test_tts_gemini.py new file mode 100644 index 0000000000..9a3136b1de --- /dev/null +++ b/tests/tools/test_tts_gemini.py @@ -0,0 +1,221 @@ +"""Tests for the Gemini TTS provider in tools/tts_tool.py.""" + +import base64 +import struct +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + for key in ( + "GEMINI_API_KEY", + "GOOGLE_API_KEY", + "HERMES_SESSION_PLATFORM", + "MINIMAX_API_KEY", + "ELEVENLABS_API_KEY", + "OPENAI_API_KEY", + "VOICE_TOOLS_OPENAI_KEY", + "MISTRAL_API_KEY", + ): + monkeypatch.delenv(key, raising=False) + + +def _gemini_response(pcm_bytes: bytes) -> dict: + return { + "candidates": [ + { + "content": { + "parts": [ + {"inlineData": {"data": base64.b64encode(pcm_bytes).decode()}} + ] + } + } + ] + } + + +def _mock_urlopen(response_payload: dict): + resp_body = __import__("json").dumps(response_payload).encode("utf-8") + mock_resp = MagicMock() + mock_resp.read.return_value = resp_body + mock_resp.__enter__ = MagicMock(return_value=mock_resp) + mock_resp.__exit__ = MagicMock(return_value=False) + return mock_resp + + +class TestGenerateGeminiTts: + def test_missing_api_key_raises_value_error(self, tmp_path): + from tools.tts_tool import _generate_gemini_tts + + with pytest.raises(ValueError, match="GEMINI_API_KEY"): + _generate_gemini_tts("Hello", str(tmp_path / "out.wav"), {}) + + def test_google_api_key_fallback_accepted(self, tmp_path, monkeypatch): + from tools.tts_tool import _generate_gemini_tts + + monkeypatch.setenv("GOOGLE_API_KEY", "test-key") + pcm = b"\x01\x00\x02\x00\x03\x00" + with patch( + "tools.tts_tool.urllib.request.urlopen", + return_value=_mock_urlopen(_gemini_response(pcm)), + ): + result = _generate_gemini_tts("Hi", str(tmp_path / "out.wav"), {}) + + assert result == str(tmp_path / 
"out.wav") + + def test_writes_wav_with_correct_pcm_params(self, tmp_path, monkeypatch): + import wave + + from tools.tts_tool import _generate_gemini_tts + + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + pcm = struct.pack("<6h", 0, 1, 2, 3, 4, 5) + with patch( + "tools.tts_tool.urllib.request.urlopen", + return_value=_mock_urlopen(_gemini_response(pcm)), + ): + out = tmp_path / "out.wav" + _generate_gemini_tts("Hi", str(out), {}) + + with wave.open(str(out), "rb") as wf: + assert wf.getnchannels() == 1 + assert wf.getsampwidth() == 2 + assert wf.getframerate() == 24000 + assert wf.readframes(wf.getnframes()) == pcm + + def test_default_model_and_voice_in_payload(self, tmp_path, monkeypatch): + import json as _json + + from tools.tts_tool import ( + DEFAULT_GEMINI_TTS_MODEL, + DEFAULT_GEMINI_TTS_VOICE, + _generate_gemini_tts, + ) + + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + captured = {} + + def fake_urlopen(req, timeout=None): + captured["url"] = req.full_url + captured["body"] = _json.loads(req.data.decode()) + captured["headers"] = dict(req.headers) + return _mock_urlopen(_gemini_response(b"\x00\x00")) + + with patch("tools.tts_tool.urllib.request.urlopen", side_effect=fake_urlopen): + _generate_gemini_tts("hello", str(tmp_path / "out.wav"), {}) + + assert DEFAULT_GEMINI_TTS_MODEL in captured["url"] + voice_cfg = captured["body"]["generationConfig"]["speechConfig"]["voiceConfig"][ + "prebuiltVoiceConfig" + ] + assert voice_cfg["voiceName"] == DEFAULT_GEMINI_TTS_VOICE + # Header keys normalize to capitalized form via urllib + assert captured["headers"].get("X-goog-api-key") == "test-key" + + def test_config_overrides(self, tmp_path, monkeypatch): + import json as _json + + from tools.tts_tool import _generate_gemini_tts + + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + captured = {} + + def fake_urlopen(req, timeout=None): + captured["url"] = req.full_url + captured["body"] = _json.loads(req.data.decode()) + return 
_mock_urlopen(_gemini_response(b"\x00\x00")) + + with patch("tools.tts_tool.urllib.request.urlopen", side_effect=fake_urlopen): + config = {"gemini": {"model": "gemini-2.5-pro-preview-tts", "voice": "Puck"}} + _generate_gemini_tts("hi", str(tmp_path / "out.wav"), config) + + assert "gemini-2.5-pro-preview-tts" in captured["url"] + voice_cfg = captured["body"]["generationConfig"]["speechConfig"]["voiceConfig"][ + "prebuiltVoiceConfig" + ] + assert voice_cfg["voiceName"] == "Puck" + + def test_http_error_surfaced_as_runtime_error(self, tmp_path, monkeypatch): + import urllib.error + + from tools.tts_tool import _generate_gemini_tts + + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + err = urllib.error.HTTPError( + "https://example", 429, "Too Many Requests", {}, None + ) + err.read = MagicMock(return_value=b'{"error": "rate limit"}') + + with patch("tools.tts_tool.urllib.request.urlopen", side_effect=err): + with pytest.raises(RuntimeError, match="429"): + _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {}) + + def test_missing_audio_payload_raises_runtime_error(self, tmp_path, monkeypatch): + from tools.tts_tool import _generate_gemini_tts + + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + bad_response = {"candidates": [{"content": {"parts": []}}]} + with patch( + "tools.tts_tool.urllib.request.urlopen", + return_value=_mock_urlopen(bad_response), + ): + with pytest.raises(RuntimeError, match="missing audio payload"): + _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {}) + + +class TestTtsDispatcherGemini: + def test_dispatcher_routes_to_gemini(self, tmp_path, monkeypatch): + import json + + from tools.tts_tool import text_to_speech_tool + + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + pcm = struct.pack("<2h", 100, -100) + with patch( + "tools.tts_tool.urllib.request.urlopen", + return_value=_mock_urlopen(_gemini_response(pcm)), + ), patch( + "tools.tts_tool._load_tts_config", return_value={"provider": "gemini"} + ): + # Force .wav output so 
we skip the ffmpeg / Opus conversion branch + output_path = str(tmp_path / "out.wav") + result = json.loads(text_to_speech_tool("Hello", output_path=output_path)) + + assert result["success"] is True + assert result["provider"] == "gemini" + + +class TestCheckTtsRequirementsGemini: + def test_gemini_key_returns_true(self, monkeypatch): + from tools.tts_tool import check_tts_requirements + + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), patch( + "tools.tts_tool._import_elevenlabs", side_effect=ImportError + ), patch("tools.tts_tool._import_openai_client", side_effect=ImportError), patch( + "tools.tts_tool._import_mistral_client", side_effect=ImportError + ), patch("tools.tts_tool._check_neutts_available", return_value=False): + assert check_tts_requirements() is True + + def test_google_api_key_also_accepted(self, monkeypatch): + from tools.tts_tool import check_tts_requirements + + monkeypatch.setenv("GOOGLE_API_KEY", "test-key") + with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), patch( + "tools.tts_tool._import_elevenlabs", side_effect=ImportError + ), patch("tools.tts_tool._import_openai_client", side_effect=ImportError), patch( + "tools.tts_tool._import_mistral_client", side_effect=ImportError + ), patch("tools.tts_tool._check_neutts_available", return_value=False): + assert check_tts_requirements() is True + + def test_no_key_returns_false(self): + from tools.tts_tool import check_tts_requirements + + with patch("tools.tts_tool._import_edge_tts", side_effect=ImportError), patch( + "tools.tts_tool._import_elevenlabs", side_effect=ImportError + ), patch("tools.tts_tool._import_openai_client", side_effect=ImportError), patch( + "tools.tts_tool._import_mistral_client", side_effect=ImportError + ), patch("tools.tts_tool._check_neutts_available", return_value=False): + assert check_tts_requirements() is False diff --git a/tests/tools/test_tts_mistral.py 
b/tests/tools/test_tts_mistral.py index a62afd8dbe..bef8d1cd8e 100644 --- a/tests/tools/test_tts_mistral.py +++ b/tests/tools/test_tts_mistral.py @@ -8,7 +8,16 @@ import pytest @pytest.fixture(autouse=True) def clean_env(monkeypatch): - for key in ("MISTRAL_API_KEY", "HERMES_SESSION_PLATFORM"): + for key in ( + "MISTRAL_API_KEY", + "HERMES_SESSION_PLATFORM", + "MINIMAX_API_KEY", + "ELEVENLABS_API_KEY", + "OPENAI_API_KEY", + "VOICE_TOOLS_OPENAI_KEY", + "GEMINI_API_KEY", + "GOOGLE_API_KEY", + ): monkeypatch.delenv(key, raising=False) diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 65ff725ee6..0ce1769fad 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -2,13 +2,14 @@ """ Text-to-Speech Tool Module -Supports six TTS providers: +Supports seven TTS providers: - Edge TTS (default, free, no API key): Microsoft Edge neural voices - ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY - OpenAI TTS: Good quality, needs OPENAI_API_KEY - MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY - Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY - NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed +- Gemini TTS: Google speech generation, 30 prebuilt voices, needs GEMINI_API_KEY or GOOGLE_API_KEY Output formats: - Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS) @@ -35,7 +36,10 @@ import shutil import subprocess import tempfile import threading +import urllib.error +import urllib.request import uuid +import wave from pathlib import Path from typing import Callable, Dict, Any, Optional from urllib.parse import urljoin @@ -99,6 +103,12 @@ DEFAULT_XAI_LANGUAGE = "en" DEFAULT_XAI_SAMPLE_RATE = 24000 DEFAULT_XAI_BIT_RATE = 128000 DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1" +DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" +DEFAULT_GEMINI_TTS_VOICE = "Kore" +DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta" 
+GEMINI_TTS_SAMPLE_RATE = 24000 +GEMINI_TTS_CHANNELS = 1 +GEMINI_TTS_SAMPLE_WIDTH = 2 # signed 16-bit PCM def _get_default_output_dir() -> str: from hermes_constants import get_hermes_dir @@ -582,6 +592,101 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> return output_path +# =========================================================================== +# Provider: Gemini TTS +# =========================================================================== +def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: + """Generate audio using Google's Gemini speech-generation API. + + Gemini returns base64-encoded PCM (signed 16-bit, 24 kHz, mono). This + function wraps the PCM in a WAV container natively (no ffmpeg needed + for the base case), then converts to the caller's requested extension + via ffmpeg if available. Mirrors the NeuTTS output handling. + + Reference: https://ai.google.dev/gemini-api/docs/speech-generation + """ + api_key = ( + os.getenv("GEMINI_API_KEY") + or os.getenv("GOOGLE_API_KEY") + or "" + ).strip() + if not api_key: + raise ValueError( + "GEMINI_API_KEY (or GOOGLE_API_KEY) not set. 
" + "Get one at https://aistudio.google.com/apikey" + ) + + gm_config = tts_config.get("gemini", {}) + model = gm_config.get("model", DEFAULT_GEMINI_TTS_MODEL) + voice = gm_config.get("voice", DEFAULT_GEMINI_TTS_VOICE) + base_url = gm_config.get("base_url", DEFAULT_GEMINI_TTS_BASE_URL).rstrip("/") + + endpoint = f"{base_url}/models/{model}:generateContent" + payload = { + "contents": [{"parts": [{"text": text}]}], + "generationConfig": { + "responseModalities": ["AUDIO"], + "speechConfig": { + "voiceConfig": { + "prebuiltVoiceConfig": {"voiceName": voice}, + }, + }, + }, + } + + body = json.dumps(payload).encode("utf-8") + req = urllib.request.Request( + endpoint, + data=body, + headers={ + "Content-Type": "application/json", + "x-goog-api-key": api_key, + }, + method="POST", + ) + + try: + with urllib.request.urlopen(req, timeout=60) as resp: + response_data = json.loads(resp.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + err_body = exc.read().decode("utf-8", errors="ignore")[:500] + raise RuntimeError(f"Gemini TTS HTTP {exc.code}: {err_body}") from exc + + try: + audio_part = response_data["candidates"][0]["content"]["parts"][0] + audio_b64 = audio_part["inlineData"]["data"] + except (KeyError, IndexError, TypeError) as exc: + raise RuntimeError( + f"Gemini TTS response missing audio payload: {str(response_data)[:300]}" + ) from exc + + pcm_bytes = base64.b64decode(audio_b64) + + # Write PCM as WAV natively — ffmpeg is only needed if the caller + # asked for a non-WAV extension (mp3/ogg). 
+ wav_path = output_path + if not output_path.endswith(".wav"): + wav_path = output_path.rsplit(".", 1)[0] + ".wav" + + with wave.open(wav_path, "wb") as wf: + wf.setnchannels(GEMINI_TTS_CHANNELS) + wf.setsampwidth(GEMINI_TTS_SAMPLE_WIDTH) + wf.setframerate(GEMINI_TTS_SAMPLE_RATE) + wf.writeframes(pcm_bytes) + + if wav_path != output_path: + ffmpeg = shutil.which("ffmpeg") + if ffmpeg: + conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path] + subprocess.run(conv_cmd, check=True, timeout=30) + os.remove(wav_path) + else: + # No ffmpeg — keep WAV content but honor the caller's path. + os.rename(wav_path, output_path) + + return output_path + + # =========================================================================== # Main tool function # =========================================================================== @@ -697,6 +802,10 @@ def text_to_speech_tool( logger.info("Generating speech with NeuTTS (local)...") _generate_neutts(text, file_str, tts_config) + elif provider == "gemini": + logger.info("Generating speech with Gemini TTS...") + _generate_gemini_tts(text, file_str, tts_config) + else: # Default: Edge TTS (free), with NeuTTS as local fallback edge_available = True @@ -736,7 +845,7 @@ def text_to_speech_tool( # Try Opus conversion for Telegram compatibility # Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion voice_compatible = False - if provider in ("edge", "neutts", "minimax", "xai") and not file_str.endswith(".ogg"): + if provider in ("edge", "neutts", "minimax", "xai", "gemini") and not file_str.endswith(".ogg"): opus_path = _convert_to_opus(file_str) if opus_path: file_str = opus_path @@ -817,6 +926,8 @@ def check_tts_requirements() -> bool: return True except ImportError: pass + if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"): + return True if _check_neutts_available(): return True return False diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md 
index 625e25ad9e..ccaac6b111 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -10,7 +10,7 @@ Hermes Agent supports both text-to-speech output and voice message transcription ## Text-to-Speech -Convert text to speech with six providers: +Convert text to speech with seven providers: | Provider | Quality | Cost | API Key | |----------|---------|------|---------| @@ -20,6 +20,7 @@ Convert text to speech with six providers: | **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` | | **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` | | **NeuTTS** | Good | Free | None needed | +| **Gemini TTS** | Excellent | Paid (free tier) | `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) | ### Platform Delivery @@ -62,6 +63,9 @@ tts: ref_text: '' model: neuphonic/neutts-air-q4-gguf device: cpu + gemini: + model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts + voice: "Kore" # 30 prebuilt voices (Zephyr, Puck, Charon, ...) ``` **Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed). @@ -74,6 +78,7 @@ Telegram voice bubbles require Opus/OGG audio format: - **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert: - **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles - **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles +- **Gemini TTS** returns raw PCM (wrapped in WAV natively) and needs **ffmpeg** to convert for Telegram voice bubbles ```bash # Ubuntu/Debian