mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-29 01:31:41 +00:00
feat(tts): add Google Gemini TTS provider (#11229)
Adds Google Gemini TTS as the seventh voice provider, with 30 prebuilt voices (Zephyr, Puck, Kore, Enceladus, Gacrux, etc.) and natural-language prompt control. Integrates through the existing provider chain: - tools/tts_tool.py: new _generate_gemini_tts() calls the generativelanguage REST endpoint with responseModalities=[AUDIO], wraps the returned 24kHz mono 16-bit PCM (L16) in a WAV RIFF header, then ffmpeg-converts to MP3 or Opus depending on output extension. For .ogg output, libopus is forced explicitly so Telegram voice bubbles get Opus (ffmpeg defaults to Vorbis for .ogg). - hermes_cli/tools_config.py: exposes 'Google Gemini TTS' as a provider option in the curses-based 'hermes tools' UI. - hermes_cli/setup.py: adds gemini to the setup wizard picker, tool status display, and API key prompt branch (accepts existing GEMINI_API_KEY or GOOGLE_API_KEY, falls back to Edge if neither set). - tests/tools/test_tts_gemini.py: 15 unit tests covering WAV header wrap correctness, env var fallback (GEMINI/GOOGLE), voice/model overrides, snake_case vs camelCase inlineData handling, HTTP error surfacing, and empty-audio edge cases. - docs: TTS features page updated to list seven providers with the new gemini config block and ffmpeg notes. Live-tested against api key against gemini-2.5-flash-preview-tts: .wav, .mp3, and Telegram-compatible .ogg (Opus codec) all produce valid playable audio.
This commit is contained in:
parent
80855f964e
commit
fce6c3cdf6
5 changed files with 506 additions and 6 deletions
|
|
@ -430,6 +430,8 @@ def _print_setup_summary(config: dict, hermes_home):
|
|||
tool_status.append(("Text-to-Speech (MiniMax)", True, None))
|
||||
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
|
||||
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
|
||||
elif tts_provider == "gemini" and (get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")):
|
||||
tool_status.append(("Text-to-Speech (Google Gemini)", True, None))
|
||||
elif tts_provider == "neutts":
|
||||
try:
|
||||
import importlib.util
|
||||
|
|
@ -913,6 +915,7 @@ def _setup_tts_provider(config: dict):
|
|||
"xai": "xAI TTS",
|
||||
"minimax": "MiniMax TTS",
|
||||
"mistral": "Mistral Voxtral TTS",
|
||||
"gemini": "Google Gemini TTS",
|
||||
"neutts": "NeuTTS",
|
||||
}
|
||||
current_label = provider_labels.get(current_provider, current_provider)
|
||||
|
|
@ -935,10 +938,11 @@ def _setup_tts_provider(config: dict):
|
|||
"xAI TTS (Grok voices, needs API key)",
|
||||
"MiniMax TTS (high quality with voice cloning, needs API key)",
|
||||
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
|
||||
"Google Gemini TTS (30 prebuilt voices, prompt-controllable, needs API key)",
|
||||
"NeuTTS (local on-device, free, ~300MB model download)",
|
||||
]
|
||||
)
|
||||
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"])
|
||||
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "gemini", "neutts"])
|
||||
choices.append(f"Keep current ({current_label})")
|
||||
keep_current_idx = len(choices) - 1
|
||||
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
|
||||
|
|
@ -1045,6 +1049,19 @@ def _setup_tts_provider(config: dict):
|
|||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
elif selected == "gemini":
|
||||
existing = get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")
|
||||
if not existing:
|
||||
print()
|
||||
print_info("Get a free API key at https://aistudio.google.com/app/apikey")
|
||||
api_key = prompt("Gemini API key for TTS", password=True)
|
||||
if api_key:
|
||||
save_env_value("GEMINI_API_KEY", api_key)
|
||||
print_success("Gemini TTS API key saved")
|
||||
else:
|
||||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
# Save the selection
|
||||
if "tts" not in config:
|
||||
config["tts"] = {}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue