mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(tts): add Google Gemini TTS provider (#11229)
Adds Google Gemini TTS as the seventh voice provider, with 30 prebuilt voices (Zephyr, Puck, Kore, Enceladus, Gacrux, etc.) and natural-language prompt control. Integrates through the existing provider chain: - tools/tts_tool.py: new _generate_gemini_tts() calls the generativelanguage REST endpoint with responseModalities=[AUDIO], wraps the returned 24kHz mono 16-bit PCM (L16) in a WAV RIFF header, then ffmpeg-converts to MP3 or Opus depending on output extension. For .ogg output, libopus is forced explicitly so Telegram voice bubbles get Opus (ffmpeg defaults to Vorbis for .ogg). - hermes_cli/tools_config.py: exposes 'Google Gemini TTS' as a provider option in the curses-based 'hermes tools' UI. - hermes_cli/setup.py: adds gemini to the setup wizard picker, tool status display, and API key prompt branch (accepts existing GEMINI_API_KEY or GOOGLE_API_KEY, falls back to Edge if neither is set). - tests/tools/test_tts_gemini.py: 15 unit tests covering WAV header wrap correctness, env var fallback (GEMINI/GOOGLE), voice/model overrides, snake_case vs camelCase inlineData handling, HTTP error surfacing, and empty-audio edge cases. - docs: TTS features page updated to list seven providers with the new gemini config block and ffmpeg notes. Live-tested with a real API key against gemini-2.5-flash-preview-tts: .wav, .mp3, and Telegram-compatible .ogg (Opus codec) all produce valid playable audio.
This commit is contained in:
parent
80855f964e
commit
fce6c3cdf6
5 changed files with 506 additions and 6 deletions
|
|
@ -430,6 +430,8 @@ def _print_setup_summary(config: dict, hermes_home):
|
|||
tool_status.append(("Text-to-Speech (MiniMax)", True, None))
|
||||
elif tts_provider == "mistral" and get_env_value("MISTRAL_API_KEY"):
|
||||
tool_status.append(("Text-to-Speech (Mistral Voxtral)", True, None))
|
||||
elif tts_provider == "gemini" and (get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")):
|
||||
tool_status.append(("Text-to-Speech (Google Gemini)", True, None))
|
||||
elif tts_provider == "neutts":
|
||||
try:
|
||||
import importlib.util
|
||||
|
|
@ -913,6 +915,7 @@ def _setup_tts_provider(config: dict):
|
|||
"xai": "xAI TTS",
|
||||
"minimax": "MiniMax TTS",
|
||||
"mistral": "Mistral Voxtral TTS",
|
||||
"gemini": "Google Gemini TTS",
|
||||
"neutts": "NeuTTS",
|
||||
}
|
||||
current_label = provider_labels.get(current_provider, current_provider)
|
||||
|
|
@ -935,10 +938,11 @@ def _setup_tts_provider(config: dict):
|
|||
"xAI TTS (Grok voices, needs API key)",
|
||||
"MiniMax TTS (high quality with voice cloning, needs API key)",
|
||||
"Mistral Voxtral TTS (multilingual, native Opus, needs API key)",
|
||||
"Google Gemini TTS (30 prebuilt voices, prompt-controllable, needs API key)",
|
||||
"NeuTTS (local on-device, free, ~300MB model download)",
|
||||
]
|
||||
)
|
||||
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "neutts"])
|
||||
providers.extend(["edge", "elevenlabs", "openai", "xai", "minimax", "mistral", "gemini", "neutts"])
|
||||
choices.append(f"Keep current ({current_label})")
|
||||
keep_current_idx = len(choices) - 1
|
||||
idx = prompt_choice("Select TTS provider:", choices, keep_current_idx)
|
||||
|
|
@ -1045,6 +1049,19 @@ def _setup_tts_provider(config: dict):
|
|||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
elif selected == "gemini":
|
||||
existing = get_env_value("GEMINI_API_KEY") or get_env_value("GOOGLE_API_KEY")
|
||||
if not existing:
|
||||
print()
|
||||
print_info("Get a free API key at https://aistudio.google.com/app/apikey")
|
||||
api_key = prompt("Gemini API key for TTS", password=True)
|
||||
if api_key:
|
||||
save_env_value("GEMINI_API_KEY", api_key)
|
||||
print_success("Gemini TTS API key saved")
|
||||
else:
|
||||
print_warning("No API key provided. Falling back to Edge TTS.")
|
||||
selected = "edge"
|
||||
|
||||
# Save the selection
|
||||
if "tts" not in config:
|
||||
config["tts"] = {}
|
||||
|
|
|
|||
|
|
@ -172,6 +172,15 @@ TOOL_CATEGORIES = {
|
|||
],
|
||||
"tts_provider": "mistral",
|
||||
},
|
||||
{
|
||||
"name": "Google Gemini TTS",
|
||||
"badge": "preview",
|
||||
"tag": "30 prebuilt voices, controllable via prompts",
|
||||
"env_vars": [
|
||||
{"key": "GEMINI_API_KEY", "prompt": "Gemini API key", "url": "https://aistudio.google.com/app/apikey"},
|
||||
],
|
||||
"tts_provider": "gemini",
|
||||
},
|
||||
],
|
||||
},
|
||||
"web": {
|
||||
|
|
|
|||
287
tests/tools/test_tts_gemini.py
Normal file
287
tests/tools/test_tts_gemini.py
Normal file
|
|
@ -0,0 +1,287 @@
|
|||
"""Tests for the Google Gemini TTS provider in tools/tts_tool.py."""
|
||||
|
||||
import base64
|
||||
import struct
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def clean_env(monkeypatch):
    """Remove the Gemini-related and session-platform variables before every test.

    Runs automatically (``autouse``) so that no test is influenced by keys or
    base-URL overrides present in the developer's real environment.
    """
    scrubbed_names = (
        "GEMINI_API_KEY",
        "GOOGLE_API_KEY",
        "GEMINI_BASE_URL",
        "HERMES_SESSION_PLATFORM",
    )
    for name in scrubbed_names:
        # raising=False: a variable that is already absent is not an error.
        monkeypatch.delenv(name, raising=False)
|
||||
|
||||
|
||||
@pytest.fixture
def fake_pcm_bytes():
    """Return 4800 zero bytes: 0.1 s of silence at 24 kHz, mono, 16-bit."""
    return bytes(4800)
|
||||
|
||||
|
||||
@pytest.fixture
def mock_gemini_response(fake_pcm_bytes):
    """Stand-in for a successful (HTTP 200) Gemini generateContent response.

    The JSON body carries a single candidate whose only part is a camelCase
    ``inlineData`` entry holding ``fake_pcm_bytes`` encoded as base64.
    """
    encoded_audio = base64.b64encode(fake_pcm_bytes).decode()
    audio_part = {
        "inlineData": {
            "mimeType": "audio/L16;codec=pcm;rate=24000",
            "data": encoded_audio,
        }
    }
    body = {"candidates": [{"content": {"parts": [audio_part]}}]}

    reply = MagicMock()
    reply.status_code = 200
    reply.json.return_value = body
    return reply
|
||||
|
||||
|
||||
class TestWrapPcmAsWav:
    """Byte-level checks of the WAV header produced by ``_wrap_pcm_as_wav``."""

    def test_riff_header_structure(self):
        """Every field of the 44-byte PCM header carries the expected value."""
        from tools.tts_tool import _wrap_pcm_as_wav

        pcm = b"\x01\x02\x03\x04" * 10
        wav = _wrap_pcm_as_wav(pcm, sample_rate=24000, channels=1, sample_width=2)

        assert wav[:4] == b"RIFF"
        # RIFF size counts everything after the 8-byte "RIFF" + size preamble:
        # "WAVE" (4) + fmt chunk (24) + data chunk header (8) + payload.
        assert struct.unpack("<I", wav[4:8])[0] == 36 + len(pcm)
        assert wav[8:12] == b"WAVE"
        assert wav[12:16] == b"fmt "
        # fmt chunk body size for plain PCM
        assert struct.unpack("<I", wav[16:20])[0] == 16
        # Audio format (PCM=1)
        assert struct.unpack("<H", wav[20:22])[0] == 1
        # Channels
        assert struct.unpack("<H", wav[22:24])[0] == 1
        # Sample rate
        assert struct.unpack("<I", wav[24:28])[0] == 24000
        # Byte rate = sample_rate * channels * sample_width
        assert struct.unpack("<I", wav[28:32])[0] == 24000 * 1 * 2
        # Block align = channels * sample_width
        assert struct.unpack("<H", wav[32:34])[0] == 1 * 2
        # Bits per sample
        assert struct.unpack("<H", wav[34:36])[0] == 16
        assert wav[36:40] == b"data"
        # Data chunk size is the raw payload length
        assert struct.unpack("<I", wav[40:44])[0] == len(pcm)
        assert wav[44:] == pcm

    def test_header_size_is_44(self):
        """With default parameters the header adds exactly 44 bytes to an even-length payload."""
        from tools.tts_tool import _wrap_pcm_as_wav

        pcm = b"\xff" * 100
        wav = _wrap_pcm_as_wav(pcm)
        assert len(wav) == 44 + len(pcm)
|
||||
|
||||
|
||||
class TestGenerateGeminiTts:
    """Tests for ``_generate_gemini_tts`` with ``requests.post`` patched out.

    All tests request a ``.wav`` output path, so the ffmpeg conversion branch
    is never exercised here.
    """

    @staticmethod
    def _voice_name(payload):
        """Extract the prebuilt voice name from a generateContent request payload."""
        speech_config = payload["generationConfig"]["speechConfig"]
        return speech_config["voiceConfig"]["prebuiltVoiceConfig"]["voiceName"]

    @staticmethod
    def _response(status_code, body):
        """Build a stand-in HTTP response with the given status code and JSON body."""
        reply = MagicMock()
        reply.status_code = status_code
        reply.json.return_value = body
        return reply

    def test_missing_api_key_raises_value_error(self, tmp_path):
        """Without any key in the environment the call fails before any request."""
        from tools.tts_tool import _generate_gemini_tts

        target = str(tmp_path / "test.wav")
        with pytest.raises(ValueError, match="GEMINI_API_KEY"):
            _generate_gemini_tts("Hello", target, {})

    def test_google_api_key_fallback(self, tmp_path, monkeypatch, mock_gemini_response):
        """GOOGLE_API_KEY is used when GEMINI_API_KEY is absent."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GOOGLE_API_KEY", "from-google-env")
        target = str(tmp_path / "test.wav")

        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", target, {})

        # The key must have been sent as the ``key`` query parameter.
        sent_params = mock_post.call_args[1]["params"]
        assert sent_params["key"] == "from-google-env"

    def test_wav_output_fast_path(self, tmp_path, monkeypatch, mock_gemini_response, fake_pcm_bytes):
        """A .wav target is written directly: header followed by the untouched PCM."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        wav_file = tmp_path / "test.wav"
        target = str(wav_file)

        with patch("requests.post", return_value=mock_gemini_response):
            returned = _generate_gemini_tts("Hi", target, {})

        assert returned == target
        written = wav_file.read_bytes()
        assert written[:4] == b"RIFF"
        assert written[8:12] == b"WAVE"
        # Everything after the 44-byte header is the PCM we fed in.
        assert written[44:] == fake_pcm_bytes

    def test_default_voice_and_model(self, tmp_path, monkeypatch, mock_gemini_response):
        """An empty config falls back to the module's default model and voice."""
        from tools.tts_tool import (
            DEFAULT_GEMINI_TTS_MODEL,
            DEFAULT_GEMINI_TTS_VOICE,
            _generate_gemini_tts,
        )

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")

        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

        positional, keywords = mock_post.call_args
        assert DEFAULT_GEMINI_TTS_MODEL in positional[0]
        assert self._voice_name(keywords["json"]) == DEFAULT_GEMINI_TTS_VOICE

    def test_custom_voice(self, tmp_path, monkeypatch, mock_gemini_response):
        """A configured voice is forwarded in the request payload."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        tts_config = {"gemini": {"voice": "Puck"}}

        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), tts_config)

        sent_payload = mock_post.call_args[1]["json"]
        assert self._voice_name(sent_payload) == "Puck"

    def test_custom_model(self, tmp_path, monkeypatch, mock_gemini_response):
        """A configured model name ends up in the endpoint URL."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        tts_config = {"gemini": {"model": "gemini-2.5-pro-preview-tts"}}

        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), tts_config)

        url = mock_post.call_args[0][0]
        assert "gemini-2.5-pro-preview-tts" in url

    def test_response_modality_is_audio(self, tmp_path, monkeypatch, mock_gemini_response):
        """The request asks for audio output only."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")

        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

        generation_config = mock_post.call_args[1]["json"]["generationConfig"]
        assert generation_config["responseModalities"] == ["AUDIO"]

    def test_http_error_raises_runtime_error(self, tmp_path, monkeypatch):
        """A non-200 status surfaces both the status code and the API's message."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        failure = self._response(400, {"error": {"message": "Invalid voice"}})

        with patch("requests.post", return_value=failure):
            with pytest.raises(RuntimeError, match="HTTP 400.*Invalid voice"):
                _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

    def test_empty_audio_raises(self, tmp_path, monkeypatch):
        """An inlineData part with an empty ``data`` string is rejected."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        empty_part = {"inlineData": {"data": ""}}
        reply = self._response(
            200,
            {"candidates": [{"content": {"parts": [empty_part]}}]},
        )

        with patch("requests.post", return_value=reply):
            with pytest.raises(RuntimeError, match="empty audio"):
                _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

    def test_malformed_response_raises(self, tmp_path, monkeypatch):
        """A 200 response with no candidates is reported as malformed."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        reply = self._response(200, {"candidates": []})  # no content

        with patch("requests.post", return_value=reply):
            with pytest.raises(RuntimeError, match="malformed"):
                _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

    def test_snake_case_inline_data_accepted(self, tmp_path, monkeypatch, fake_pcm_bytes):
        """Some Gemini SDK versions return inline_data instead of inlineData."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        snake_part = {
            "inline_data": {
                "data": base64.b64encode(fake_pcm_bytes).decode()
            }
        }
        reply = self._response(
            200,
            {"candidates": [{"content": {"parts": [snake_part]}}]},
        )

        wav_file = tmp_path / "test.wav"
        with patch("requests.post", return_value=reply):
            _generate_gemini_tts("Hi", str(wav_file), {})

        assert wav_file.read_bytes()[:4] == b"RIFF"

    def test_custom_base_url_env(self, tmp_path, monkeypatch, mock_gemini_response):
        """GEMINI_BASE_URL overrides the default API host."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        monkeypatch.setenv("GEMINI_BASE_URL", "https://custom-gemini.example.com/v1beta")

        with patch("requests.post", return_value=mock_gemini_response) as mock_post:
            _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), {})

        url = mock_post.call_args[0][0]
        assert url.startswith("https://custom-gemini.example.com/v1beta/")
|
||||
|
||||
|
||||
class TestGeminiInCheckRequirements:
    """``check_tts_requirements`` must treat a Gemini key as sufficient."""

    def test_gemini_api_key_satisfies_requirements(self, monkeypatch):
        """With only GEMINI_API_KEY set and edge_tts unimportable, the check passes."""
        from tools.tts_tool import check_tts_requirements

        # Strip every other provider's credentials.
        other_provider_keys = (
            "ELEVENLABS_API_KEY",
            "OPENAI_API_KEY",
            "VOICE_TOOLS_OPENAI_KEY",
            "MINIMAX_API_KEY",
            "XAI_API_KEY",
            "MISTRAL_API_KEY",
            "GOOGLE_API_KEY",
        )
        for name in other_provider_keys:
            monkeypatch.delenv(name, raising=False)
        monkeypatch.setenv("GEMINI_API_KEY", "k")

        # Force edge_tts import to fail so we actually hit the gemini check.
        import builtins

        original_import = builtins.__import__

        def import_without_edge_tts(name, *args, **kwargs):
            if name != "edge_tts":
                return original_import(name, *args, **kwargs)
            raise ImportError("simulated")

        with patch("builtins.__import__", side_effect=import_without_edge_tts):
            outcome = check_tts_requirements()

        assert outcome is True
|
||||
|
|
@ -2,12 +2,13 @@
|
|||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports six TTS providers:
|
||||
Supports seven TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
|
||||
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
|
||||
- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
|
||||
Output formats:
|
||||
|
|
@ -99,6 +100,13 @@ DEFAULT_XAI_LANGUAGE = "en"
|
|||
DEFAULT_XAI_SAMPLE_RATE = 24000
|
||||
DEFAULT_XAI_BIT_RATE = 128000
|
||||
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
|
||||
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
||||
DEFAULT_GEMINI_TTS_VOICE = "Kore"
|
||||
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
|
||||
# PCM output specs for Gemini TTS (fixed by the API)
|
||||
GEMINI_TTS_SAMPLE_RATE = 24000
|
||||
GEMINI_TTS_CHANNELS = 1
|
||||
GEMINI_TTS_SAMPLE_WIDTH = 2 # 16-bit PCM (L16)
|
||||
|
||||
def _get_default_output_dir() -> str:
|
||||
from hermes_constants import get_hermes_dir
|
||||
|
|
@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
|||
return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: Google Gemini TTS
|
||||
# ===========================================================================
|
||||
def _wrap_pcm_as_wav(
    pcm_bytes: bytes,
    sample_rate: int = GEMINI_TTS_SAMPLE_RATE,
    channels: int = GEMINI_TTS_CHANNELS,
    sample_width: int = GEMINI_TTS_SAMPLE_WIDTH,
) -> bytes:
    """Wrap raw signed-little-endian PCM with a standard WAV RIFF header.

    Gemini TTS returns audio/L16;codec=pcm;rate=24000 -- raw PCM samples with
    no container. We add a minimal WAV header so the file is playable and
    ffmpeg can re-encode it to MP3/Opus downstream.

    Args:
        pcm_bytes: Raw PCM sample data, copied verbatim after the header.
        sample_rate: Samples per second written to the ``fmt`` chunk.
        channels: Number of interleaved channels.
        sample_width: Bytes per sample per channel (2 means 16-bit).

    Returns:
        The complete WAV file contents: a 44-byte header, the PCM payload,
        and a single zero pad byte when the payload length is odd.
    """
    import struct

    byte_rate = sample_rate * channels * sample_width
    block_align = channels * sample_width
    data_size = len(pcm_bytes)
    # RIFF chunks must be word-aligned: an odd-sized chunk body is followed by
    # one zero pad byte. The pad is NOT counted in the data chunk's own size
    # field, but it IS part of the enclosing RIFF chunk's size.
    pad = b"\x00" if data_size % 2 else b""
    fmt_chunk = struct.pack(
        "<4sIHHIIHH",
        b"fmt ",
        16,  # fmt chunk size (PCM)
        1,  # audio format (PCM)
        channels,
        sample_rate,
        byte_rate,
        block_align,
        sample_width * 8,  # bits per sample
    )
    data_chunk_header = struct.pack("<4sI", b"data", data_size)
    # RIFF size covers everything after the 8-byte "RIFF" + size preamble:
    # the "WAVE" tag (4 bytes) plus both chunks.
    riff_size = 4 + len(fmt_chunk) + len(data_chunk_header) + data_size + len(pad)
    riff_header = struct.pack("<4sI4s", b"RIFF", riff_size, b"WAVE")
    return b"".join((riff_header, fmt_chunk, data_chunk_header, pcm_bytes, pad))
|
||||
|
||||
|
||||
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using Google Gemini TTS.

    Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns
    raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF
    header to produce a playable file, then ffmpeg-convert to MP3 / Opus if
    the caller requested those formats (same pattern as NeuTTS).

    Args:
        text: Text to convert (prompt-style; supports inline direction like
            "Say cheerfully:" and audio tags like [whispers]).
        output_path: Where to save the audio file (.wav, .mp3, or .ogg).
        tts_config: TTS config dict. The optional ``gemini`` sub-dict may set
            ``model``, ``voice`` and ``base_url``; missing, empty or null
            values fall back to the module defaults (``base_url`` also falls
            back to the GEMINI_BASE_URL environment variable).

    Returns:
        Path to the saved audio file.

    Raises:
        ValueError: Neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.
        RuntimeError: The API answered with a non-200 status, the response
            was malformed, contained no / empty / undecodable audio, or the
            ffmpeg conversion failed.
    """
    import requests

    api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip()
    if not api_key:
        raise ValueError(
            "GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
        )

    # ``or {}`` rather than a .get() default: a ``gemini:`` key that is present
    # but null in the config would otherwise yield None and crash below.
    gemini_config = (tts_config or {}).get("gemini") or {}
    # ``or DEFAULT`` before str(): a null model/voice must not turn into the
    # literal string "None".
    model = str(gemini_config.get("model") or DEFAULT_GEMINI_TTS_MODEL).strip() or DEFAULT_GEMINI_TTS_MODEL
    voice = str(gemini_config.get("voice") or DEFAULT_GEMINI_TTS_VOICE).strip() or DEFAULT_GEMINI_TTS_VOICE
    base_url = str(
        gemini_config.get("base_url")
        or os.getenv("GEMINI_BASE_URL")
        or DEFAULT_GEMINI_TTS_BASE_URL
    ).strip().rstrip("/")

    payload: Dict[str, Any] = {
        "contents": [{"parts": [{"text": text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {"voiceName": voice},
                },
            },
        },
    }

    endpoint = f"{base_url}/models/{model}:generateContent"
    # NOTE(review): the key travels in the query string, so it may show up in
    # URLs embedded in transport-level exception messages -- consider moving
    # it to a request header if that is a concern.
    response = requests.post(
        endpoint,
        params={"key": api_key},
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=60,
    )
    if response.status_code != 200:
        # Surface the API error message when present. Extracting the detail is
        # best-effort, so any failure here falls back to the raw body text.
        try:
            err = response.json().get("error", {})
            detail = err.get("message") or response.text[:300]
        except Exception:
            detail = response.text[:300]
        raise RuntimeError(
            f"Gemini TTS API error (HTTP {response.status_code}): {detail}"
        )

    try:
        data = response.json()
        parts = data["candidates"][0]["content"]["parts"]
        # Accept both the camelCase REST spelling and the snake_case variant;
        # skip any part that is not a mapping instead of crashing on it.
        audio_part = next(
            (
                p for p in parts
                if isinstance(p, dict) and ("inlineData" in p or "inline_data" in p)
            ),
            None,
        )
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError covers a 200 response whose body is not valid JSON.
        raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e

    if audio_part is None:
        raise RuntimeError("Gemini TTS response contained no audio data")
    inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {}
    audio_b64 = inline.get("data", "") if isinstance(inline, dict) else ""

    if not audio_b64:
        raise RuntimeError("Gemini TTS returned empty audio data")

    try:
        pcm_bytes = base64.b64decode(audio_b64)
    except (ValueError, TypeError) as e:
        # binascii.Error (bad padding etc.) is a ValueError subclass.
        raise RuntimeError(f"Gemini TTS returned undecodable audio data: {e}") from e
    wav_bytes = _wrap_pcm_as_wav(pcm_bytes)

    # Fast path: caller wants WAV directly, just write.
    if output_path.lower().endswith(".wav"):
        with open(output_path, "wb") as f:
            f.write(wav_bytes)
        return output_path

    # Otherwise write WAV to a temp file and ffmpeg-convert to the target
    # format (.mp3 or .ogg). If ffmpeg is missing, fall back to copying the
    # WAV -- this matches the NeuTTS behavior and keeps the tool usable on
    # systems without ffmpeg (audio still plays, just with a misleading
    # extension).
    wav_path = None
    try:
        # Created inside the try and with the name bound before writing, so
        # the temp file is removed even when the write itself fails.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name
            tmp.write(wav_bytes)

        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            # For .ogg output, force libopus encoding (Telegram voice bubbles
            # require Opus specifically; ffmpeg's default for .ogg is Vorbis).
            if output_path.lower().endswith(".ogg"):
                cmd = [
                    ffmpeg, "-i", wav_path,
                    "-acodec", "libopus", "-ac", "1",
                    "-b:a", "64k", "-vbr", "off",
                    "-y", "-loglevel", "error",
                    output_path,
                ]
            else:
                cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            result = subprocess.run(cmd, capture_output=True, timeout=30)
            if result.returncode != 0:
                stderr = result.stderr.decode("utf-8", errors="ignore")[:300]
                raise RuntimeError(f"ffmpeg conversion failed: {stderr}")
        else:
            logger.warning(
                "ffmpeg not found; writing raw WAV to %s (extension may be misleading)",
                output_path,
            )
            shutil.copyfile(wav_path, output_path)
    finally:
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the
                # real outcome of the conversion.
                pass

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NeuTTS (local, on-device TTS via neutts_cli)
|
||||
# ===========================================================================
|
||||
|
|
@ -634,7 +810,7 @@ def text_to_speech_tool(
|
|||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
# Use .ogg for Telegram with providers that support native Opus output,
|
||||
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral"):
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
|
||||
file_path = out_dir / f"tts_{timestamp}.ogg"
|
||||
else:
|
||||
file_path = out_dir / f"tts_{timestamp}.mp3"
|
||||
|
|
@ -687,6 +863,10 @@ def text_to_speech_tool(
|
|||
logger.info("Generating speech with Mistral Voxtral TTS...")
|
||||
_generate_mistral_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "gemini":
|
||||
logger.info("Generating speech with Google Gemini TTS...")
|
||||
_generate_gemini_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "neutts":
|
||||
if not _check_neutts_available():
|
||||
return json.dumps({
|
||||
|
|
@ -741,7 +921,7 @@ def text_to_speech_tool(
|
|||
if opus_path:
|
||||
file_str = opus_path
|
||||
voice_compatible = True
|
||||
elif provider in ("elevenlabs", "openai", "mistral"):
|
||||
elif provider in ("elevenlabs", "openai", "mistral", "gemini"):
|
||||
voice_compatible = file_str.endswith(".ogg")
|
||||
|
||||
file_size = os.path.getsize(file_str)
|
||||
|
|
@ -811,6 +991,8 @@ def check_tts_requirements() -> bool:
|
|||
return True
|
||||
if os.getenv("XAI_API_KEY"):
|
||||
return True
|
||||
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
|
||||
return True
|
||||
try:
|
||||
_import_mistral_client()
|
||||
if os.getenv("MISTRAL_API_KEY"):
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ If you have a paid [Nous Portal](https://portal.nousresearch.com) subscription,
|
|||
|
||||
## Text-to-Speech
|
||||
|
||||
Convert text to speech with six providers:
|
||||
Convert text to speech with seven providers:
|
||||
|
||||
| Provider | Quality | Cost | API Key |
|
||||
|----------|---------|------|---------|
|
||||
|
|
@ -23,6 +23,7 @@ Convert text to speech with six providers:
|
|||
| **OpenAI TTS** | Good | Paid | `VOICE_TOOLS_OPENAI_KEY` |
|
||||
| **MiniMax TTS** | Excellent | Paid | `MINIMAX_API_KEY` |
|
||||
| **Mistral (Voxtral TTS)** | Excellent | Paid | `MISTRAL_API_KEY` |
|
||||
| **Google Gemini TTS** | Excellent | Free tier | `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) |
|
||||
| **NeuTTS** | Good | Free | None needed |
|
||||
|
||||
### Platform Delivery
|
||||
|
|
@ -39,7 +40,7 @@ Convert text to speech with six providers:
|
|||
```yaml
|
||||
# In ~/.hermes/config.yaml
|
||||
tts:
|
||||
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "neutts"
|
||||
provider: "edge" # "edge" | "elevenlabs" | "openai" | "minimax" | "mistral" | "gemini" | "neutts"
|
||||
speed: 1.0 # Global speed multiplier (provider-specific settings override this)
|
||||
edge:
|
||||
voice: "en-US-AriaNeural" # 322 voices, 74 languages
|
||||
|
|
@ -61,6 +62,9 @@ tts:
|
|||
mistral:
|
||||
model: "voxtral-mini-tts-2603"
|
||||
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
|
||||
gemini:
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
|
||||
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc.
|
||||
neutts:
|
||||
ref_audio: ''
|
||||
ref_text: ''
|
||||
|
|
@ -77,6 +81,7 @@ Telegram voice bubbles require Opus/OGG audio format:
|
|||
- **OpenAI, ElevenLabs, and Mistral** produce Opus natively — no extra setup
|
||||
- **Edge TTS** (default) outputs MP3 and needs **ffmpeg** to convert:
|
||||
- **MiniMax TTS** outputs MP3 and needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
- **Google Gemini TTS** outputs raw PCM and uses **ffmpeg** to encode Opus directly for Telegram voice bubbles
|
||||
- **NeuTTS** outputs WAV and also needs **ffmpeg** to convert for Telegram voice bubbles
|
||||
|
||||
```bash
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue