feat(tts): add Google Gemini TTS provider (#11229)

Adds Google Gemini TTS as the seventh voice provider, with 30 prebuilt voices (Zephyr, Puck, Kore, Enceladus, Gacrux, etc.) and natural-language prompt control. Integrates through the existing provider chain: - tools/tts_tool.py: new _generate_gemini_tts() calls the generativelanguage REST endpoint with responseModalities=[AUDIO], wraps the returned 24kHz mono 16-bit PCM (L16) in a WAV RIFF header, then ffmpeg-converts to MP3 or Opus depending on output extension. For .ogg output, libopus is forced explicitly so Telegram voice bubbles get Opus (ffmpeg defaults to Vorbis for .ogg). - hermes_cli/tools_config.py: exposes 'Google Gemini TTS' as a provider option in the curses-based 'hermes tools' UI. - hermes_cli/setup.py: adds gemini to the setup wizard picker, tool status display, and API key prompt branch (accepts existing GEMINI_API_KEY or GOOGLE_API_KEY, falls back to Edge if neither is set). - tests/tools/test_tts_gemini.py: 15 unit tests covering WAV header wrap correctness, env var fallback (GEMINI/GOOGLE), voice/model overrides, snake_case vs camelCase inlineData handling, HTTP error surfacing, and empty-audio edge cases. - docs: TTS features page updated to list seven providers with the new gemini config block and ffmpeg notes. Live-tested with an API key against gemini-2.5-flash-preview-tts: .wav, .mp3, and Telegram-compatible .ogg (Opus codec) all produce valid playable audio.
2026-04-28 01:21:43 +00:00 · 2026-04-16 14:23:16 -07:00 · 2026-04-16 14:23:16 -07:00 · fce6c3cdf6
commit fce6c3cdf6
parent 80855f964e
5 changed files with 506 additions and 6 deletions
--- a/tests/tools/test_tts_gemini.py
+++ b/tests/tools/test_tts_gemini.py
@ -0,0 +1,287 @@
+"""Tests for the Google Gemini TTS provider in tools/tts_tool.py."""
+
+import base64
+import struct
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def clean_env(monkeypatch):
+    """Remove the Gemini-related environment variables before each test.
+
+    Declared ``autouse`` so every test in this module starts with no API
+    key, no base-URL override, and no session-platform variable set.
+    """
+    vars_to_clear = [
+        "GEMINI_API_KEY",
+        "GOOGLE_API_KEY",
+        "GEMINI_BASE_URL",
+        "HERMES_SESSION_PLATFORM",
+    ]
+    for var_name in vars_to_clear:
+        # raising=False: a variable that is already absent is not an error.
+        monkeypatch.delenv(var_name, raising=False)
+
+
+@pytest.fixture
+def fake_pcm_bytes():
+    """Return 4800 zero bytes used as a stand-in for raw PCM audio.
+
+    4800 bytes is 0.1s of silence at 24kHz mono 16-bit
+    (24000 samples/s * 2 bytes/sample * 0.1 s).
+    """
+    return bytes(4800)
+
+
+@pytest.fixture
+def mock_gemini_response(fake_pcm_bytes):
+    """Build a mock HTTP 200 response shaped like a generateContent reply.
+
+    The single candidate carries one ``inlineData`` part whose ``data``
+    field is the base64 encoding of ``fake_pcm_bytes``.
+    """
+    encoded_audio = base64.b64encode(fake_pcm_bytes).decode()
+    audio_part = {
+        "inlineData": {
+            "mimeType": "audio/L16;codec=pcm;rate=24000",
+            "data": encoded_audio,
+        }
+    }
+    body = {"candidates": [{"content": {"parts": [audio_part]}}]}
+
+    response = MagicMock()
+    response.status_code = 200
+    response.json.return_value = body
+    return response
+
+
+class TestWrapPcmAsWav:
+    """Checks on the 44-byte RIFF/WAVE header that _wrap_pcm_as_wav prepends."""
+
+    def test_riff_header_structure(self):
+        """Every field of the canonical PCM WAV header matches the inputs.
+
+        Besides the chunk tags and format fields, this also pins the
+        size-bearing fields (RIFF size, fmt size, byte rate, block align,
+        data size): a wrong value there yields a file that players
+        truncate or refuse to open.
+        """
+        from tools.tts_tool import _wrap_pcm_as_wav
+
+        pcm = b"\x01\x02\x03\x04" * 10
+        wav = _wrap_pcm_as_wav(pcm, sample_rate=24000, channels=1, sample_width=2)
+
+        assert wav[:4] == b"RIFF"
+        # RIFF chunk size counts everything after the first 8 bytes:
+        # the remaining 36 header bytes plus the audio payload.
+        assert struct.unpack("<I", wav[4:8])[0] == 36 + len(pcm)
+        assert wav[8:12] == b"WAVE"
+        assert wav[12:16] == b"fmt "
+        # fmt chunk size (16 for plain PCM)
+        assert struct.unpack("<I", wav[16:20])[0] == 16
+        # Audio format (PCM=1)
+        assert struct.unpack("<H", wav[20:22])[0] == 1
+        # Channels
+        assert struct.unpack("<H", wav[22:24])[0] == 1
+        # Sample rate
+        assert struct.unpack("<I", wav[24:28])[0] == 24000
+        # Byte rate = sample_rate * channels * sample_width
+        assert struct.unpack("<I", wav[28:32])[0] == 24000 * 1 * 2
+        # Block align = channels * sample_width
+        assert struct.unpack("<H", wav[32:34])[0] == 1 * 2
+        # Bits per sample
+        assert struct.unpack("<H", wav[34:36])[0] == 16
+        assert wav[36:40] == b"data"
+        # data chunk size is the payload length in bytes
+        assert struct.unpack("<I", wav[40:44])[0] == len(pcm)
+        assert wav[44:] == pcm
+
+    def test_header_size_is_44(self):
+        """With default arguments the header adds exactly 44 bytes."""
+        from tools.tts_tool import _wrap_pcm_as_wav
+
+        pcm = b"\xff" * 100
+        wav = _wrap_pcm_as_wav(pcm)
+        assert len(wav) == 44 + len(pcm)
+        # The added bytes must be a RIFF header, not arbitrary padding.
+        assert wav[:4] == b"RIFF"
+        assert wav[44:] == pcm
+
+
+class TestGenerateGeminiTts:
+    """Tests for _generate_gemini_tts with ``requests.post`` mocked out."""
+
+    @staticmethod
+    def _voice_name_from(payload):
+        """Pull the prebuilt voice name out of a request JSON body."""
+        speech_config = payload["generationConfig"]["speechConfig"]
+        return speech_config["voiceConfig"]["prebuiltVoiceConfig"]["voiceName"]
+
+    def test_missing_api_key_raises_value_error(self, tmp_path):
+        """Without any key in the environment the call raises ValueError."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        wav_path = tmp_path / "test.wav"
+        with pytest.raises(ValueError, match="GEMINI_API_KEY"):
+            _generate_gemini_tts("Hello", str(wav_path), {})
+
+    def test_google_api_key_fallback(self, tmp_path, monkeypatch, mock_gemini_response):
+        """GOOGLE_API_KEY is used when it is the only key variable set."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GOOGLE_API_KEY", "from-google-env")
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=mock_gemini_response) as post_mock:
+            _generate_gemini_tts("Hi", wav_path, {})
+
+        # The key must be passed as the ``key`` query parameter.
+        sent_params = post_mock.call_args[1]["params"]
+        assert sent_params["key"] == "from-google-env"
+
+    def test_wav_output_fast_path(self, tmp_path, monkeypatch, mock_gemini_response, fake_pcm_bytes):
+        """A .wav target gets a RIFF header followed by the unmodified PCM."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        wav_file = tmp_path / "test.wav"
+        output_path = str(wav_file)
+
+        with patch("requests.post", return_value=mock_gemini_response):
+            returned_path = _generate_gemini_tts("Hi", output_path, {})
+
+        assert returned_path == output_path
+        written = wav_file.read_bytes()
+        assert written[:4] == b"RIFF"
+        assert written[8:12] == b"WAVE"
+        # Everything after the 44-byte header is the PCM that was sent back.
+        assert written[44:] == fake_pcm_bytes
+
+    def test_default_voice_and_model(self, tmp_path, monkeypatch, mock_gemini_response):
+        """An empty config falls back to the module-level default constants."""
+        from tools.tts_tool import (
+            DEFAULT_GEMINI_TTS_MODEL,
+            DEFAULT_GEMINI_TTS_VOICE,
+            _generate_gemini_tts,
+        )
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=mock_gemini_response) as post_mock:
+            _generate_gemini_tts("Hi", wav_path, {})
+
+        positional, keyword = post_mock.call_args
+        endpoint = positional[0]
+        assert DEFAULT_GEMINI_TTS_MODEL in endpoint
+        assert self._voice_name_from(keyword["json"]) == DEFAULT_GEMINI_TTS_VOICE
+
+    def test_custom_voice(self, tmp_path, monkeypatch, mock_gemini_response):
+        """A ``gemini.voice`` config entry is sent as the voice name."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        tts_config = {"gemini": {"voice": "Puck"}}
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=mock_gemini_response) as post_mock:
+            _generate_gemini_tts("Hi", wav_path, tts_config)
+
+        request_body = post_mock.call_args[1]["json"]
+        assert self._voice_name_from(request_body) == "Puck"
+
+    def test_custom_model(self, tmp_path, monkeypatch, mock_gemini_response):
+        """A ``gemini.model`` config entry appears in the request URL."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        tts_config = {"gemini": {"model": "gemini-2.5-pro-preview-tts"}}
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=mock_gemini_response) as post_mock:
+            _generate_gemini_tts("Hi", wav_path, tts_config)
+
+        requested_url = post_mock.call_args[0][0]
+        assert "gemini-2.5-pro-preview-tts" in requested_url
+
+    def test_response_modality_is_audio(self, tmp_path, monkeypatch, mock_gemini_response):
+        """The request asks for the AUDIO response modality only."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=mock_gemini_response) as post_mock:
+            _generate_gemini_tts("Hi", wav_path, {})
+
+        generation_config = post_mock.call_args[1]["json"]["generationConfig"]
+        assert generation_config["responseModalities"] == ["AUDIO"]
+
+    def test_http_error_raises_runtime_error(self, tmp_path, monkeypatch):
+        """An HTTP 400 reply becomes a RuntimeError with status and message."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        failing_response = MagicMock()
+        failing_response.status_code = 400
+        failing_response.json.return_value = {"error": {"message": "Invalid voice"}}
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=failing_response), pytest.raises(
+            RuntimeError, match="HTTP 400.*Invalid voice"
+        ):
+            _generate_gemini_tts("Hi", wav_path, {})
+
+    def test_empty_audio_raises(self, tmp_path, monkeypatch):
+        """An inlineData part with an empty ``data`` string is rejected."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        silent_part = {"inlineData": {"data": ""}}
+        empty_response = MagicMock()
+        empty_response.status_code = 200
+        empty_response.json.return_value = {
+            "candidates": [{"content": {"parts": [silent_part]}}]
+        }
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=empty_response), pytest.raises(
+            RuntimeError, match="empty audio"
+        ):
+            _generate_gemini_tts("Hi", wav_path, {})
+
+    def test_malformed_response_raises(self, tmp_path, monkeypatch):
+        """A 200 reply with an empty candidates list is reported as malformed."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        bad_response = MagicMock()
+        bad_response.status_code = 200
+        # No candidate means there is no content to extract audio from.
+        bad_response.json.return_value = {"candidates": []}
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=bad_response), pytest.raises(
+            RuntimeError, match="malformed"
+        ):
+            _generate_gemini_tts("Hi", wav_path, {})
+
+    def test_snake_case_inline_data_accepted(self, tmp_path, monkeypatch, fake_pcm_bytes):
+        """Some Gemini SDK versions return inline_data instead of inlineData."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        snake_case_part = {
+            "inline_data": {
+                "data": base64.b64encode(fake_pcm_bytes).decode()
+            }
+        }
+        snake_response = MagicMock()
+        snake_response.status_code = 200
+        snake_response.json.return_value = {
+            "candidates": [{"content": {"parts": [snake_case_part]}}]
+        }
+
+        wav_file = tmp_path / "test.wav"
+        with patch("requests.post", return_value=snake_response):
+            _generate_gemini_tts("Hi", str(wav_file), {})
+
+        written = wav_file.read_bytes()
+        assert written[:4] == b"RIFF"
+
+    def test_custom_base_url_env(self, tmp_path, monkeypatch, mock_gemini_response):
+        """GEMINI_BASE_URL replaces the default endpoint prefix."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        monkeypatch.setenv("GEMINI_BASE_URL", "https://custom-gemini.example.com/v1beta")
+        wav_path = str(tmp_path / "test.wav")
+
+        with patch("requests.post", return_value=mock_gemini_response) as post_mock:
+            _generate_gemini_tts("Hi", wav_path, {})
+
+        requested_url = post_mock.call_args[0][0]
+        assert requested_url.startswith("https://custom-gemini.example.com/v1beta/")
+
+
+class TestGeminiInCheckRequirements:
+    """check_tts_requirements should treat a Gemini key as sufficient."""
+
+    def test_gemini_api_key_satisfies_requirements(self, monkeypatch):
+        """Only GEMINI_API_KEY set and edge_tts unimportable still gives True."""
+        from tools.tts_tool import check_tts_requirements
+
+        import builtins
+
+        # Strip everything else
+        keys_to_clear = [
+            "ELEVENLABS_API_KEY",
+            "OPENAI_API_KEY",
+            "VOICE_TOOLS_OPENAI_KEY",
+            "MINIMAX_API_KEY",
+            "XAI_API_KEY",
+            "MISTRAL_API_KEY",
+            "GOOGLE_API_KEY",
+        ]
+        for env_name in keys_to_clear:
+            monkeypatch.delenv(env_name, raising=False)
+        monkeypatch.setenv("GEMINI_API_KEY", "k")
+
+        # Force edge_tts import to fail so we actually hit the gemini check
+        original_import = builtins.__import__
+
+        def import_without_edge_tts(name, *args, **kwargs):
+            if name != "edge_tts":
+                return original_import(name, *args, **kwargs)
+            raise ImportError("simulated")
+
+        with patch("builtins.__import__", side_effect=import_without_edge_tts):
+            outcome = check_tts_requirements()
+
+        assert outcome is True