fix: follow-up for Gemini TTS salvage

Review findings addressed:
- Scan response parts for inlineData instead of blindly picking parts[0]
- Reject empty PCM bytes (prevents silently writing a header-only, 44-byte WAV)
- Catch URLError for network/DNS failures
- Add 6 new tests: empty PCM, text-before-audio part, base_url override, WAV→MP3 ffmpeg conversion, no-ffmpeg rename fallback, URLError
2026-04-25 00:51:20 +00:00 · 2026-04-16 20:59:09 +05:30 · 2026-04-16 20:59:09 +05:30 · b945f9e0d5
commit b945f9e0d5
parent acff9d36db
2 changed files with 123 additions and 1 deletions
--- a/tests/tools/test_tts_gemini.py
+++ b/tests/tools/test_tts_gemini.py
@@ -1,6 +1,7 @@
 """Tests for the Gemini TTS provider in tools/tts_tool.py."""

 import base64
+import os
 import struct
 from unittest.mock import MagicMock, patch

@@ -219,3 +220,111 @@ class TestCheckTtsRequirementsGemini:
            "tools.tts_tool._import_mistral_client", side_effect=ImportError
        ), patch("tools.tts_tool._check_neutts_available", return_value=False):
            assert check_tts_requirements() is False
+
+
+class TestGeminiTtsEdgeCases:
+    """Tests for edge cases and conversion paths added during salvage review."""
+
+    def test_empty_pcm_raises_runtime_error(self, tmp_path, monkeypatch):
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        with patch(
+            "tools.tts_tool.urllib.request.urlopen",
+            return_value=_mock_urlopen(_gemini_response(b"")),
+        ):
+            with pytest.raises(RuntimeError, match="empty audio data"):
+                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})
+
+    def test_text_part_before_audio_is_handled(self, tmp_path, monkeypatch):
+        """If the response has a text part before the audio part, still extract audio."""
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        pcm = b"\x01\x00\x02\x00"
+        mixed_response = {
+            "candidates": [
+                {
+                    "content": {
+                        "parts": [
+                            {"text": "Here is your audio"},
+                            {"inlineData": {"data": base64.b64encode(pcm).decode()}},
+                        ]
+                    }
+                }
+            ]
+        }
+        with patch(
+            "tools.tts_tool.urllib.request.urlopen",
+            return_value=_mock_urlopen(mixed_response),
+        ):
+            result = _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})
+        assert result == str(tmp_path / "out.wav")
+
+    def test_base_url_config_override(self, tmp_path, monkeypatch):
+        import json as _json
+
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        captured = {}
+
+        def fake_urlopen(req, timeout=None):
+            captured["url"] = req.full_url
+            return _mock_urlopen(_gemini_response(b"\x00\x00"))
+
+        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=fake_urlopen):
+            config = {"gemini": {"base_url": "https://custom.api.example.com/v1"}}
+            _generate_gemini_tts("hi", str(tmp_path / "out.wav"), config)
+
+        assert "custom.api.example.com" in captured["url"]
+
+    def test_wav_to_mp3_conversion_with_ffmpeg(self, tmp_path, monkeypatch):
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        pcm = b"\x01\x00\x02\x00\x03\x00"
+        mp3_path = str(tmp_path / "out.mp3")
+
+        with patch(
+            "tools.tts_tool.urllib.request.urlopen",
+            return_value=_mock_urlopen(_gemini_response(pcm)),
+        ), patch("shutil.which", return_value="/usr/bin/ffmpeg"), patch(
+            "subprocess.run"
+        ) as mock_run:
+            result = _generate_gemini_tts("hi", mp3_path, {})
+
+        # ffmpeg should be called to convert .wav -> .mp3
+        mock_run.assert_called_once()
+        cmd = mock_run.call_args[0][0]
+        assert cmd[0] == "/usr/bin/ffmpeg"
+        assert mp3_path in cmd
+
+    def test_wav_to_ogg_no_ffmpeg_renames(self, tmp_path, monkeypatch):
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        pcm = b"\x01\x00\x02\x00"
+        ogg_path = str(tmp_path / "out.ogg")
+
+        with patch(
+            "tools.tts_tool.urllib.request.urlopen",
+            return_value=_mock_urlopen(_gemini_response(pcm)),
+        ), patch("shutil.which", return_value=None):
+            result = _generate_gemini_tts("hi", ogg_path, {})
+
+        # Without ffmpeg, the WAV content gets renamed to .ogg path
+        assert result == ogg_path
+        assert os.path.exists(ogg_path)
+
+    def test_url_error_surfaced_as_runtime_error(self, tmp_path, monkeypatch):
+        import urllib.error
+
+        from tools.tts_tool import _generate_gemini_tts
+
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+        err = urllib.error.URLError("Name or service not known")
+
+        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=err):
+            with pytest.raises(RuntimeError, match="connection failed"):
+                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -651,9 +651,20 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
    except urllib.error.HTTPError as exc:
        err_body = exc.read().decode("utf-8", errors="ignore")[:500]
        raise RuntimeError(f"Gemini TTS HTTP {exc.code}: {err_body}") from exc
+    except urllib.error.URLError as exc:
+        raise RuntimeError(f"Gemini TTS connection failed: {exc.reason}") from exc

    try:
-        audio_part = response_data["candidates"][0]["content"]["parts"][0]
+        parts = response_data["candidates"][0]["content"]["parts"]
+        audio_part = None
+        for part in parts:
+            if "inlineData" in part:
+                audio_part = part
+                break
+        if audio_part is None:
+            raise RuntimeError(
+                f"Gemini TTS response missing audio payload: {str(response_data)[:300]}"
+            )
        audio_b64 = audio_part["inlineData"]["data"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
@@ -661,6 +672,8 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
        ) from exc

    pcm_bytes = base64.b64decode(audio_b64)
+    if not pcm_bytes:
+        raise RuntimeError("Gemini TTS returned empty audio data")

    # Write PCM as WAV natively — ffmpeg is only needed if the caller
    # asked for a non-WAV extension (mp3/ogg).