fix: follow-up for Gemini TTS salvage

Review findings addressed:
- Scan response parts for inlineData instead of blindly picking parts[0]
- Reject empty PCM bytes with an error (prevents silently writing a header-only 44-byte WAV)
- Catch URLError for network/DNS failures
- 6 new tests: empty PCM, text-before-audio part, base_url override,
  WAV→MP3 ffmpeg conversion, no-ffmpeg rename fallback, URLError
This commit is contained in:
kshitijk4poor 2026-04-16 20:59:09 +05:30
parent acff9d36db
commit b945f9e0d5
2 changed files with 123 additions and 1 deletion

View file

@ -1,6 +1,7 @@
"""Tests for the Gemini TTS provider in tools/tts_tool.py."""
import base64
import os
import struct
from unittest.mock import MagicMock, patch
@ -219,3 +220,111 @@ class TestCheckTtsRequirementsGemini:
"tools.tts_tool._import_mistral_client", side_effect=ImportError
), patch("tools.tts_tool._check_neutts_available", return_value=False):
assert check_tts_requirements() is False
class TestGeminiTtsEdgeCases:
    """Tests for edge cases and conversion paths added during salvage review.

    Every test sets ``GEMINI_API_KEY`` through ``monkeypatch`` and replaces
    ``tools.tts_tool.urllib.request.urlopen`` so no network call is made.
    """

    def test_empty_pcm_raises_runtime_error(self, tmp_path, monkeypatch):
        """An empty audio payload is reported as a RuntimeError."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(b"")),
        ):
            with pytest.raises(RuntimeError, match="empty audio data"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

    def test_text_part_before_audio_is_handled(self, tmp_path, monkeypatch):
        """If the response has a text part before the audio part, still extract audio."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = b"\x01\x00\x02\x00"
        # The audio lives in parts[1]; parts[0] is a plain text part.
        mixed_response = {
            "candidates": [
                {
                    "content": {
                        "parts": [
                            {"text": "Here is your audio"},
                            {"inlineData": {"data": base64.b64encode(pcm).decode()}},
                        ]
                    }
                }
            ]
        }
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(mixed_response),
        ):
            result = _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

        assert result == str(tmp_path / "out.wav")

    def test_base_url_config_override(self, tmp_path, monkeypatch):
        """A ``gemini.base_url`` config entry is used for the request URL."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        captured = {}

        def fake_urlopen(req, timeout=None):
            # Record the URL the provider actually requested.
            captured["url"] = req.full_url
            return _mock_urlopen(_gemini_response(b"\x00\x00"))

        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=fake_urlopen):
            config = {"gemini": {"base_url": "https://custom.api.example.com/v1"}}
            _generate_gemini_tts("hi", str(tmp_path / "out.wav"), config)

        assert "custom.api.example.com" in captured["url"]

    def test_wav_to_mp3_conversion_with_ffmpeg(self, tmp_path, monkeypatch):
        """With ffmpeg available, an .mp3 target triggers one ffmpeg invocation."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = b"\x01\x00\x02\x00\x03\x00"
        mp3_path = str(tmp_path / "out.mp3")
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(pcm)),
        ), patch("shutil.which", return_value="/usr/bin/ffmpeg"), patch(
            "subprocess.run"
        ) as mock_run:
            _generate_gemini_tts("hi", mp3_path, {})

        # ffmpeg should be called to convert .wav -> .mp3
        mock_run.assert_called_once()
        cmd = mock_run.call_args[0][0]
        assert cmd[0] == "/usr/bin/ffmpeg"
        assert mp3_path in cmd

    def test_wav_to_ogg_no_ffmpeg_renames(self, tmp_path, monkeypatch):
        """Without ffmpeg, the output still lands at the requested .ogg path."""
        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        pcm = b"\x01\x00\x02\x00"
        ogg_path = str(tmp_path / "out.ogg")
        with patch(
            "tools.tts_tool.urllib.request.urlopen",
            return_value=_mock_urlopen(_gemini_response(pcm)),
        ), patch("shutil.which", return_value=None):
            result = _generate_gemini_tts("hi", ogg_path, {})

        # Without ffmpeg, the WAV content gets renamed to .ogg path
        assert result == ogg_path
        assert os.path.exists(ogg_path)

    def test_url_error_surfaced_as_runtime_error(self, tmp_path, monkeypatch):
        """A URLError from urlopen is re-raised as a RuntimeError."""
        import urllib.error

        from tools.tts_tool import _generate_gemini_tts

        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
        err = urllib.error.URLError("Name or service not known")
        with patch("tools.tts_tool.urllib.request.urlopen", side_effect=err):
            with pytest.raises(RuntimeError, match="connection failed"):
                _generate_gemini_tts("hi", str(tmp_path / "out.wav"), {})

View file

@ -651,9 +651,20 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
except urllib.error.HTTPError as exc:
err_body = exc.read().decode("utf-8", errors="ignore")[:500]
raise RuntimeError(f"Gemini TTS HTTP {exc.code}: {err_body}") from exc
except urllib.error.URLError as exc:
raise RuntimeError(f"Gemini TTS connection failed: {exc.reason}") from exc
try:
audio_part = response_data["candidates"][0]["content"]["parts"][0]
parts = response_data["candidates"][0]["content"]["parts"]
audio_part = None
for part in parts:
if "inlineData" in part:
audio_part = part
break
if audio_part is None:
raise RuntimeError(
f"Gemini TTS response missing audio payload: {str(response_data)[:300]}"
)
audio_b64 = audio_part["inlineData"]["data"]
except (KeyError, IndexError, TypeError) as exc:
raise RuntimeError(
@ -661,6 +672,8 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
) from exc
pcm_bytes = base64.b64decode(audio_b64)
if not pcm_bytes:
raise RuntimeError("Gemini TTS returned empty audio data")
# Write PCM as WAV natively — ffmpeg is only needed if the caller
# asked for a non-WAV extension (mp3/ogg).