fix(gateway): use OGG for Telegram auto TTS

2026-06-11 08:42:11 +00:00 · 2026-05-05 11:28:27 +00:00 · 2026-05-05 11:28:27 +00:00 · ae82eed2b1
commit ae82eed2b1
parent cb83149dc6
2 changed files with 104 additions and 3 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -12422,11 +12422,12 @@ class GatewayRunner:
            if not tts_text:
                return
-            # Use .mp3 extension so edge-tts conversion to opus works correctly.
+            # Telegram's adapter only sends native voice bubbles for OGG/Opus.
-            # The TTS tool may convert to .ogg — use file_path from result.
+            # Other platforms keep the existing MP3 default.
            audio_ext = "ogg" if event.source.platform == Platform.TELEGRAM else "mp3"
            audio_path = os.path.join(
                tempfile.gettempdir(), "hermes_voice",
-                f"tts_reply_{_uuid.uuid4().hex[:12]}.mp3",
+                f"tts_reply_{_uuid.uuid4().hex[:12]}.{audio_ext}",
            )
            os.makedirs(os.path.dirname(audio_path), exist_ok=True)
--- a/tests/gateway/test_auto_voice_reply_format.py
+++ b/tests/gateway/test_auto_voice_reply_format.py
@ -0,0 +1,100 @@
 """Tests for gateway auto-TTS voice reply audio format selection."""
 import json
 from pathlib import Path
 from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from gateway.config import Platform
 from gateway.platforms.base import MessageEvent
 from gateway.run import GatewayRunner
 from gateway.session import SessionSource
 class TestAutoVoiceReplyFormat:
    @pytest.mark.asyncio
    async def test_telegram_auto_voice_reply_requests_ogg_for_native_voice_bubble(self):
        """Telegram auto-TTS should request OGG/Opus so send_voice sends a voice bubble."""
        runner = _make_runner()
        adapter = _make_adapter(Platform.TELEGRAM)
        runner.adapters[Platform.TELEGRAM] = adapter
        event = _make_event(Platform.TELEGRAM)
        requested_paths = []
        def fake_tts(*, text, output_path):
            requested_paths.append(output_path)
            assert output_path.endswith(".ogg")
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            Path(output_path).write_bytes(b"fake ogg opus")
            return json.dumps({
                "success": True,
                "file_path": output_path,
                "provider": "gemini",
                "voice_compatible": True,
            })
        with patch("tools.tts_tool.text_to_speech_tool", side_effect=fake_tts):
            await runner._send_voice_reply(event, "hello from auto tts")
        assert requested_paths
        assert requested_paths[0].endswith(".ogg")
        adapter.send_voice.assert_awaited_once()
        assert adapter.send_voice.await_args.kwargs["audio_path"].endswith(".ogg")
    @pytest.mark.asyncio
    async def test_non_telegram_auto_voice_reply_keeps_mp3_default(self):
        """Non-Telegram platforms should keep the current MP3 default."""
        runner = _make_runner()
        adapter = _make_adapter(Platform.SLACK)
        runner.adapters[Platform.SLACK] = adapter
        event = _make_event(Platform.SLACK)
        requested_paths = []
        def fake_tts(*, text, output_path):
            requested_paths.append(output_path)
            assert output_path.endswith(".mp3")
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            Path(output_path).write_bytes(b"fake mp3")
            return json.dumps({
                "success": True,
                "file_path": output_path,
                "provider": "gemini",
                "voice_compatible": False,
            })
        with patch("tools.tts_tool.text_to_speech_tool", side_effect=fake_tts):
            await runner._send_voice_reply(event, "hello from auto tts")
        assert requested_paths
        assert requested_paths[0].endswith(".mp3")
        adapter.send_voice.assert_awaited_once()
        assert adapter.send_voice.await_args.kwargs["audio_path"].endswith(".mp3")
 def _make_runner() -> GatewayRunner:
    """Return a bare ``GatewayRunner`` carrying only the attributes set here.

    The instance is created with ``__new__`` so ``__init__`` is not run;
    ``_voice_mode`` and ``adapters`` both start as empty dicts.
    """
    # NOTE(review): __new__ does not call __init__, so this patch may be
    # redundant; it is kept to mirror the existing setup.
    voice_modes_patch = patch(
        "gateway.run.GatewayRunner._load_voice_modes",
        return_value={},
    )
    with voice_modes_patch:
        bare_runner = GatewayRunner.__new__(GatewayRunner)
        bare_runner._voice_mode = {}
        bare_runner.adapters = {}
    return bare_runner
 def _make_adapter(platform: Platform) -> MagicMock:
    """Return a mock adapter for *platform* whose ``send_voice`` is awaitable."""
    # Keyword arguments to MagicMock are set as attributes on the new mock.
    return MagicMock(platform=platform, send_voice=AsyncMock())
 def _make_event(platform: Platform) -> MessageEvent:
    """Return a fixed ``"trigger"`` message event originating from *platform*."""
    origin = SessionSource(
        platform=platform,
        chat_id="123",
        user_id="u1",
        user_name="User",
    )
    return MessageEvent(text="trigger", source=origin, message_id="456")