fix(tts): keep native audio outside Telegram voice delivery

2026-07-18 14:52:04 +00:00 · 2026-05-15 21:43:20 +07:00 · 2026-05-15 21:43:20 +07:00 · ed9087fce7
commit ed9087fce7
parent e19f4c1730
2 changed files with 80 additions and 4 deletions
--- a/tests/tools/test_tts_opus_routing.py
+++ b/tests/tools/test_tts_opus_routing.py
@ -0,0 +1,70 @@
+import json
+from pathlib import Path
+from unittest.mock import Mock
+
+import pytest
+
+from gateway.session_context import _UNSET, _VAR_MAP
+from tools import tts_tool
+
+
+def _reset_session_context() -> None:
+    for var in _VAR_MAP.values():
+        var.set(_UNSET)  # put every entry of _VAR_MAP back to _UNSET
+
+
+@pytest.fixture(autouse=True)
+def _clean_session_platform(monkeypatch: pytest.MonkeyPatch):
+    _reset_session_context()  # start each test from reset session variables
+    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)  # tests set a platform explicitly when they need one
+    yield
+    _reset_session_context()  # reset again after the test body has run
+
+
+async def _write_edge_output(_text: str, output_path: str, _tts_config: dict) -> str:
+    Path(output_path).write_bytes(b"mp3")  # stand-in payload, not real MP3 audio
+    return output_path
+
+
+def test_edge_cli_preserves_native_mp3(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    out = tmp_path / "speech.mp3"
+    convert = Mock()  # spy installed in place of tts_tool._convert_to_opus
+
+    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
+    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
+    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
+    monkeypatch.setattr(tts_tool, "_convert_to_opus", convert)
+
+    result = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(out)))
+
+    assert result["success"] is True
+    assert result["file_path"] == str(out)  # the originally requested MP3 path is returned
+    assert result["voice_compatible"] is False
+    assert result["media_tag"] == f"MEDIA:{out}"
+    convert.assert_not_called()  # no session platform is set, so no Opus conversion is expected
+
+
+def test_edge_telegram_converts_to_opus_voice(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
+    out = tmp_path / "speech.mp3"
+    opus = tmp_path / "speech.ogg"
+
+    def fake_convert(path: str) -> str:
+        assert path == str(out)  # the converter must receive the generated MP3 path
+        opus.write_bytes(b"ogg")  # stand-in payload, not real Opus audio
+        return str(opus)
+
+    convert = Mock(side_effect=fake_convert)
+
+    monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram")  # run the tool as a Telegram session
+    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
+    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
+    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
+    monkeypatch.setattr(tts_tool, "_convert_to_opus", convert)
+
+    result = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(out)))
+
+    assert result["success"] is True
+    assert result["file_path"] == str(opus)  # the converted file replaces the MP3 in the result
+    assert result["voice_compatible"] is True
+    assert result["media_tag"] == f"[[audio_as_voice]]\nMEDIA:{opus}"
+    convert.assert_called_once_with(str(out))
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -1830,8 +1830,10 @@ def text_to_speech_tool(
                "error": f"TTS generation produced no output (provider: {provider})"
            }, ensure_ascii=False)

-        # Try Opus conversion for Telegram compatibility
-        # Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV — all need ffmpeg conversion
+        # Opus conversion is only needed for voice delivery (e.g. Telegram).
+        # Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV. Keep those native
+        # formats for local/CLI playback and only convert when the current
+        # platform actually needs Opus voice delivery.
        voice_compatible = False
        if command_provider_config is not None:
            # Command providers are documents by default. Voice-bubble
@ -1843,13 +1845,17 @@ def text_to_speech_tool(
                    if opus_path:
                        file_str = opus_path
                voice_compatible = file_str.endswith(".ogg")
-        elif provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"} and not file_str.endswith(".ogg"):
+        elif (
+            want_opus
+            and provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"}
+            and not file_str.endswith(".ogg")
+        ):
            opus_path = _convert_to_opus(file_str)
            if opus_path:
                file_str = opus_path
                voice_compatible = True
        elif provider in {"elevenlabs", "openai", "mistral", "gemini"}:
-            voice_compatible = file_str.endswith(".ogg")
+            voice_compatible = want_opus and file_str.endswith(".ogg")

        file_size = os.path.getsize(file_str)
        logger.info("TTS audio saved: %s (%s bytes, provider: %s)", file_str, f"{file_size:,}", provider)