From ed9087fce77f02b90688ade94f1f6c0658db1ac5 Mon Sep 17 00:00:00 2001
From: aqilaziz
Date: Fri, 15 May 2026 21:43:20 +0700
Subject: [PATCH] fix(tts): keep native audio outside Telegram voice delivery

---
 tests/tools/test_tts_opus_routing.py | 70 ++++++++++++++++++++++++++++
 tools/tts_tool.py                    | 14 ++++--
 2 files changed, 80 insertions(+), 4 deletions(-)
 create mode 100644 tests/tools/test_tts_opus_routing.py

diff --git a/tests/tools/test_tts_opus_routing.py b/tests/tools/test_tts_opus_routing.py
new file mode 100644
index 00000000000..0073146c304
--- /dev/null
+++ b/tests/tools/test_tts_opus_routing.py
@@ -0,0 +1,70 @@
+import json
+from pathlib import Path
+from unittest.mock import Mock
+
+import pytest
+
+from gateway.session_context import _UNSET, _VAR_MAP
+from tools import tts_tool
+
+
+def _reset_session_context() -> None:
+    for var in _VAR_MAP.values():
+        var.set(_UNSET)
+
+
+@pytest.fixture(autouse=True)
+def _clean_session_platform(monkeypatch):
+    _reset_session_context()
+    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)
+    yield
+    _reset_session_context()
+
+
+async def _write_edge_output(_text: str, output_path: str, _tts_config: dict) -> str:
+    Path(output_path).write_bytes(b"mp3")
+    return output_path
+
+
+def test_edge_cli_preserves_native_mp3(tmp_path, monkeypatch):
+    out = tmp_path / "speech.mp3"
+    convert = Mock()
+
+    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
+    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
+    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
+    monkeypatch.setattr(tts_tool, "_convert_to_opus", convert)
+
+    result = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(out)))
+
+    assert result["success"] is True
+    assert result["file_path"] == str(out)
+    assert result["voice_compatible"] is False
+    assert result["media_tag"] == f"MEDIA:{out}"
+    convert.assert_not_called()
+
+
+def test_edge_telegram_converts_to_opus_voice(tmp_path, monkeypatch):
+    out = tmp_path / "speech.mp3"
+    opus = tmp_path / "speech.ogg"
+
+    def fake_convert(path: str) -> str:
+        assert path == str(out)
+        opus.write_bytes(b"ogg")
+        return str(opus)
+
+    convert = Mock(side_effect=fake_convert)
+
+    monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram")
+    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
+    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
+    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
+    monkeypatch.setattr(tts_tool, "_convert_to_opus", convert)
+
+    result = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(out)))
+
+    assert result["success"] is True
+    assert result["file_path"] == str(opus)
+    assert result["voice_compatible"] is True
+    assert result["media_tag"] == f"[[audio_as_voice]]\nMEDIA:{opus}"
+    convert.assert_called_once_with(str(out))
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index 9e46fa6a7ef..469cb6608d4 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -1830,8 +1830,10 @@ def text_to_speech_tool(
                 "error": f"TTS generation produced no output (provider: {provider})"
             }, ensure_ascii=False)
 
-        # Try Opus conversion for Telegram compatibility
-        # Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV — all need ffmpeg conversion
+        # Try Opus conversion for Telegram compatibility.
+        # Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV. Keep those native
+        # formats for local/CLI playback and only convert when the current
+        # platform actually needs Opus voice delivery.
         voice_compatible = False
         if command_provider_config is not None:
             # Command providers are documents by default. Voice-bubble
@@ -1843,13 +1845,17 @@ def text_to_speech_tool(
             if opus_path:
                 file_str = opus_path
                 voice_compatible = file_str.endswith(".ogg")
-        elif provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"} and not file_str.endswith(".ogg"):
+        elif (
+            want_opus
+            and provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"}
+            and not file_str.endswith(".ogg")
+        ):
             opus_path = _convert_to_opus(file_str)
             if opus_path:
                 file_str = opus_path
                 voice_compatible = True
         elif provider in {"elevenlabs", "openai", "mistral", "gemini"}:
-            voice_compatible = file_str.endswith(".ogg")
+            voice_compatible = want_opus and file_str.endswith(".ogg")
 
         file_size = os.path.getsize(file_str)
         logger.info("TTS audio saved: %s (%s bytes, provider: %s)", file_str, f"{file_size:,}", provider)