From ed9087fce77f02b90688ade94f1f6c0658db1ac5 Mon Sep 17 00:00:00 2001
From: aqilaziz <gonzes7@gmail.com>
Date: Fri, 15 May 2026 21:43:20 +0700
Subject: [PATCH] fix(tts): keep native audio outside Telegram voice delivery

---
 tests/tools/test_tts_opus_routing.py | 70 ++++++++++++++++++++++++++++
 tools/tts_tool.py                    | 14 ++++--
 2 files changed, 80 insertions(+), 4 deletions(-)
 create mode 100644 tests/tools/test_tts_opus_routing.py

diff --git a/tests/tools/test_tts_opus_routing.py b/tests/tools/test_tts_opus_routing.py
new file mode 100644
index 00000000000..0073146c304
--- /dev/null
+++ b/tests/tools/test_tts_opus_routing.py
@@ -0,0 +1,70 @@
+import json
+from pathlib import Path
+from unittest.mock import Mock
+
+import pytest
+
+from gateway.session_context import _UNSET, _VAR_MAP
+from tools import tts_tool
+
+
+def _reset_session_context() -> None:
+    for var in _VAR_MAP.values():  # every variable registered in _VAR_MAP
+        var.set(_UNSET)  # back to the _UNSET sentinel so tests share no state
+
+
+@pytest.fixture(autouse=True)
+def _clean_session_platform(monkeypatch):
+    _reset_session_context()  # start every test with unset session variables
+    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)  # drop any inherited value
+    yield
+    _reset_session_context()  # undo anything the test itself set
+
+
+async def _write_edge_output(_text: str, output_path: str, _tts_config: dict) -> str:
+    Path(output_path).write_bytes(b"mp3")  # stand-in for synthesis: placeholder bytes
+    return output_path  # the same path that was passed in
+
+
+def test_edge_cli_preserves_native_mp3(tmp_path, monkeypatch):
+    """Without a session platform, Edge MP3 output is returned unconverted."""
+    mp3_out = tmp_path / "speech.mp3"
+    opus_converter = Mock()
+    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
+    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
+    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
+    monkeypatch.setattr(tts_tool, "_convert_to_opus", opus_converter)
+
+    reply = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(mp3_out)))
+
+    assert reply["success"] is True
+    assert reply["voice_compatible"] is False
+    assert reply["file_path"] == str(mp3_out)
+    assert reply["media_tag"] == f"MEDIA:{mp3_out}"
+    opus_converter.assert_not_called()
+
+
+def test_edge_telegram_converts_to_opus_voice(tmp_path, monkeypatch):
+    """On Telegram, Edge MP3 output is converted to Opus and tagged as voice."""
+    mp3_out = tmp_path / "speech.mp3"
+    ogg_out = tmp_path / "speech.ogg"
+
+    def fake_convert(src_path: str) -> str:
+        assert src_path == str(mp3_out)
+        ogg_out.write_bytes(b"ogg")
+        return str(ogg_out)
+
+    opus_converter = Mock(side_effect=fake_convert)
+    monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram")
+    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
+    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
+    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
+    monkeypatch.setattr(tts_tool, "_convert_to_opus", opus_converter)
+
+    reply = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(mp3_out)))
+
+    assert reply["success"] is True
+    assert reply["voice_compatible"] is True
+    assert reply["file_path"] == str(ogg_out)
+    assert reply["media_tag"] == f"[[audio_as_voice]]\nMEDIA:{ogg_out}"
+    opus_converter.assert_called_once_with(str(mp3_out))
diff --git a/tools/tts_tool.py b/tools/tts_tool.py
index 9e46fa6a7ef..469cb6608d4 100644
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@@ -1830,8 +1830,10 @@ def text_to_speech_tool(
                 "error": f"TTS generation produced no output (provider: {provider})"
             }, ensure_ascii=False)
 
-        # Try Opus conversion for Telegram compatibility
-        # Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV — all need ffmpeg conversion
+        # Try Opus conversion for Telegram compatibility.
+        # Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV. Keep those native
+        # formats for local/CLI playback and only convert when the current
+        # platform actually needs Opus voice delivery.
         voice_compatible = False
         if command_provider_config is not None:
             # Command providers are documents by default. Voice-bubble
@@ -1843,13 +1845,17 @@ def text_to_speech_tool(
                     if opus_path:
                         file_str = opus_path
                 voice_compatible = file_str.endswith(".ogg")
-        elif provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"} and not file_str.endswith(".ogg"):
+        elif (
+            want_opus
+            and provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"}
+            and not file_str.endswith(".ogg")
+        ):
             opus_path = _convert_to_opus(file_str)
             if opus_path:
                 file_str = opus_path
                 voice_compatible = True
         elif provider in {"elevenlabs", "openai", "mistral", "gemini"}:
-            voice_compatible = file_str.endswith(".ogg")
+            voice_compatible = want_opus and file_str.endswith(".ogg")
 
         file_size = os.path.getsize(file_str)
         logger.info("TTS audio saved: %s (%s bytes, provider: %s)", file_str, f"{file_size:,}", provider)