fix(tts): keep native audio outside Telegram voice delivery

This commit is contained in:
aqilaziz 2026-05-15 21:43:20 +07:00 committed by Teknium
parent e19f4c1730
commit ed9087fce7
2 changed files with 80 additions and 4 deletions

View file

@ -0,0 +1,70 @@
import json
from pathlib import Path
from unittest.mock import Mock
import pytest
from gateway.session_context import _UNSET, _VAR_MAP
from tools import tts_tool
def _reset_session_context() -> None:
    """Set every context variable registered in ``_VAR_MAP`` back to ``_UNSET``."""
    for var in _VAR_MAP.values():
        var.set(_UNSET)
@pytest.fixture(autouse=True)
def _clean_session_platform(monkeypatch):
    """Give every test a blank session context and no platform env var.

    Setup resets the session context variables and removes
    ``HERMES_SESSION_PLATFORM`` from the environment (ignored when it is
    not set). Teardown resets the context variables again so values set
    during a test do not carry over into the next one.
    """
    _reset_session_context()
    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)
    yield
    _reset_session_context()
async def _write_edge_output(_text: str, output_path: str, _tts_config: dict) -> str:
    """Stand-in for ``tts_tool._generate_edge_tts`` used by the tests below.

    Writes placeholder bytes to ``output_path`` instead of synthesising
    speech, so the tool under test finds a file on disk.

    Args:
        _text: Text to synthesise; ignored.
        output_path: Destination file path; created or overwritten.
        _tts_config: TTS configuration; ignored.

    Returns:
        ``output_path`` unchanged.
    """
    Path(output_path).write_bytes(b"mp3")
    return output_path
def test_edge_cli_preserves_native_mp3(tmp_path, monkeypatch):
    """Edge TTS output stays as native MP3 when no session platform is set.

    The Opus converter is replaced with a mock so the test can verify it
    is never invoked, and the result must point at the originally
    requested path without being flagged as voice-compatible.
    """
    out = tmp_path / "speech.mp3"
    convert = Mock()

    # Stub out config loading, the edge-tts import, generation and conversion.
    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
    monkeypatch.setattr(tts_tool, "_convert_to_opus", convert)

    result = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(out)))

    assert result["success"] is True
    assert result["file_path"] == str(out)
    assert result["voice_compatible"] is False
    assert result["media_tag"] == f"MEDIA:{out}"
    convert.assert_not_called()
def test_edge_telegram_converts_to_opus_voice(tmp_path, monkeypatch):
    """Edge TTS output is converted to Opus when the platform is Telegram.

    With ``HERMES_SESSION_PLATFORM`` set to ``telegram`` the converter
    must be called exactly once with the MP3 path, and the result must
    reference the converted ``.ogg`` file as a voice-compatible message.
    """
    out = tmp_path / "speech.mp3"
    opus = tmp_path / "speech.ogg"

    def fake_convert(path: str) -> str:
        # Check the converter receives the generated MP3, then fake the output file.
        assert path == str(out)
        opus.write_bytes(b"ogg")
        return str(opus)

    convert = Mock(side_effect=fake_convert)

    monkeypatch.setenv("HERMES_SESSION_PLATFORM", "telegram")
    monkeypatch.setattr(tts_tool, "_load_tts_config", lambda: {"provider": "edge"})
    monkeypatch.setattr(tts_tool, "_import_edge_tts", lambda: object())
    monkeypatch.setattr(tts_tool, "_generate_edge_tts", _write_edge_output)
    monkeypatch.setattr(tts_tool, "_convert_to_opus", convert)

    result = json.loads(tts_tool.text_to_speech_tool("hello", output_path=str(out)))

    assert result["success"] is True
    assert result["file_path"] == str(opus)
    assert result["voice_compatible"] is True
    assert result["media_tag"] == f"[[audio_as_voice]]\nMEDIA:{opus}"
    convert.assert_called_once_with(str(out))

View file

@ -1830,8 +1830,10 @@ def text_to_speech_tool(
"error": f"TTS generation produced no output (provider: {provider})"
}, ensure_ascii=False)
# Try Opus conversion for Telegram compatibility
# Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV — all need ffmpeg conversion
# Try Opus conversion for Telegram compatibility.
# Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV. Keep those native
# formats for local/CLI playback and only convert when the current
# platform actually needs Opus voice delivery.
voice_compatible = False
if command_provider_config is not None:
# Command providers are documents by default. Voice-bubble
@ -1843,13 +1845,17 @@ def text_to_speech_tool(
if opus_path:
file_str = opus_path
voice_compatible = file_str.endswith(".ogg")
elif provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"} and not file_str.endswith(".ogg"):
elif (
want_opus
and provider in {"edge", "neutts", "minimax", "xai", "kittentts", "piper"}
and not file_str.endswith(".ogg")
):
opus_path = _convert_to_opus(file_str)
if opus_path:
file_str = opus_path
voice_compatible = True
elif provider in {"elevenlabs", "openai", "mistral", "gemini"}:
voice_compatible = file_str.endswith(".ogg")
voice_compatible = want_opus and file_str.endswith(".ogg")
file_size = os.path.getsize(file_str)
logger.info("TTS audio saved: %s (%s bytes, provider: %s)", file_str, f"{file_size:,}", provider)