hermes-agent/tests/gateway/test_telegram_audio_vs_voice.py
Teknium 13650ab7f8 fix(gateway): audio attachment note no longer steers the agent into punting
Sibling site of the PDF/DOCX note fixed in PR #44175: the audio file
attachment context note led with "Ask the user what they'd like you to
do with it", steering the model into asking instead of transcribing.
Rewritten to instruct the agent to transcribe/process the file itself
when the request involves its content, only asking when intent is
genuinely unclear. Contract assertion added to the existing audio
attachment note test.
2026-06-11 11:58:19 -07:00

188 lines
7.1 KiB
Python

"""
Tests for #24870 — Telegram: audio file attachments must NOT be routed to STT.
Telegram distinguishes three kinds of audio payloads:
- message.voice → Opus/OGG voice message → STT pipeline
- message.audio → audio file attachment → file path note, NOT STT
- message.document (audio mime) → generic file route
These tests confirm that:
1. MessageType.VOICE events still flow through the STT pipeline.
2. MessageType.AUDIO events bypass STT and get a file-path context note instead.
3. Mixed media lists (voice + audio) split correctly.
"""
from unittest.mock import patch
import pytest
from gateway.config import GatewayConfig, Platform
from gateway.platforms.base import MessageEvent, MessageType
from gateway.session import SessionSource
def _make_runner(stt_enabled: bool = True) -> "GatewayRunner": # type: ignore[name-defined]
from gateway.run import GatewayRunner
runner = GatewayRunner.__new__(GatewayRunner)
runner.config = GatewayConfig(stt_enabled=stt_enabled)
runner.adapters = {}
runner._model = "test-model"
runner._base_url = ""
runner._has_setup_skill = lambda: False
return runner
def _voice_event(path: str = "/tmp/voice.ogg") -> MessageEvent:
return MessageEvent(
text="",
message_type=MessageType.VOICE,
source=SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm"),
media_urls=[path],
media_types=["audio/ogg"],
)
def _audio_event(path: str = "/tmp/song.mp3") -> MessageEvent:
return MessageEvent(
text="",
message_type=MessageType.AUDIO,
source=SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm"),
media_urls=[path],
media_types=["audio/mpeg"],
)
# ---------------------------------------------------------------------------
# 1. VOICE still goes through STT
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_voice_message_still_transcribed():
"""MessageType.VOICE must still be sent through _enrich_message_with_transcription."""
runner = _make_runner(stt_enabled=True)
source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
event = _voice_event("/tmp/voice.ogg")
with patch(
"tools.transcription_tools.transcribe_audio",
return_value={"success": True, "transcript": "hello world", "provider": "whisper"},
) as mock_transcribe:
result = await runner._prepare_inbound_message_text(
event=event,
source=source,
history=[],
)
mock_transcribe.assert_called_once_with("/tmp/voice.ogg")
assert "hello world" in result
assert "voice message" in result.lower()
# ---------------------------------------------------------------------------
# 2. AUDIO file attachment bypasses STT
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_audio_attachment_skips_stt():
"""MessageType.AUDIO must NOT be routed to STT — transcribe_audio must not be called."""
runner = _make_runner(stt_enabled=True)
source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
event = _audio_event("/tmp/song.mp3")
with patch(
"tools.transcription_tools.transcribe_audio",
side_effect=AssertionError("transcribe_audio must NOT be called for audio file attachments"),
):
with patch(
"tools.credential_files.to_agent_visible_cache_path",
side_effect=lambda p: p,
):
result = await runner._prepare_inbound_message_text(
event=event,
source=source,
history=[],
)
assert result is not None
assert "/tmp/song.mp3" in result
assert "audio file attachment" in result.lower()
@pytest.mark.asyncio
async def test_audio_attachment_context_note_format():
"""Context note for audio file attachments should include the file path and guidance."""
runner = _make_runner(stt_enabled=True)
source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
event = _audio_event("/tmp/cache_12345_my_song.mp3")
with patch(
"tools.transcription_tools.transcribe_audio",
side_effect=AssertionError("must not be called"),
):
with patch(
"tools.credential_files.to_agent_visible_cache_path",
side_effect=lambda p: p,
):
result = await runner._prepare_inbound_message_text(
event=event,
source=source,
history=[],
)
assert "my_song.mp3" in result
assert "audio file attachment" in result.lower()
# Should NOT contain the voice-message transcription wrapper text
assert "voice message" not in result.lower()
# Guides the agent to transcribe/process the file itself rather than
# punting back to the user (same bug class as the PDF/DOCX note).
assert "transcri" in result.lower()
assert "ask the user what they'd like" not in result.lower()
# ---------------------------------------------------------------------------
# 3. STT disabled still results in no transcription for audio file attachments
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_audio_attachment_skips_stt_when_stt_disabled():
"""Even with STT disabled, AUDIO must NOT produce STT disabled notice — just a file note."""
runner = _make_runner(stt_enabled=False)
source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
event = _audio_event("/tmp/podcast.m4a")
with patch(
"tools.transcription_tools.transcribe_audio",
side_effect=AssertionError("must not be called"),
):
with patch(
"tools.credential_files.to_agent_visible_cache_path",
side_effect=lambda p: p,
):
result = await runner._prepare_inbound_message_text(
event=event,
source=source,
history=[],
)
# Should NOT see the "transcription is disabled" note — that's only for VOICE
assert "transcription is disabled" not in result.lower()
assert "audio file attachment" in result.lower()
assert "/tmp/podcast.m4a" in result
# ---------------------------------------------------------------------------
# 4. Telegram gateway: msg.audio → MessageType.AUDIO (not VOICE)
# ---------------------------------------------------------------------------
def test_telegram_media_type_detection_audio_vs_voice():
"""The Telegram platform must set MessageType.AUDIO for msg.audio, VOICE for msg.voice."""
from gateway.platforms.base import MessageType
# The Telegram adapter's _build_media_type already returns correct values
# via MessageType.AUDIO for .audio and MessageType.VOICE for .voice.
# Check the constants match expected semantic roles.
assert MessageType.AUDIO.value == "audio"
assert MessageType.VOICE.value == "voice"
# Sanity: they are distinct
assert MessageType.AUDIO != MessageType.VOICE