fix(gateway): route Telegram audio file attachments away from STT pipeline (#24870)

Telegram distinguishes three kinds of audio payloads:
- message.voice → Opus/OGG voice messages → STT pipeline ✓
- message.audio → audio file attachments → must bypass STT ← was broken
- message.document (audio mime) → generic file route

**Root cause** — the inbound message routing block in gateway/run.py matched both MessageType.VOICE *and* MessageType.AUDIO into audio_paths, which were then fed unconditionally to _enrich_message_with_transcription. Audio file attachments (.mp3, .m4a, etc.) were therefore auto-transcribed instead of being treated as files, making the transcribe skill unusable from Telegram because the path it needed was never surfaced.

**Fix**
- Introduce a new audio_file_paths list populated exclusively by MessageType.AUDIO events.
- Narrow the audio_paths selector to MessageType.VOICE (and bare audio/* MIME-type events that are not explicitly AUDIO or DOCUMENT).
- After the STT block, inject a document-style context note for each audio_file_path, giving the agent the file path and asking what to do with it (consistent with how plain documents are handled).

**Tests** — 5 new tests in test_telegram_audio_vs_voice.py:
- voice message still transcribed (regression guard)
- audio attachment skips STT (core fix)
- audio attachment context note format
- STT disabled still produces file note (not STT-disabled notice)
- MessageType.AUDIO != MessageType.VOICE sanity check

Fixes #24870
2026-07-17 14:42:06 +00:00 · 2026-05-13 03:32:55 -04:00 · 2026-05-13 03:32:55 -04:00 · b93996c35e
commit b93996c35e
parent 21a15b6711
2 changed files with 208 additions and 1 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -7257,11 +7257,19 @@ class GatewayRunner:
        if event.media_urls:
            image_paths = []
            audio_paths = []
+            audio_file_paths: list[str] = []  # audio file attachments — not for STT
            for i, path in enumerate(event.media_urls):
                mtype = event.media_types[i] if i < len(event.media_types) else ""
                if mtype.startswith("image/") or event.message_type == MessageType.PHOTO:
                    image_paths.append(path)
-                if mtype.startswith("audio/") or event.message_type in {MessageType.VOICE, MessageType.AUDIO}:
+                # MessageType.AUDIO = audio file attachment (e.g. .mp3, .m4a) — never STT
+                # MessageType.VOICE = voice message (Opus/OGG) — always STT
+                if event.message_type == MessageType.AUDIO:
+                    audio_file_paths.append(path)
+                elif event.message_type == MessageType.VOICE or (
+                    mtype.startswith("audio/")
+                    and event.message_type not in {MessageType.AUDIO, MessageType.DOCUMENT}
+                ):
                    audio_paths.append(path)

            if image_paths:
@ -7323,6 +7331,21 @@ class GatewayRunner:
                        except Exception:
                            pass

+        # audio_file_paths is only bound inside the `if event.media_urls:` block above.
+        if event.media_urls and audio_file_paths:
+            from tools.credential_files import to_agent_visible_cache_path as _to_agent_path
+            for _apath in audio_file_paths:
+                _basename = os.path.basename(_apath)
+                _parts = _basename.split("_", 2)  # presumably "<prefix>_<id>_<name>" cache naming — confirm
+                _display = _parts[2] if len(_parts) >= 3 else _basename
+                _display = re.sub(r'[^\w.\- ]', '_', _display)
+                _note = (
+                    f"[The user sent an audio file attachment: '{_display}'. "
+                    f"It is saved at: {_to_agent_path(_apath)}. "
+                    f"Ask the user what they'd like you to do with it, or pass the path to a transcription or media tool.]"
+                )
+                message_text = f"{_note}\n\n{message_text}"
+
        if event.media_urls and event.message_type == MessageType.DOCUMENT:
            import mimetypes as _mimetypes
            from tools.credential_files import to_agent_visible_cache_path
--- a/tests/gateway/test_telegram_audio_vs_voice.py
+++ b/tests/gateway/test_telegram_audio_vs_voice.py
@ -0,0 +1,184 @@
+"""
+Tests for #24870 — Telegram: audio file attachments must NOT be routed to STT.
+
+Telegram distinguishes three kinds of audio payloads:
+  - message.voice  → Opus/OGG voice message  → STT pipeline
+  - message.audio  → audio file attachment   → file path note, NOT STT
+  - message.document (audio mime) → generic file route
+
+These tests confirm that:
+  1. MessageType.VOICE events still flow through the STT pipeline.
+  2. MessageType.AUDIO events bypass STT and get a file-path context note instead.
+  3. With STT disabled, AUDIO events still get the file-path note, not an STT-disabled notice.
+"""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from gateway.config import GatewayConfig, Platform
+from gateway.platforms.base import MessageEvent, MessageType
+from gateway.session import SessionSource
+
+
+def _make_runner(stt_enabled: bool = True) -> "GatewayRunner":  # type: ignore[name-defined]
+    """Return a GatewayRunner built via ``__new__`` (no ``__init__``) with stub attributes."""
+    from gateway.run import GatewayRunner
+
+    stub_attrs = {"adapters": {}, "_model": "test-model", "_base_url": "", "_has_setup_skill": lambda: False}
+    bare_runner = GatewayRunner.__new__(GatewayRunner)
+    bare_runner.config = GatewayConfig(stt_enabled=stt_enabled)
+    for attr_name, attr_value in stub_attrs.items():
+        setattr(bare_runner, attr_name, attr_value)
+    return bare_runner
+
+
+def _voice_event(path: str = "/tmp/voice.ogg") -> MessageEvent:
+    """Build a Telegram DM event of type VOICE whose only media item is *path* (audio/ogg)."""
+    dm_source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
+    event_fields = {
+        "text": "", "message_type": MessageType.VOICE, "source": dm_source,
+        "media_urls": [path], "media_types": ["audio/ogg"],
+    }
+    return MessageEvent(**event_fields)
+
+
+def _audio_event(path: str = "/tmp/song.mp3") -> MessageEvent:
+    """Build a Telegram DM event of type AUDIO whose only media item is *path* (audio/mpeg)."""
+    dm_source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
+    event_fields = {
+        "text": "", "message_type": MessageType.AUDIO, "source": dm_source,
+        "media_urls": [path], "media_types": ["audio/mpeg"],
+    }
+    return MessageEvent(**event_fields)
+
+
+# ---------------------------------------------------------------------------
+# 1. VOICE still goes through STT
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_voice_message_still_transcribed():
+    """Regression guard: a VOICE event is transcribed and the transcript lands in the text."""
+    voice_path = "/tmp/voice.ogg"
+    gateway = _make_runner(stt_enabled=True)
+    dm_source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
+    voice_event = _voice_event(voice_path)
+    stt_reply = {"success": True, "transcript": "hello world", "provider": "whisper"}
+
+    # Stub the STT backend so the test never touches a real transcription provider.
+    with patch("tools.transcription_tools.transcribe_audio", return_value=stt_reply) as stt_mock:
+        prepared = await gateway._prepare_inbound_message_text(
+            event=voice_event,
+            source=dm_source,
+            history=[],
+        )
+
+    stt_mock.assert_called_once_with(voice_path)
+    assert "hello world" in prepared
+    assert "voice message" in prepared.lower()
+
+
+# ---------------------------------------------------------------------------
+# 2. AUDIO file attachment bypasses STT
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_audio_attachment_skips_stt():
+    """MessageType.AUDIO must NOT be routed to STT — transcribe_audio must not be called."""
+    runner = _make_runner(stt_enabled=True)
+    source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
+    event = _audio_event("/tmp/song.mp3")
+
+    # The raising side_effect alone is a weak guard: an `except Exception` around the
+    # STT call would swallow the AssertionError, so the call count is asserted as well.
+    with patch(
+        "tools.transcription_tools.transcribe_audio",
+        side_effect=AssertionError("transcribe_audio must NOT be called for audio file attachments"),
+    ) as mock_transcribe, patch(
+        "tools.credential_files.to_agent_visible_cache_path",
+        side_effect=lambda p: p,
+    ):
+        result = await runner._prepare_inbound_message_text(
+            event=event, source=source, history=[],
+        )
+
+    mock_transcribe.assert_not_called()
+    assert result is not None
+    assert "/tmp/song.mp3" in result
+    assert "audio file attachment" in result.lower()
+
+
+@pytest.mark.asyncio
+async def test_audio_attachment_context_note_format():
+    """The context note must quote the display name (cache prefix stripped), not voice wording."""
+    runner = _make_runner(stt_enabled=True)
+    source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
+    event = _audio_event("/tmp/cache_12345_my_song.mp3")
+
+    # assert_not_called backs up the raising side_effect, which an `except Exception`
+    # around the STT call would otherwise swallow.
+    with patch(
+        "tools.transcription_tools.transcribe_audio",
+        side_effect=AssertionError("must not be called"),
+    ) as mock_transcribe, patch(
+        "tools.credential_files.to_agent_visible_cache_path",
+        side_effect=lambda p: p,
+    ):
+        result = await runner._prepare_inbound_message_text(
+            event=event, source=source, history=[],
+        )
+
+    mock_transcribe.assert_not_called()
+    # Match the quoted display name: the bare name is also a substring of the full path.
+    assert "'my_song.mp3'" in result
+    assert "audio file attachment" in result.lower()
+    assert "voice message" not in result.lower()
+
+
+# ---------------------------------------------------------------------------
+# 3. STT disabled still results in no transcription for audio file attachments
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_audio_attachment_skips_stt_when_stt_disabled():
+    """Even with STT disabled, AUDIO must NOT produce STT disabled notice — just a file note."""
+    runner = _make_runner(stt_enabled=False)
+    source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
+    event = _audio_event("/tmp/podcast.m4a")
+
+    # assert_not_called backs up the raising side_effect, which an `except Exception`
+    # around the STT call would otherwise swallow.
+    with patch(
+        "tools.transcription_tools.transcribe_audio",
+        side_effect=AssertionError("must not be called"),
+    ) as mock_transcribe, patch(
+        "tools.credential_files.to_agent_visible_cache_path",
+        side_effect=lambda p: p,
+    ):
+        result = await runner._prepare_inbound_message_text(
+            event=event, source=source, history=[],
+        )
+
+    mock_transcribe.assert_not_called()
+    # Should NOT see the "transcription is disabled" note — that's only for VOICE
+    assert "transcription is disabled" not in result.lower()
+    assert "audio file attachment" in result.lower()
+    assert "/tmp/podcast.m4a" in result
+
+
+# ---------------------------------------------------------------------------
+# 4. Telegram gateway: msg.audio → MessageType.AUDIO (not VOICE)
+# ---------------------------------------------------------------------------
+
+def test_telegram_media_type_detection_audio_vs_voice():
+    """AUDIO and VOICE must be distinct MessageType members with the expected values.
+
+    Only the enum constants that the gateway routing branches on are checked here;
+    the Telegram adapter's own msg.audio / msg.voice detection is not exercised.
+    """
+    # MessageType comes from the module-level import.
+    assert MessageType.AUDIO.value == "audio"
+    assert MessageType.VOICE.value == "voice"
+    # Sanity: they are distinct
+    assert MessageType.AUDIO != MessageType.VOICE