fix(discord): transcribe native voice notes

2026-07-13 14:02:16 +00:00 · 2026-05-19 14:28:00 -06:00 · 2026-05-19 14:28:00 -06:00 · 448a3f9ea2
commit 448a3f9ea2
parent d35f8932e8
2 changed files with 111 additions and 1 deletions
--- a/gateway/platforms/discord.py
+++ b/gateway/platforms/discord.py
@ -3602,6 +3602,24 @@ class DiscordAdapter(BasePlatformAdapter):
            return 32 * 1024 * 1024
        return max(0, value)

+    @staticmethod
+    def _is_discord_voice_message_attachment(att: Any) -> bool:
+        """Tell whether ``att`` looks like a native Discord voice note.
+
+        Uses ``is_voice_message`` if set, else needs ``duration`` and ``waveform``.
+        """
+        flag = getattr(att, "is_voice_message", None)
+        if flag is None:
+            has_duration = getattr(att, "duration", None) is not None
+            return has_duration and getattr(att, "waveform", None) is not None
+        if not callable(flag):
+            return bool(flag)
+        try:
+            return bool(flag())
+        except Exception as exc:
+            logger.debug("[Discord] is_voice_message() failed for attachment: %s", exc)
+            return False
+
    def _discord_free_response_channels(self) -> set:
        """Return Discord channel IDs where no bot mention is required.

@ -4542,7 +4560,10 @@ class DiscordAdapter(BasePlatformAdapter):
                    elif att.content_type.startswith("video/"):
                        msg_type = MessageType.VIDEO
                    elif att.content_type.startswith("audio/"):
-                        msg_type = MessageType.AUDIO
+                        if self._is_discord_voice_message_attachment(att):
+                            msg_type = MessageType.VOICE
+                        else:
+                            msg_type = MessageType.AUDIO
                    else:
                        doc_ext = ""
                        if att.filename:
--- a/tests/gateway/test_discord_attachment_download.py
+++ b/tests/gateway/test_discord_attachment_download.py
@ -59,6 +59,7 @@ def _ensure_discord_mock():
 _ensure_discord_mock()

 from gateway.platforms.discord import DiscordAdapter  # noqa: E402
+from gateway.platforms.base import MessageType  # noqa: E402


 # Minimal valid image / audio / PDF bytes so the cache_*_from_bytes
@ -358,3 +359,91 @@ class TestHandleMessageUsesAuthenticatedRead:
        event = adapter.handle_message.call_args[0][0]
        assert event.media_urls == ["/tmp/img_from_read.png"]
        assert event.media_types == ["image/png"]
+
+    @pytest.mark.asyncio
+    async def test_native_voice_note_is_classified_as_voice(self, monkeypatch):
+        """An attachment whose ``is_voice_message()`` is true yields VOICE."""
+        from datetime import datetime, timezone
+
+        class _FakeDMChannel:
+            id = 100
+            name = "dm"
+
+        adapter = _make_adapter()
+        adapter._client = SimpleNamespace(user=SimpleNamespace(id=999))
+        adapter.handle_message = AsyncMock()
+        # Swap in the stand-in class for discord.DMChannel for this test only.
+        monkeypatch.setattr(
+            "gateway.platforms.discord.discord.DMChannel",
+            _FakeDMChannel,
+        )
+
+        voice_att = SimpleNamespace(
+            url="https://cdn.discordapp.com/attachments/fake/voice.ogg",
+            filename="voice.ogg",
+            content_type="audio/ogg",
+            size=len(_OGG_BYTES),
+            read=AsyncMock(return_value=_OGG_BYTES),
+            is_voice_message=lambda: True,
+        )
+        sender = SimpleNamespace(id=42, display_name="U", name="U")
+        incoming = SimpleNamespace(
+            id=1, content="", attachments=[voice_att], mentions=[],
+            reference=None, created_at=datetime.now(timezone.utc),
+            channel=_FakeDMChannel(), author=sender,
+        )
+        cached_path = "/tmp/voice_from_read.ogg"
+        with patch(
+            "gateway.platforms.discord.cache_audio_from_bytes",
+            return_value=cached_path,
+        ):
+            await adapter._handle_message(incoming)
+
+        event = adapter.handle_message.call_args.args[0]
+        assert event.message_type == MessageType.VOICE
+        assert event.media_urls == [cached_path]
+        assert event.media_types == ["audio/ogg"]
+
+    @pytest.mark.asyncio
+    async def test_plain_audio_attachment_stays_audio(self, monkeypatch):
+        """An attachment whose ``is_voice_message()`` is false stays AUDIO."""
+        from datetime import datetime, timezone
+
+        class _FakeDMChannel:
+            id = 100
+            name = "dm"
+
+        adapter = _make_adapter()
+        adapter._client = SimpleNamespace(user=SimpleNamespace(id=999))
+        adapter.handle_message = AsyncMock()
+        # Swap in the stand-in class for discord.DMChannel for this test only.
+        monkeypatch.setattr(
+            "gateway.platforms.discord.discord.DMChannel",
+            _FakeDMChannel,
+        )
+
+        plain_att = SimpleNamespace(
+            url="https://cdn.discordapp.com/attachments/fake/audio.ogg",
+            filename="audio.ogg",
+            content_type="audio/ogg",
+            size=len(_OGG_BYTES),
+            read=AsyncMock(return_value=_OGG_BYTES),
+            is_voice_message=lambda: False,
+        )
+        sender = SimpleNamespace(id=42, display_name="U", name="U")
+        incoming = SimpleNamespace(
+            id=1, content="", attachments=[plain_att], mentions=[],
+            reference=None, created_at=datetime.now(timezone.utc),
+            channel=_FakeDMChannel(), author=sender,
+        )
+        cached_path = "/tmp/audio_from_read.ogg"
+        with patch(
+            "gateway.platforms.discord.cache_audio_from_bytes",
+            return_value=cached_path,
+        ):
+            await adapter._handle_message(incoming)
+
+        event = adapter.handle_message.call_args.args[0]
+        assert event.message_type == MessageType.AUDIO
+        assert event.media_urls == [cached_path]
+        assert event.media_types == ["audio/ogg"]