fix(discord): transcribe native voice notes

This commit is contained in:
helix4u 2026-05-19 14:28:00 -06:00 committed by Teknium
parent d35f8932e8
commit 448a3f9ea2
2 changed files with 111 additions and 1 deletions

View file

@ -3602,6 +3602,24 @@ class DiscordAdapter(BasePlatformAdapter):
return 32 * 1024 * 1024
return max(0, value)
@staticmethod
def _is_discord_voice_message_attachment(att: Any) -> bool:
    """Tell whether an audio attachment is a Discord native voice note.

    An ``is_voice_message`` marker on the attachment takes precedence: it is
    called when callable, otherwise used as-is, and the result is coerced to
    ``bool``.  If evaluating a callable marker raises, the failure is logged
    at debug level and the attachment is treated as not a voice note.

    When the marker is missing or ``None``, the attachment counts as a voice
    note only if both ``duration`` and ``waveform`` are set (not ``None``).
    """
    flag = getattr(att, "is_voice_message", None)

    if flag is None:
        # No explicit marker: fall back to the voice-note metadata fields.
        has_duration = getattr(att, "duration", None) is not None
        return has_duration and getattr(att, "waveform", None) is not None

    if not callable(flag):
        return bool(flag)

    try:
        return bool(flag())
    except Exception as exc:
        logger.debug("[Discord] is_voice_message() failed for attachment: %s", exc)
        return False
def _discord_free_response_channels(self) -> set:
"""Return Discord channel IDs where no bot mention is required.
@ -4542,7 +4560,10 @@ class DiscordAdapter(BasePlatformAdapter):
elif att.content_type.startswith("video/"):
msg_type = MessageType.VIDEO
elif att.content_type.startswith("audio/"):
msg_type = MessageType.AUDIO
if self._is_discord_voice_message_attachment(att):
msg_type = MessageType.VOICE
else:
msg_type = MessageType.AUDIO
else:
doc_ext = ""
if att.filename:

View file

@ -59,6 +59,7 @@ def _ensure_discord_mock():
# Run the discord mock setup before the imports below; they are deliberately
# placed after this call (hence the E402 suppressions).
# NOTE(review): presumably this lets the adapter module import without the
# real discord package installed — confirm against _ensure_discord_mock.
_ensure_discord_mock()
from gateway.platforms.discord import DiscordAdapter # noqa: E402
from gateway.platforms.base import MessageType # noqa: E402
# Minimal valid image / audio / PDF bytes so the cache_*_from_bytes
@ -358,3 +359,91 @@ class TestHandleMessageUsesAuthenticatedRead:
event = adapter.handle_message.call_args[0][0]
assert event.media_urls == ["/tmp/img_from_read.png"]
assert event.media_types == ["image/png"]
@pytest.mark.asyncio
async def test_native_voice_note_is_classified_as_voice(self, monkeypatch):
    """A native Discord voice note must be dispatched as a VOICE event.

    The attachment's ``is_voice_message()`` returns True, so the event handed
    to ``handle_message`` is expected to carry ``MessageType.VOICE`` along
    with the cached audio path and the attachment's content type.
    """
    from datetime import datetime, timezone

    adapter = _make_adapter()
    adapter._client = SimpleNamespace(user=SimpleNamespace(id=999))
    adapter.handle_message = AsyncMock()

    class _FakeDMChannel:
        id = 100
        name = "dm"

    # Replace the DMChannel type the adapter module sees with the stand-in.
    # NOTE(review): presumably so the message is treated as a DM — confirm.
    monkeypatch.setattr(
        "gateway.platforms.discord.discord.DMChannel",
        _FakeDMChannel,
    )

    voice_note = SimpleNamespace(
        url="https://cdn.discordapp.com/attachments/fake/voice.ogg",
        filename="voice.ogg",
        content_type="audio/ogg",
        size=len(_OGG_BYTES),
        read=AsyncMock(return_value=_OGG_BYTES),
        is_voice_message=lambda: True,
    )
    sender = SimpleNamespace(id=42, display_name="U", name="U")
    incoming = SimpleNamespace(
        id=1,
        content="",
        attachments=[voice_note],
        mentions=[],
        reference=None,
        created_at=datetime.now(timezone.utc),
        channel=_FakeDMChannel(),
        author=sender,
    )

    # Stub the audio cache so the event reports a predictable local path.
    cache_stub = patch(
        "gateway.platforms.discord.cache_audio_from_bytes",
        return_value="/tmp/voice_from_read.ogg",
    )
    with cache_stub:
        await adapter._handle_message(incoming)

    dispatched = adapter.handle_message.call_args[0][0]
    assert dispatched.message_type == MessageType.VOICE
    assert dispatched.media_urls == ["/tmp/voice_from_read.ogg"]
    assert dispatched.media_types == ["audio/ogg"]
@pytest.mark.asyncio
async def test_plain_audio_attachment_stays_audio(self, monkeypatch):
    """An ordinary audio upload must be dispatched as an AUDIO event.

    The attachment's ``is_voice_message()`` returns False, so the event handed
    to ``handle_message`` is expected to keep ``MessageType.AUDIO`` while
    still carrying the cached audio path and the attachment's content type.
    """
    from datetime import datetime, timezone

    adapter = _make_adapter()
    adapter._client = SimpleNamespace(user=SimpleNamespace(id=999))
    adapter.handle_message = AsyncMock()

    class _FakeDMChannel:
        id = 100
        name = "dm"

    # Replace the DMChannel type the adapter module sees with the stand-in.
    # NOTE(review): presumably so the message is treated as a DM — confirm.
    monkeypatch.setattr(
        "gateway.platforms.discord.discord.DMChannel",
        _FakeDMChannel,
    )

    audio_upload = SimpleNamespace(
        url="https://cdn.discordapp.com/attachments/fake/audio.ogg",
        filename="audio.ogg",
        content_type="audio/ogg",
        size=len(_OGG_BYTES),
        read=AsyncMock(return_value=_OGG_BYTES),
        is_voice_message=lambda: False,
    )
    sender = SimpleNamespace(id=42, display_name="U", name="U")
    incoming = SimpleNamespace(
        id=1,
        content="",
        attachments=[audio_upload],
        mentions=[],
        reference=None,
        created_at=datetime.now(timezone.utc),
        channel=_FakeDMChannel(),
        author=sender,
    )

    # Stub the audio cache so the event reports a predictable local path.
    cache_stub = patch(
        "gateway.platforms.discord.cache_audio_from_bytes",
        return_value="/tmp/audio_from_read.ogg",
    )
    with cache_stub:
        await adapter._handle_message(incoming)

    dispatched = adapter.handle_message.call_args[0][0]
    assert dispatched.message_type == MessageType.AUDIO
    assert dispatched.media_urls == ["/tmp/audio_from_read.ogg"]
    assert dispatched.media_types == ["audio/ogg"]