fix(dingtalk): transcribe native voice notes

Sibling fix to PR #28918 (Discord voice notes). DingTalk's rich-text
"voice" item type is its native voice-message format, but the adapter
was routing it to MessageType.AUDIO — which gateway/run.py:7605 skips
for STT. The docs claim every voice-capable platform auto-transcribes,
so this brings DingTalk in line.

Generic audio uploads (mapped to "file" by DINGTALK_TYPE_MAPPING) are
unchanged — they were already classified as DOCUMENT, not AUDIO.

Adds tests/gateway/test_dingtalk.py::TestExtractMedia covering both the
voice path and the audio-passthrough invariant.
This commit is contained in:
Teknium 2026-05-19 17:15:55 -07:00
parent 448a3f9ea2
commit 93734c26e5
2 changed files with 60 additions and 1 deletions

View file

@@ -774,7 +774,14 @@ class DingTalkAdapter(BasePlatformAdapter):
elif mapped == "audio":
media_types.append("audio")
if msg_type == MessageType.TEXT:
msg_type = MessageType.AUDIO
# DingTalk's "voice" rich-text item is a
# native voice note — route through STT.
# "audio" comes from file uploads only;
# keep those as AUDIO (no auto-STT).
if item_type == "voice":
msg_type = MessageType.VOICE
else:
msg_type = MessageType.AUDIO
elif mapped == "video":
media_types.append("video")
if msg_type == MessageType.TEXT:

View file

@@ -542,6 +542,58 @@ class TestExtractText:
assert DingTalkAdapter._extract_text(msg) == ""
class TestExtractMedia:
    """Check how ``_extract_media`` classifies audio-like rich-text items.

    Native voice rich-text items (``type == "voice"``) must come back as
    ``MessageType.VOICE`` so they reach auto-STT; any other item that maps
    to ``"audio"`` must stay ``MessageType.AUDIO`` (attachment, no STT).
    """

    def _msg_with_rich_text(self, items):
        """Return a mock message whose only populated field is ``rich_text``.

        ``text``, ``image_content`` and ``rich_text_content`` are explicitly
        set to ``None`` so a bare ``MagicMock`` attribute (which is truthy)
        cannot be mistaken for real content by the code under test.
        """
        msg = MagicMock()
        msg.text = None
        msg.image_content = None
        msg.rich_text_content = None
        msg.rich_text = items
        return msg

    def test_voice_rich_text_item_classified_as_voice(self):
        """Native DingTalk voice notes (type=voice) must enter the auto-STT
        path via MessageType.VOICE -- the gateway skips STT for AUDIO."""
        from gateway.platforms.dingtalk import DingTalkAdapter
        from gateway.platforms.base import MessageType

        msg = self._msg_with_rich_text(
            [{"type": "voice", "downloadCode": "dl_voice_abc"}]
        )
        # The class itself is passed in the ``self`` slot, so no adapter
        # instance has to be constructed for this call.
        msg_type, urls, mtypes = DingTalkAdapter._extract_media(
            DingTalkAdapter, msg
        )

        assert msg_type == MessageType.VOICE
        assert urls == ["dl_voice_abc"]
        # The media type stays "audio"; only the message type changes.
        assert mtypes == ["audio"]

    def test_audio_rich_text_item_stays_audio(self):
        """Generic audio uploads (e.g. an mp3 the user attached) must NOT
        be auto-transcribed -- they stay MessageType.AUDIO."""
        from unittest.mock import patch

        from gateway.platforms.dingtalk import DingTalkAdapter, DINGTALK_TYPE_MAPPING
        from gateway.platforms.base import MessageType

        # Simulate a future/non-voice audio rich-text item by extending the
        # mapping so item_type != "voice" but still routes through the
        # ``mapped == "audio"`` branch.  ``patch.dict`` restores the mapping
        # to its exact prior contents on exit, so a pre-existing "audio"
        # entry (should one ever be added) is not deleted by test cleanup.
        with patch.dict(DINGTALK_TYPE_MAPPING, {"audio": "audio"}):
            msg = self._msg_with_rich_text(
                [{"type": "audio", "downloadCode": "dl_audio_xyz"}]
            )
            msg_type, urls, mtypes = DingTalkAdapter._extract_media(
                DingTalkAdapter, msg
            )

            assert msg_type == MessageType.AUDIO
            assert urls == ["dl_audio_xyz"]
            assert mtypes == ["audio"]
# ---------------------------------------------------------------------------
# Group gating — require_mention + allowed_users (parity with other platforms)
# ---------------------------------------------------------------------------