From 93734c26e5620de35863df6ae60c728bc770e0c2 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 19 May 2026 17:15:55 -0700 Subject: [PATCH] fix(dingtalk): transcribe native voice notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sibling fix to PR #28918 (Discord voice notes). DingTalk's rich-text "voice" item type is its native voice-message format, but the adapter was routing it to MessageType.AUDIO — which gateway/run.py:7605 skips for STT. The docs claim every voice-capable platform auto-transcribes, so this brings DingTalk in line. Generic audio uploads (mapped to "file" by DINGTALK_TYPE_MAPPING) are unchanged — they were already classified as DOCUMENT, not AUDIO. Adds tests/gateway/test_dingtalk.py::TestExtractMedia covering both the voice path and the audio-passthrough invariant. --- gateway/platforms/dingtalk.py | 9 +++++- tests/gateway/test_dingtalk.py | 52 ++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/gateway/platforms/dingtalk.py b/gateway/platforms/dingtalk.py index c77de2f3616..6e599ed2210 100644 --- a/gateway/platforms/dingtalk.py +++ b/gateway/platforms/dingtalk.py @@ -774,7 +774,14 @@ class DingTalkAdapter(BasePlatformAdapter): elif mapped == "audio": media_types.append("audio") if msg_type == MessageType.TEXT: - msg_type = MessageType.AUDIO + # DingTalk's "voice" rich-text item is a + # native voice note — route through STT. + # Other item types mapped to "audio" + # stay AUDIO (no auto-STT).
+ if item_type == "voice": + msg_type = MessageType.VOICE + else: + msg_type = MessageType.AUDIO elif mapped == "video": media_types.append("video") if msg_type == MessageType.TEXT: diff --git a/tests/gateway/test_dingtalk.py b/tests/gateway/test_dingtalk.py index 570eb997ba0..6b2db13299d 100644 --- a/tests/gateway/test_dingtalk.py +++ b/tests/gateway/test_dingtalk.py @@ -542,6 +542,58 @@ class TestExtractText: assert DingTalkAdapter._extract_text(msg) == "" +class TestExtractMedia: + """_extract_media must split native voice rich-text items (auto-STT) + from generic audio file uploads (kept as attachments, no STT).""" + + def _msg_with_rich_text(self, items): + msg = MagicMock() + msg.text = None + msg.image_content = None + msg.rich_text_content = None + msg.rich_text = items + return msg + + def test_voice_rich_text_item_classified_as_voice(self): + """Native DingTalk voice notes (type=voice) must enter the auto-STT + path via MessageType.VOICE — the gateway skips STT for AUDIO.""" + from gateway.platforms.dingtalk import DingTalkAdapter + from gateway.platforms.base import MessageType + + msg = self._msg_with_rich_text( + [{"type": "voice", "downloadCode": "dl_voice_abc"}] + ) + msg_type, urls, mtypes = DingTalkAdapter._extract_media( + DingTalkAdapter, msg + ) + assert msg_type == MessageType.VOICE + assert urls == ["dl_voice_abc"] + assert mtypes == ["audio"] + + def test_audio_rich_text_item_stays_audio(self): + """Generic audio uploads (e.g. an mp3 the user attached) must NOT + be auto-transcribed — they stay MessageType.AUDIO.""" + from gateway.platforms.dingtalk import DingTalkAdapter, DINGTALK_TYPE_MAPPING + from gateway.platforms.base import MessageType + + # Simulate a future/non-voice audio rich-text item by extending the + # mapping so item_type != "voice" but still routes through the + # ``mapped == "audio"`` branch. 
+ DINGTALK_TYPE_MAPPING["audio"] = "audio" + try: + msg = self._msg_with_rich_text( + [{"type": "audio", "downloadCode": "dl_audio_xyz"}] + ) + msg_type, urls, mtypes = DingTalkAdapter._extract_media( + DingTalkAdapter, msg + ) + assert msg_type == MessageType.AUDIO + assert urls == ["dl_audio_xyz"] + assert mtypes == ["audio"] + finally: + del DINGTALK_TYPE_MAPPING["audio"] + + # --------------------------------------------------------------------------- # Group gating — require_mention + allowed_users (parity with other platforms) # ---------------------------------------------------------------------------