From 93734c26e5620de35863df6ae60c728bc770e0c2 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Tue, 19 May 2026 17:15:55 -0700
Subject: [PATCH] fix(dingtalk): transcribe native voice notes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sibling fix to PR #28918 (Discord voice notes). DingTalk's rich-text
"voice" item type is its native voice-message format, but the adapter
was routing it to MessageType.AUDIO — which gateway/run.py:7605 skips
for STT. The docs claim every voice-capable platform auto-transcribes,
so this brings DingTalk in line.

Generic audio uploads (mapped to "file" by DINGTALK_TYPE_MAPPING) are
unchanged — they were already classified as DOCUMENT, not AUDIO.

Adds tests/gateway/test_dingtalk.py::TestExtractMedia covering both the
voice path and the audio-passthrough invariant.
---
 gateway/platforms/dingtalk.py  |  9 +++++-
 tests/gateway/test_dingtalk.py | 52 ++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/gateway/platforms/dingtalk.py b/gateway/platforms/dingtalk.py
index c77de2f3616..6e599ed2210 100644
--- a/gateway/platforms/dingtalk.py
+++ b/gateway/platforms/dingtalk.py
@@ -774,7 +774,14 @@ class DingTalkAdapter(BasePlatformAdapter):
                             elif mapped == "audio":
                                 media_types.append("audio")
                                 if msg_type == MessageType.TEXT:
-                                    msg_type = MessageType.AUDIO
+                                    # DingTalk's "voice" rich-text item is a
+                                    # native voice note — route through STT.
+                                    # "audio" comes from file uploads only;
+                                    # keep those as AUDIO (no auto-STT).
+                                    if item_type == "voice":
+                                        msg_type = MessageType.VOICE
+                                    else:
+                                        msg_type = MessageType.AUDIO
                             elif mapped == "video":
                                 media_types.append("video")
                                 if msg_type == MessageType.TEXT:
diff --git a/tests/gateway/test_dingtalk.py b/tests/gateway/test_dingtalk.py
index 570eb997ba0..6b2db13299d 100644
--- a/tests/gateway/test_dingtalk.py
+++ b/tests/gateway/test_dingtalk.py
@@ -542,6 +542,58 @@ class TestExtractText:
         assert DingTalkAdapter._extract_text(msg) == ""
 
 
+class TestExtractMedia:
+    """_extract_media must split native voice rich-text items (auto-STT)
+    from generic audio file uploads (kept as attachments, no STT)."""
+
+    def _msg_with_rich_text(self, items):
+        msg = MagicMock()
+        msg.text = None
+        msg.image_content = None
+        msg.rich_text_content = None
+        msg.rich_text = items
+        return msg
+
+    def test_voice_rich_text_item_classified_as_voice(self):
+        """Native DingTalk voice notes (type=voice) must enter the auto-STT
+        path via MessageType.VOICE — the gateway skips STT for AUDIO."""
+        from gateway.platforms.dingtalk import DingTalkAdapter
+        from gateway.platforms.base import MessageType
+
+        msg = self._msg_with_rich_text(
+            [{"type": "voice", "downloadCode": "dl_voice_abc"}]
+        )
+        msg_type, urls, mtypes = DingTalkAdapter._extract_media(
+            DingTalkAdapter, msg
+        )
+        assert msg_type == MessageType.VOICE
+        assert urls == ["dl_voice_abc"]
+        assert mtypes == ["audio"]
+
+    def test_audio_rich_text_item_stays_audio(self):
+        """Generic audio uploads (e.g. an mp3 the user attached) must NOT
+        be auto-transcribed — they stay MessageType.AUDIO."""
+        from gateway.platforms.dingtalk import DingTalkAdapter, DINGTALK_TYPE_MAPPING
+        from gateway.platforms.base import MessageType
+
+        # Simulate a future/non-voice audio rich-text item by extending the
+        # mapping so item_type != "voice" but still routes through the
+        # ``mapped == "audio"`` branch.
+        DINGTALK_TYPE_MAPPING["audio"] = "audio"
+        try:
+            msg = self._msg_with_rich_text(
+                [{"type": "audio", "downloadCode": "dl_audio_xyz"}]
+            )
+            msg_type, urls, mtypes = DingTalkAdapter._extract_media(
+                DingTalkAdapter, msg
+            )
+            assert msg_type == MessageType.AUDIO
+            assert urls == ["dl_audio_xyz"]
+            assert mtypes == ["audio"]
+        finally:
+            del DINGTALK_TYPE_MAPPING["audio"]
+
+
 # ---------------------------------------------------------------------------
 # Group gating — require_mention + allowed_users (parity with other platforms)
 # ---------------------------------------------------------------------------