mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-03 07:21:54 +00:00
fix(dingtalk): transcribe native voice notes
Sibling fix to PR #28918 (Discord voice notes). DingTalk's rich-text "voice" item type is its native voice-message format, but the adapter was routing it to MessageType.AUDIO — which gateway/run.py:7605 skips for STT. The docs claim every voice-capable platform auto-transcribes, so this brings DingTalk in line. Generic audio uploads (mapped to "file" by DINGTALK_TYPE_MAPPING) are unchanged — they were already classified as DOCUMENT, not AUDIO. Adds tests/gateway/test_dingtalk.py::TestExtractMedia covering both the voice path and the audio-passthrough invariant.
This commit is contained in:
parent
448a3f9ea2
commit
93734c26e5
2 changed files with 60 additions and 1 deletions
|
|
@ -774,7 +774,14 @@ class DingTalkAdapter(BasePlatformAdapter):
|
|||
elif mapped == "audio":
|
||||
media_types.append("audio")
|
||||
if msg_type == MessageType.TEXT:
|
||||
msg_type = MessageType.AUDIO
|
||||
# DingTalk's "voice" rich-text item is a
|
||||
# native voice note — route through STT.
|
||||
# "audio" comes from file uploads only;
|
||||
# keep those as AUDIO (no auto-STT).
|
||||
if item_type == "voice":
|
||||
msg_type = MessageType.VOICE
|
||||
else:
|
||||
msg_type = MessageType.AUDIO
|
||||
elif mapped == "video":
|
||||
media_types.append("video")
|
||||
if msg_type == MessageType.TEXT:
|
||||
|
|
|
|||
|
|
@ -542,6 +542,58 @@ class TestExtractText:
|
|||
assert DingTalkAdapter._extract_text(msg) == ""
|
||||
|
||||
|
||||
class TestExtractMedia:
    """Check how ``_extract_media`` classifies rich-text audio items.

    Native voice items (``type == "voice"``) are expected to come back as
    ``MessageType.VOICE`` so they enter the auto-STT path, while any other
    item routed through the ``"audio"`` mapping is expected to remain
    ``MessageType.AUDIO`` (kept as an attachment, no STT).
    """

    def _msg_with_rich_text(self, items):
        """Return a mock message whose only populated field is ``rich_text``.

        ``text``, ``image_content`` and ``rich_text_content`` are explicitly
        set to ``None`` so the MagicMock does not auto-create truthy
        attributes for them.
        """
        msg = MagicMock()
        msg.text = None
        msg.image_content = None
        msg.rich_text_content = None
        msg.rich_text = items
        return msg

    def test_voice_rich_text_item_classified_as_voice(self):
        """Native DingTalk voice notes (type=voice) must enter the auto-STT
        path via MessageType.VOICE — the gateway skips STT for AUDIO."""
        from gateway.platforms.dingtalk import DingTalkAdapter
        from gateway.platforms.base import MessageType

        msg = self._msg_with_rich_text(
            [{"type": "voice", "downloadCode": "dl_voice_abc"}]
        )
        # The class itself is passed in the ``self`` slot, so no adapter
        # instance has to be constructed for this call.
        msg_type, urls, mtypes = DingTalkAdapter._extract_media(
            DingTalkAdapter, msg
        )
        assert msg_type == MessageType.VOICE
        assert urls == ["dl_voice_abc"]
        assert mtypes == ["audio"]

    def test_audio_rich_text_item_stays_audio(self):
        """Generic audio uploads (e.g. an mp3 the user attached) must NOT
        be auto-transcribed — they stay MessageType.AUDIO."""
        from unittest.mock import patch

        from gateway.platforms.dingtalk import DingTalkAdapter, DINGTALK_TYPE_MAPPING
        from gateway.platforms.base import MessageType

        # Simulate a future/non-voice audio rich-text item by extending the
        # mapping so item_type != "voice" but still routes through the
        # ``mapped == "audio"`` branch.  ``patch.dict`` restores the mapping
        # to its exact prior contents on exit, so a pre-existing "audio"
        # entry (should one ever be added) is not deleted by this test.
        with patch.dict(DINGTALK_TYPE_MAPPING, {"audio": "audio"}):
            msg = self._msg_with_rich_text(
                [{"type": "audio", "downloadCode": "dl_audio_xyz"}]
            )
            msg_type, urls, mtypes = DingTalkAdapter._extract_media(
                DingTalkAdapter, msg
            )
            assert msg_type == MessageType.AUDIO
            assert urls == ["dl_audio_xyz"]
            assert mtypes == ["audio"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Group gating — require_mention + allowed_users (parity with other platforms)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue