From cd592c105cbbc0bbf927b30ee9d061b1dd7b0b1b Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 27 Jun 2026 04:40:05 -0700 Subject: [PATCH] feat(send_message): native WhatsApp media delivery via Baileys bridge (#53598) send_message with MEDIA:/path to a WhatsApp target previously dropped the attachment: the WhatsApp branch never passed media_files, the plugin's _standalone_send accepted the param but only POSTed text, and WhatsApp was absent from the media-supported platform list. - send_message_tool: add a Platform.WHATSAPP media block (mirrors Feishu) that routes media_files through the whatsapp plugin's standalone_sender_fn, and add whatsapp to the supported-media list strings. - whatsapp adapter: _standalone_send now sends text first (skipped when the chunk is media-only), then uploads each file via the bridge /send-media endpoint with a mediaType derived from extension/is_voice/force_document, so images/videos/voice arrive as native bubbles instead of documents. - _bridge_media_type classifier maps ext -> image|video|audio|document. Closes #19105 (remaining send_message gap). Other items in the report (inbound video paths, image_generate auto-deliver, history dedup, native gateway bubbles) already landed on main. --- plugins/platforms/whatsapp/adapter.py | 86 +++++-- .../tools/test_whatsapp_send_message_media.py | 221 ++++++++++++++++++ tools/send_message_tool.py | 31 ++- 3 files changed, 322 insertions(+), 16 deletions(-) create mode 100644 tests/tools/test_whatsapp_send_message_media.py diff --git a/plugins/platforms/whatsapp/adapter.py b/plugins/platforms/whatsapp/adapter.py index cc5f5f95ac2..dc4361213e5 100644 --- a/plugins/platforms/whatsapp/adapter.py +++ b/plugins/platforms/whatsapp/adapter.py @@ -1293,6 +1293,31 @@ class WhatsAppAdapter(WhatsAppBehaviorMixin, BasePlatformAdapter): # ────────────────────────────────────────────────────────────────────────── +_WA_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"} +_WA_VIDEO_EXTS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".3gp"} +_WA_AUDIO_EXTS = {".ogg", ".opus", ".mp3", ".wav", ".m4a", ".flac"} + + +def _bridge_media_type(file_path: str, is_voice: bool, force_document: bool) -> str: + """Map a local media file to the bridge /send-media ``mediaType``. + + Returns one of ``image`` | ``video`` | ``audio`` | ``document`` so the + Baileys bridge renders the right native WhatsApp message kind. Voice notes + and audio files route to ``audio``; ``force_document`` (the [[as_document]] + directive) forces every file to ``document`` regardless of extension. + """ + if force_document: + return "document" + ext = os.path.splitext(file_path)[1].lower() + if is_voice or ext in _WA_AUDIO_EXTS: + return "audio" + if ext in _WA_IMAGE_EXTS: + return "image" + if ext in _WA_VIDEO_EXTS: + return "video" + return "document" + + async def _standalone_send( pconfig, chat_id, @@ -1316,22 +1341,55 @@ async def _standalone_send( try: bridge_port = extra.get("bridge_port", 3000) normalized_chat_id = to_whatsapp_jid(chat_id) + media = media_files or [] + text = message or "" + last_message_id = None async with aiohttp.ClientSession() as session: - async with session.post( - f"http://localhost:{bridge_port}/send", - json={"chatId": normalized_chat_id, "message": message}, - timeout=aiohttp.ClientTimeout(total=30), - ) as resp: - if resp.status == 200: + # 1) Text first (skip the /send call when this chunk is media-only). + if text.strip(): + async with session.post( + f"http://localhost:{bridge_port}/send", + json={"chatId": normalized_chat_id, "message": text}, + timeout=aiohttp.ClientTimeout(total=30), + ) as resp: + if resp.status != 200: + body = await resp.text() + return {"error": f"WhatsApp bridge error ({resp.status}): {body}"} data = await resp.json() - return { - "success": True, - "platform": "whatsapp", - "chat_id": normalized_chat_id, - "message_id": data.get("messageId"), - } - body = await resp.text() - return {"error": f"WhatsApp bridge error ({resp.status}): {body}"} + last_message_id = data.get("messageId") + + # 2) Each media file as a native attachment via /send-media. The + # bridge maps mediaType -> image/video/audio/document message kinds + # so PNG/JPEG/WebP/GIF arrive as inline images, MP4 as a video + # bubble, and ogg/opus as a voice note — not a file/document. + for media_path, is_voice in media: + if not os.path.exists(media_path): + return {"error": f"WhatsApp media file not found: {media_path}"} + media_type = _bridge_media_type(media_path, is_voice, force_document) + payload: Dict[str, Any] = { + "chatId": normalized_chat_id, + "filePath": media_path, + "mediaType": media_type, + } + if media_type == "document": + payload["fileName"] = os.path.basename(media_path) + async with session.post( + f"http://localhost:{bridge_port}/send-media", + json=payload, + timeout=aiohttp.ClientTimeout(total=120), + ) as resp: + if resp.status != 200: + body = await resp.text() + return {"error": f"WhatsApp media error ({resp.status}): {body}"} + data = await resp.json() + last_message_id = data.get("messageId") or last_message_id + + return { + "success": True, + "platform": "whatsapp", + "chat_id": normalized_chat_id, + "message_id": last_message_id, + } except Exception as e: return {"error": f"WhatsApp send failed: {e}"} diff --git a/tests/tools/test_whatsapp_send_message_media.py b/tests/tools/test_whatsapp_send_message_media.py new file mode 100644 index 00000000000..d1fac4495e0 --- /dev/null +++ b/tests/tools/test_whatsapp_send_message_media.py @@ -0,0 +1,221 @@ +"""WhatsApp media delivery for send_message (#19105). + +Covers two layers: + +* ``_bridge_media_type`` — extension/voice/force_document -> bridge mediaType. +* ``_standalone_send`` — text-first then per-file ``/send-media`` uploads, + media-only (skip ``/send``), and missing-file errors. The bridge HTTP calls + are mocked at the ``aiohttp.ClientSession`` boundary. +""" + +import asyncio +import os +import tempfile +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from plugins.platforms.whatsapp.adapter import _bridge_media_type, _standalone_send + + +# --------------------------------------------------------------------------- +# _bridge_media_type +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "path,is_voice,force_document,expected", + [ + ("a.png", False, False, "image"), + ("a.JPG", False, False, "image"), + ("a.jpeg", False, False, "image"), + ("a.webp", False, False, "image"), + ("a.gif", False, False, "image"), + ("a.mp4", False, False, "video"), + ("a.mov", False, False, "video"), + ("a.webm", False, False, "video"), + ("a.ogg", True, False, "audio"), + ("a.opus", False, False, "audio"), + ("a.mp3", False, False, "audio"), + ("a.wav", False, False, "audio"), + ("a.pdf", False, False, "document"), + ("a.zip", False, False, "document"), + # force_document overrides everything + ("a.png", False, True, "document"), + ("a.mp4", False, True, "document"), + # is_voice wins over a video extension + ("a.mp4", True, False, "audio"), + ], +) +def test_bridge_media_type(path, is_voice, force_document, expected): + assert _bridge_media_type(path, is_voice, force_document) == expected + + +# --------------------------------------------------------------------------- +# _standalone_send — bridge HTTP mocked +# --------------------------------------------------------------------------- + + +def _resp(status, json_data=None, text_data=None): + r = AsyncMock() + r.status = status + r.json = AsyncMock(return_value=json_data or {}) + r.text = AsyncMock(return_value=text_data or "") + return r + + +def _session_with(responses): + """Build a mocked aiohttp.ClientSession that returns *responses* in order + and records every POST (url, json_payload).""" + calls = [] + idx = [0] + + def _post(url, **kwargs): + calls.append((url, kwargs.get("json"))) + r = responses[idx[0]] if idx[0] < len(responses) else responses[-1] + idx[0] += 1 + ctx = MagicMock() + ctx.__aenter__ = AsyncMock(return_value=r) + ctx.__aexit__ = AsyncMock(return_value=False) + return ctx + + session = MagicMock() + session.post = MagicMock(side_effect=_post) + session_ctx = MagicMock() + session_ctx.__aenter__ = AsyncMock(return_value=session) + session_ctx.__aexit__ = AsyncMock(return_value=False) + return session_ctx, calls + + +def _pconfig(): + return SimpleNamespace(token="", extra={"bridge_port": 3000}) + + +def _tmpfile(suffix): + f = tempfile.NamedTemporaryFile(suffix=suffix, delete=False) + f.write(b"x") + f.close() + return f.name + + +def test_text_plus_mixed_media_routes_native_types(): + img = _tmpfile(".png") + vid = _tmpfile(".mp4") + voice = _tmpfile(".ogg") + try: + session_ctx, calls = _session_with( + [ + _resp(200, {"messageId": "t1"}), + _resp(200, {"messageId": "m1"}), + _resp(200, {"messageId": "m2"}), + _resp(200, {"messageId": "m3"}), + ] + ) + with patch("aiohttp.ClientSession", return_value=session_ctx): + res = asyncio.run( + _standalone_send( + _pconfig(), + "12345", + "hello", + media_files=[(img, False), (vid, False), (voice, True)], + ) + ) + assert res["success"] is True + # text first, then three media uploads in order + assert calls[0][0].endswith("/send") + assert calls[0][1]["message"] == "hello" + media_types = [c[1]["mediaType"] for c in calls if c[0].endswith("/send-media")] + assert media_types == ["image", "video", "audio"] + # chat id normalized to a WhatsApp JID + assert "@" in calls[0][1]["chatId"] + finally: + for p in (img, vid, voice): + os.unlink(p) + + +def test_media_only_skips_text_send(): + img = _tmpfile(".jpg") + try: + session_ctx, calls = _session_with([_resp(200, {"messageId": "m1"})]) + with patch("aiohttp.ClientSession", return_value=session_ctx): + res = asyncio.run( + _standalone_send(_pconfig(), "12345", "", media_files=[(img, False)]) + ) + assert res["success"] is True + assert all(c[0].endswith("/send-media") for c in calls) + finally: + os.unlink(img) + + +def test_force_document_sends_image_as_document(): + img = _tmpfile(".png") + try: + session_ctx, calls = _session_with( + [_resp(200, {"messageId": "t1"}), _resp(200, {"messageId": "m1"})] + ) + with patch("aiohttp.ClientSession", return_value=session_ctx): + res = asyncio.run( + _standalone_send( + _pconfig(), + "12345", + "doc", + media_files=[(img, False)], + force_document=True, + ) + ) + assert res["success"] is True + media_call = [c for c in calls if c[0].endswith("/send-media")][0] + assert media_call[1]["mediaType"] == "document" + assert media_call[1]["fileName"] == os.path.basename(img) + finally: + os.unlink(img) + + +def test_missing_media_file_errors(): + session_ctx, _ = _session_with([_resp(200, {"messageId": "t1"})]) + with patch("aiohttp.ClientSession", return_value=session_ctx): + res = asyncio.run( + _standalone_send( + _pconfig(), + "12345", + "hi", + media_files=[("/no/such/file.png", False)], + ) + ) + assert "error" in res + assert "not found" in res["error"] + + +def test_media_upload_error_propagates(): + img = _tmpfile(".png") + try: + session_ctx, _ = _session_with( + [ + _resp(200, {"messageId": "t1"}), + _resp(500, text_data="boom"), + ] + ) + with patch("aiohttp.ClientSession", return_value=session_ctx): + res = asyncio.run( + _standalone_send( + _pconfig(), "12345", "hi", media_files=[(img, False)] + ) + ) + assert "error" in res + assert "500" in res["error"] + finally: + os.unlink(img) + + +def test_text_only_unchanged_behavior(): + session_ctx, calls = _session_with([_resp(200, {"messageId": "t1"})]) + with patch("aiohttp.ClientSession", return_value=session_ctx): + res = asyncio.run(_standalone_send(_pconfig(), "12345", "just text")) + assert res == { + "success": True, + "platform": "whatsapp", + "chat_id": calls[0][1]["chatId"], + "message_id": "t1", + } + assert len(calls) == 1 and calls[0][0].endswith("/send") diff --git a/tools/send_message_tool.py b/tools/send_message_tool.py index f7c32bff72e..054da5290f5 100644 --- a/tools/send_message_tool.py +++ b/tools/send_message_tool.py @@ -903,11 +903,38 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, last_result = result return last_result + # --- WhatsApp: native media attachment support via the registry's + # standalone_sender_fn (plugins/platforms/whatsapp/adapter.py::_standalone_send). + # The plugin uploads each file through the local Baileys bridge /send-media + # endpoint so images/videos/audio arrive as native bubbles, not documents. #41112 + if platform == Platform.WHATSAPP and media_files: + from gateway.platform_registry import platform_registry as _pr_wa + from hermes_cli.plugins import discover_plugins as _dp_wa + _dp_wa() + _wa_entry = _pr_wa.get("whatsapp") + if _wa_entry is None or _wa_entry.standalone_sender_fn is None: + return {"error": "WhatsApp plugin not registered or missing standalone_sender_fn"} + last_result = None + for i, chunk in enumerate(chunks): + is_last = (i == len(chunks) - 1) + result = await _wa_entry.standalone_sender_fn( + pconfig, + chat_id, + chunk, + media_files=media_files if is_last else None, + thread_id=thread_id, + force_document=force_document, + ) + if isinstance(result, dict) and result.get("error"): + return result + last_result = result + return last_result + # --- Non-media platforms --- if media_files and not message.strip(): return { "error": ( - f"send_message MEDIA delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao and feishu; " + f"send_message MEDIA delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao, feishu and whatsapp; " f"target {platform.value} had only media attachments" ) } @@ -915,7 +942,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, if media_files: warning = ( f"MEDIA attachments were omitted for {platform.value}; " - "native send_message media delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao and feishu" + "native send_message media delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao, feishu and whatsapp" ) last_result = None