mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
feat(send_message): native WhatsApp media delivery via Baileys bridge (#53598)
send_message with MEDIA:/path to a WhatsApp target previously dropped the attachment: the WhatsApp branch never passed media_files, the plugin's _standalone_send accepted the param but only POSTed text, and WhatsApp was absent from the media-supported platform list. - send_message_tool: add a Platform.WHATSAPP media block (mirrors Feishu) that routes media_files through the whatsapp plugin's standalone_sender_fn, and add whatsapp to the supported-media list strings. - whatsapp adapter: _standalone_send now sends text first (skipped when the chunk is media-only), then uploads each file via the bridge /send-media endpoint with a mediaType derived from extension/is_voice/force_document, so images/videos/voice arrive as native bubbles instead of documents. - _bridge_media_type classifier maps ext -> image|video|audio|document. Closes #19105 (remaining send_message gap). Other items in the report (inbound video paths, image_generate auto-deliver, history dedup, native gateway bubbles) already landed on main.
This commit is contained in:
parent
88c02469cc
commit
cd592c105c
3 changed files with 322 additions and 16 deletions
|
|
@ -1293,6 +1293,31 @@ class WhatsAppAdapter(WhatsAppBehaviorMixin, BasePlatformAdapter):
|
|||
# ──────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
# Extension tables consulted by _bridge_media_type. Entries are lower-case and
# include the leading dot because the lookup key is the lower-cased result of
# os.path.splitext().
# Extensions classified as the bridge "image" mediaType.
_WA_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
|
||||
# Extensions classified as the bridge "video" mediaType.
_WA_VIDEO_EXTS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".3gp"}
|
||||
# Extensions classified as the bridge "audio" mediaType.
_WA_AUDIO_EXTS = {".ogg", ".opus", ".mp3", ".wav", ".m4a", ".flac"}
|
||||
|
||||
|
||||
def _bridge_media_type(file_path: str, is_voice: bool, force_document: bool) -> str:
    """Pick the ``mediaType`` value to send to the bridge ``/send-media`` endpoint.

    Precedence, highest first:

    1. ``force_document`` -> ``"document"``, whatever the extension is.
    2. ``is_voice`` -> ``"audio"``, whatever the extension is.
    3. The lower-cased file extension, looked up in the audio, image and
       video extension tables (in that order).
    4. Anything else -> ``"document"``.

    Args:
        file_path: Path of the local file; only its extension is inspected.
        is_voice: True when the attachment was flagged as a voice note.
        force_document: True when every attachment must go out as a document.

    Returns:
        One of ``"image"``, ``"video"``, ``"audio"`` or ``"document"``.
    """
    if force_document:
        return "document"

    suffix = os.path.splitext(file_path)[1].lower()
    if is_voice:
        return "audio"

    # The tables are checked in the same order as the original if-chain.
    lookup_order = (
        ("audio", _WA_AUDIO_EXTS),
        ("image", _WA_IMAGE_EXTS),
        ("video", _WA_VIDEO_EXTS),
    )
    for kind, known_suffixes in lookup_order:
        if suffix in known_suffixes:
            return kind
    return "document"
|
||||
|
||||
|
||||
async def _standalone_send(
|
||||
pconfig,
|
||||
chat_id,
|
||||
|
|
@ -1316,22 +1341,55 @@ async def _standalone_send(
|
|||
try:
|
||||
bridge_port = extra.get("bridge_port", 3000)
|
||||
normalized_chat_id = to_whatsapp_jid(chat_id)
|
||||
media = media_files or []
|
||||
text = message or ""
|
||||
last_message_id = None
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(
|
||||
f"http://localhost:{bridge_port}/send",
|
||||
json={"chatId": normalized_chat_id, "message": message},
|
||||
timeout=aiohttp.ClientTimeout(total=30),
|
||||
) as resp:
|
||||
if resp.status == 200:
|
||||
# 1) Text first (skip the /send call when this chunk is media-only).
|
||||
if text.strip():
|
||||
async with session.post(
|
||||
f"http://localhost:{bridge_port}/send",
|
||||
json={"chatId": normalized_chat_id, "message": text},
|
||||
timeout=aiohttp.ClientTimeout(total=30),
|
||||
) as resp:
|
||||
if resp.status != 200:
|
||||
body = await resp.text()
|
||||
return {"error": f"WhatsApp bridge error ({resp.status}): {body}"}
|
||||
data = await resp.json()
|
||||
return {
|
||||
"success": True,
|
||||
"platform": "whatsapp",
|
||||
"chat_id": normalized_chat_id,
|
||||
"message_id": data.get("messageId"),
|
||||
}
|
||||
body = await resp.text()
|
||||
return {"error": f"WhatsApp bridge error ({resp.status}): {body}"}
|
||||
last_message_id = data.get("messageId")
|
||||
|
||||
# 2) Each media file as a native attachment via /send-media. The
|
||||
# bridge maps mediaType -> image/video/audio/document message kinds
|
||||
# so PNG/JPEG/WebP/GIF arrive as inline images, MP4 as a video
|
||||
# bubble, and ogg/opus as a voice note — not a file/document.
|
||||
for media_path, is_voice in media:
|
||||
if not os.path.exists(media_path):
|
||||
return {"error": f"WhatsApp media file not found: {media_path}"}
|
||||
media_type = _bridge_media_type(media_path, is_voice, force_document)
|
||||
payload: Dict[str, Any] = {
|
||||
"chatId": normalized_chat_id,
|
||||
"filePath": media_path,
|
||||
"mediaType": media_type,
|
||||
}
|
||||
if media_type == "document":
|
||||
payload["fileName"] = os.path.basename(media_path)
|
||||
async with session.post(
|
||||
f"http://localhost:{bridge_port}/send-media",
|
||||
json=payload,
|
||||
timeout=aiohttp.ClientTimeout(total=120),
|
||||
) as resp:
|
||||
if resp.status != 200:
|
||||
body = await resp.text()
|
||||
return {"error": f"WhatsApp media error ({resp.status}): {body}"}
|
||||
data = await resp.json()
|
||||
last_message_id = data.get("messageId") or last_message_id
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"platform": "whatsapp",
|
||||
"chat_id": normalized_chat_id,
|
||||
"message_id": last_message_id,
|
||||
}
|
||||
except Exception as e:
|
||||
return {"error": f"WhatsApp send failed: {e}"}
|
||||
|
||||
|
|
|
|||
221
tests/tools/test_whatsapp_send_message_media.py
Normal file
221
tests/tools/test_whatsapp_send_message_media.py
Normal file
|
|
@ -0,0 +1,221 @@
|
|||
"""WhatsApp media delivery for send_message (#19105).
|
||||
|
||||
Covers two layers:
|
||||
|
||||
* ``_bridge_media_type`` — extension/voice/force_document -> bridge mediaType.
|
||||
* ``_standalone_send`` — text-first then per-file ``/send-media`` uploads,
|
||||
media-only (skip ``/send``), and missing-file errors. The bridge HTTP calls
|
||||
are mocked at the ``aiohttp.ClientSession`` boundary.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import tempfile
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from plugins.platforms.whatsapp.adapter import _bridge_media_type, _standalone_send
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _bridge_media_type
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# (path, is_voice, force_document, expected mediaType)
_BRIDGE_MEDIA_TYPE_CASES = [
    # Images; the upper-case suffix checks case-insensitive matching.
    ("a.png", False, False, "image"),
    ("a.JPG", False, False, "image"),
    ("a.jpeg", False, False, "image"),
    ("a.webp", False, False, "image"),
    ("a.gif", False, False, "image"),
    # Videos
    ("a.mp4", False, False, "video"),
    ("a.mov", False, False, "video"),
    ("a.webm", False, False, "video"),
    # Audio, both flagged as voice and recognised by extension alone
    ("a.ogg", True, False, "audio"),
    ("a.opus", False, False, "audio"),
    ("a.mp3", False, False, "audio"),
    ("a.wav", False, False, "audio"),
    # Unrecognised extensions fall back to document
    ("a.pdf", False, False, "document"),
    ("a.zip", False, False, "document"),
    # force_document overrides everything
    ("a.png", False, True, "document"),
    ("a.mp4", False, True, "document"),
    # is_voice wins over a video extension
    ("a.mp4", True, False, "audio"),
]


@pytest.mark.parametrize(
    "path,is_voice,force_document,expected",
    _BRIDGE_MEDIA_TYPE_CASES,
)
def test_bridge_media_type(path, is_voice, force_document, expected):
    """Each (path, flags) combination maps to the expected bridge mediaType."""
    actual = _bridge_media_type(path, is_voice, force_document)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _standalone_send — bridge HTTP mocked
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _resp(status, json_data=None, text_data=None):
|
||||
r = AsyncMock()
|
||||
r.status = status
|
||||
r.json = AsyncMock(return_value=json_data or {})
|
||||
r.text = AsyncMock(return_value=text_data or "")
|
||||
return r
|
||||
|
||||
|
||||
def _session_with(responses):
|
||||
"""Build a mocked aiohttp.ClientSession that returns *responses* in order
|
||||
and records every POST (url, json_payload)."""
|
||||
calls = []
|
||||
idx = [0]
|
||||
|
||||
def _post(url, **kwargs):
|
||||
calls.append((url, kwargs.get("json")))
|
||||
r = responses[idx[0]] if idx[0] < len(responses) else responses[-1]
|
||||
idx[0] += 1
|
||||
ctx = MagicMock()
|
||||
ctx.__aenter__ = AsyncMock(return_value=r)
|
||||
ctx.__aexit__ = AsyncMock(return_value=False)
|
||||
return ctx
|
||||
|
||||
session = MagicMock()
|
||||
session.post = MagicMock(side_effect=_post)
|
||||
session_ctx = MagicMock()
|
||||
session_ctx.__aenter__ = AsyncMock(return_value=session)
|
||||
session_ctx.__aexit__ = AsyncMock(return_value=False)
|
||||
return session_ctx, calls
|
||||
|
||||
|
||||
def _pconfig():
|
||||
return SimpleNamespace(token="", extra={"bridge_port": 3000})
|
||||
|
||||
|
||||
def _tmpfile(suffix):
|
||||
f = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
|
||||
f.write(b"x")
|
||||
f.close()
|
||||
return f.name
|
||||
|
||||
|
||||
def test_text_plus_mixed_media_routes_native_types():
    """Text is POSTed to /send first, then each file goes to /send-media
    with the mediaType matching its kind (image, video, voice -> audio)."""
    image_path = _tmpfile(".png")
    video_path = _tmpfile(".mp4")
    voice_path = _tmpfile(".ogg")
    try:
        scripted = [
            _resp(200, {"messageId": message_id})
            for message_id in ("t1", "m1", "m2", "m3")
        ]
        session_ctx, calls = _session_with(scripted)
        attachments = [(image_path, False), (video_path, False), (voice_path, True)]
        with patch("aiohttp.ClientSession", return_value=session_ctx):
            outcome = asyncio.run(
                _standalone_send(_pconfig(), "12345", "hello", media_files=attachments)
            )
        assert outcome["success"] is True

        # The text goes out first, followed by the three uploads in order.
        first_url, first_payload = calls[0]
        assert first_url.endswith("/send")
        assert first_payload["message"] == "hello"
        uploaded_kinds = [
            payload["mediaType"]
            for url, payload in calls
            if url.endswith("/send-media")
        ]
        assert uploaded_kinds == ["image", "video", "audio"]

        # The chat id was normalized to a WhatsApp JID.
        assert "@" in first_payload["chatId"]
    finally:
        for leftover in (image_path, video_path, voice_path):
            os.unlink(leftover)
|
||||
|
||||
|
||||
def test_media_only_skips_text_send():
    """With an empty message, only the media upload is POSTed — no /send call."""
    img = _tmpfile(".jpg")
    try:
        session_ctx, calls = _session_with([_resp(200, {"messageId": "m1"})])
        with patch("aiohttp.ClientSession", return_value=session_ctx):
            res = asyncio.run(
                _standalone_send(_pconfig(), "12345", "", media_files=[(img, False)])
            )
        assert res["success"] is True
        # Check the call count explicitly: all() over an empty list is True,
        # so an all()-only check would pass even if nothing was uploaded.
        assert len(calls) == 1
        assert calls[0][0].endswith("/send-media")
    finally:
        os.unlink(img)
|
||||
|
||||
|
||||
def test_force_document_sends_image_as_document():
    """force_document=True uploads a PNG as a document and includes its fileName."""
    image_path = _tmpfile(".png")
    try:
        scripted = [_resp(200, {"messageId": "t1"}), _resp(200, {"messageId": "m1"})]
        session_ctx, calls = _session_with(scripted)
        with patch("aiohttp.ClientSession", return_value=session_ctx):
            outcome = asyncio.run(
                _standalone_send(
                    _pconfig(),
                    "12345",
                    "doc",
                    media_files=[(image_path, False)],
                    force_document=True,
                )
            )
        assert outcome["success"] is True

        upload_payloads = [
            payload for url, payload in calls if url.endswith("/send-media")
        ]
        first_upload = upload_payloads[0]
        assert first_upload["mediaType"] == "document"
        assert first_upload["fileName"] == os.path.basename(image_path)
    finally:
        os.unlink(image_path)
|
||||
|
||||
|
||||
def test_missing_media_file_errors():
    """A media path that does not exist produces a 'not found' error result."""
    session_ctx, _unused_calls = _session_with([_resp(200, {"messageId": "t1"})])
    missing_attachment = [("/no/such/file.png", False)]
    with patch("aiohttp.ClientSession", return_value=session_ctx):
        outcome = asyncio.run(
            _standalone_send(_pconfig(), "12345", "hi", media_files=missing_attachment)
        )
    assert "error" in outcome
    assert "not found" in outcome["error"]
|
||||
|
||||
|
||||
def test_media_upload_error_propagates():
    """A non-200 reply to the media upload is returned as an error that
    includes the HTTP status code."""
    image_path = _tmpfile(".png")
    try:
        text_ok = _resp(200, {"messageId": "t1"})
        upload_failure = _resp(500, text_data="boom")
        session_ctx, _unused_calls = _session_with([text_ok, upload_failure])
        with patch("aiohttp.ClientSession", return_value=session_ctx):
            outcome = asyncio.run(
                _standalone_send(
                    _pconfig(), "12345", "hi", media_files=[(image_path, False)]
                )
            )
        assert "error" in outcome
        assert "500" in outcome["error"]
    finally:
        os.unlink(image_path)
|
||||
|
||||
|
||||
def test_text_only_unchanged_behavior():
    """Without media, a single /send POST is made and the result dict has
    exactly the success, platform, chat_id and message_id keys."""
    session_ctx, calls = _session_with([_resp(200, {"messageId": "t1"})])
    with patch("aiohttp.ClientSession", return_value=session_ctx):
        outcome = asyncio.run(_standalone_send(_pconfig(), "12345", "just text"))

    expected = {
        "success": True,
        "platform": "whatsapp",
        # Whatever chat id was actually POSTed must be echoed back.
        "chat_id": calls[0][1]["chatId"],
        "message_id": "t1",
    }
    assert outcome == expected
    assert len(calls) == 1 and calls[0][0].endswith("/send")
|
||||
|
|
@ -903,11 +903,38 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
|
|||
last_result = result
|
||||
return last_result
|
||||
|
||||
# --- WhatsApp: native media attachment support via the registry's
|
||||
# standalone_sender_fn (plugins/platforms/whatsapp/adapter.py::_standalone_send).
|
||||
# The plugin uploads each file through the local Baileys bridge /send-media
|
||||
# endpoint so images/videos/audio arrive as native bubbles, not documents. #41112
|
||||
if platform == Platform.WHATSAPP and media_files:
|
||||
from gateway.platform_registry import platform_registry as _pr_wa
|
||||
from hermes_cli.plugins import discover_plugins as _dp_wa
|
||||
_dp_wa()
|
||||
_wa_entry = _pr_wa.get("whatsapp")
|
||||
if _wa_entry is None or _wa_entry.standalone_sender_fn is None:
|
||||
return {"error": "WhatsApp plugin not registered or missing standalone_sender_fn"}
|
||||
last_result = None
|
||||
for i, chunk in enumerate(chunks):
|
||||
is_last = (i == len(chunks) - 1)
|
||||
result = await _wa_entry.standalone_sender_fn(
|
||||
pconfig,
|
||||
chat_id,
|
||||
chunk,
|
||||
media_files=media_files if is_last else None,
|
||||
thread_id=thread_id,
|
||||
force_document=force_document,
|
||||
)
|
||||
if isinstance(result, dict) and result.get("error"):
|
||||
return result
|
||||
last_result = result
|
||||
return last_result
|
||||
|
||||
# --- Non-media platforms ---
|
||||
if media_files and not message.strip():
|
||||
return {
|
||||
"error": (
|
||||
f"send_message MEDIA delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao and feishu; "
|
||||
f"send_message MEDIA delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao, feishu and whatsapp; "
|
||||
f"target {platform.value} had only media attachments"
|
||||
)
|
||||
}
|
||||
|
|
@ -915,7 +942,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
|
|||
if media_files:
|
||||
warning = (
|
||||
f"MEDIA attachments were omitted for {platform.value}; "
|
||||
"native send_message media delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao and feishu"
|
||||
"native send_message media delivery is currently only supported for telegram, discord, matrix, weixin, signal, yuanbao, feishu and whatsapp"
|
||||
)
|
||||
|
||||
last_result = None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue