mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
feat(photon): Add voice message support to Photon adapter
Extend the sidecar and Python adapter to handle `voice` content alongside `attachment`. Voice notes are inlined as base64 (same size-cap logic), surfaced as `MessageType.VOICE`, and include an optional `duration` field in fallback markers when bytes are unavailable.
This commit is contained in:
parent
9fb83eaa2f
commit
dbf2470d46
4 changed files with 172 additions and 71 deletions
|
|
@ -119,14 +119,14 @@ All env vars are documented in `plugin.yaml`. The most important:
|
|||
|
||||
## Attachments & limitations
|
||||
|
||||
- **Inbound attachments are downloaded.** The sidecar reads the bytes
|
||||
(`content.read()`) and base64-inlines them on the NDJSON event; the adapter
|
||||
caches them to the shared media cache and populates `media_urls` /
|
||||
`media_types`, so the agent sees the real image/file (vision included) —
|
||||
parity with the BlueBubbles iMessage channel. Attachments larger than
|
||||
- **Inbound attachments and voice notes are downloaded.** The sidecar reads
|
||||
the bytes (`content.read()`) and base64-inlines them on the NDJSON event; the
|
||||
adapter caches them to the shared media cache and populates `media_urls` /
|
||||
`media_types`, so the agent sees the real image/file or can transcribe the
|
||||
voice note — parity with the BlueBubbles iMessage channel. Media larger than
|
||||
`PHOTON_MAX_INLINE_ATTACHMENT_BYTES` (default 20 MB), or any byte read that
|
||||
fails, fall back to a text marker (`[Photon attachment received: …]`) so the
|
||||
agent still knows something arrived.
|
||||
fails, falls back to a text marker (`[Photon attachment received: …]` or
|
||||
`[Photon voice received: …]`) so the agent still knows something arrived.
|
||||
- **Outbound attachments are supported.** Images, voice notes, video, and
|
||||
documents are sent via `space.send(attachment(...))` /
|
||||
`space.send(voice(...))` through the sidecar's `/send-attachment`
|
||||
|
|
|
|||
|
|
@ -435,13 +435,15 @@ class PhotonAdapter(BasePlatformAdapter):
|
|||
"space": {"id": "...", "type": "dm"|"group", "phone": "+E164"},
|
||||
"sender": {"id": "+E164"},
|
||||
"content": {"type": "text", "text": "..."}
|
||||
| {"type": "attachment", "id", "name", "mimeType",
|
||||
"size", "data"?, "encoding"?},
|
||||
| {"type": "attachment"|"voice", "id", "name",
|
||||
"mimeType", "size", "duration"?, "data"?,
|
||||
"encoding"?},
|
||||
"timestamp": "2026-05-14T19:06:32.000Z"
|
||||
|
||||
Attachment content carries the bytes inline as base64 ``data`` (with
|
||||
``encoding == "base64"``) when the sidecar could read them within its
|
||||
size cap; otherwise only metadata is present and we surface a marker.
|
||||
Attachment and voice content carry the bytes inline as base64 ``data``
|
||||
(with ``encoding == "base64"``) when the sidecar could read them
|
||||
within its size cap; otherwise only metadata is present and we surface
|
||||
a marker.
|
||||
}
|
||||
"""
|
||||
space = event.get("space") or {}
|
||||
|
|
@ -476,23 +478,38 @@ class PhotonAdapter(BasePlatformAdapter):
|
|||
if ctype == "text":
|
||||
text = content.get("text") or ""
|
||||
mtype = MessageType.TEXT
|
||||
elif ctype == "attachment":
|
||||
name = content.get("name") or "(unnamed)"
|
||||
elif ctype in {"attachment", "voice"}:
|
||||
is_voice = ctype == "voice"
|
||||
name = content.get("name") or ("voice" if is_voice else "(unnamed)")
|
||||
mime = content.get("mimeType") or ""
|
||||
mtype = _attachment_message_type(mime)
|
||||
cached = _cache_inbound_attachment(content, name, mime)
|
||||
mtype = MessageType.VOICE if is_voice else _attachment_message_type(mime)
|
||||
cached = _cache_inbound_attachment(
|
||||
content, name, mime, force_audio=is_voice
|
||||
)
|
||||
if cached:
|
||||
media_urls.append(cached)
|
||||
media_types.append(mime or "application/octet-stream")
|
||||
media_types.append(
|
||||
mime or ("audio/mp4" if is_voice else "application/octet-stream")
|
||||
)
|
||||
# The real bytes are attached, so the agent sees the media
|
||||
# itself — a short marker is enough text, and it keeps group
|
||||
# mention-gating consistent with plain messages.
|
||||
text = "(attachment)"
|
||||
text = "(voice)" if is_voice else "(attachment)"
|
||||
else:
|
||||
# No bytes (over the sidecar cap, a failed read, or a caching
|
||||
# failure) — fall back to a metadata marker so the agent still
|
||||
# knows something arrived.
|
||||
text = f"[Photon attachment received: {name} ({mime})]"
|
||||
label = "voice" if is_voice else "attachment"
|
||||
duration = content.get("duration")
|
||||
duration_text = (
|
||||
f", duration: {duration}s"
|
||||
if isinstance(duration, (int, float))
|
||||
else ""
|
||||
)
|
||||
text = (
|
||||
f"[Photon {label} received: {name} "
|
||||
f"({mime or 'unknown MIME'}{duration_text})]"
|
||||
)
|
||||
else:
|
||||
text = f"[Photon content type not handled: {ctype}]"
|
||||
mtype = MessageType.TEXT
|
||||
|
|
@ -950,7 +967,11 @@ _AUDIO_EXT_BY_MIME = {
|
|||
|
||||
|
||||
def _cache_inbound_attachment(
|
||||
content: Dict[str, Any], name: str, mime: str
|
||||
content: Dict[str, Any],
|
||||
name: str,
|
||||
mime: str,
|
||||
*,
|
||||
force_audio: bool = False,
|
||||
) -> Optional[str]:
|
||||
"""Decode a base64-inlined inbound attachment and cache it locally.
|
||||
|
||||
|
|
@ -988,8 +1009,10 @@ def _cache_inbound_attachment(
|
|||
# Bytes don't look like a supported image (e.g. HEIC magic) —
|
||||
# still deliver them as a document rather than dropping them.
|
||||
return cache_document_from_bytes(raw, name)
|
||||
if mime.startswith("audio/"):
|
||||
ext = suffix or _AUDIO_EXT_BY_MIME.get(mime, ".mp3")
|
||||
if force_audio or mime.startswith("audio/"):
|
||||
ext = suffix or _AUDIO_EXT_BY_MIME.get(
|
||||
mime, ".m4a" if force_audio else ".mp3"
|
||||
)
|
||||
return cache_audio_from_bytes(raw, ext)
|
||||
# Video, application/*, and everything else → document cache.
|
||||
return cache_document_from_bytes(raw, name)
|
||||
|
|
|
|||
|
|
@ -48,11 +48,11 @@ const port = parseInt(process.env.PHOTON_SIDECAR_PORT || "8789", 10);
|
|||
const bind = process.env.PHOTON_SIDECAR_BIND || "127.0.0.1";
|
||||
const sharedToken = process.env.PHOTON_SIDECAR_TOKEN;
|
||||
|
||||
// Inbound attachments are read into memory and base64-inlined on the NDJSON
|
||||
// Inbound binary content is read into memory and base64-inlined on the NDJSON
|
||||
// event so the Python adapter can cache the real bytes (and the agent can see
|
||||
// the image). Cap the size we inline — above it we forward metadata only and
|
||||
// the adapter surfaces a text marker, so one large video can't balloon a
|
||||
// single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES.
|
||||
// images / transcribe voice). Cap the size we inline — above it we forward
|
||||
// metadata only and the adapter surfaces a text marker, so one large clip can't
|
||||
// balloon a single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES.
|
||||
const MAX_INLINE_ATTACHMENT_BYTES =
|
||||
Number(process.env.PHOTON_MAX_INLINE_ATTACHMENT_BYTES) || 20 * 1024 * 1024;
|
||||
const DM_CHAT_GUID_RE = /^any;-;(\+\d{6,})$/;
|
||||
|
|
@ -164,6 +164,57 @@ async function deliver(line) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Normalize inbound binary content (`attachment` or `voice`) into the plain
 * object forwarded on the NDJSON event.
 *
 * Metadata (`type`, `id`, `name`, `mimeType`, `size`, plus `duration` for
 * voice content) is always returned. When the bytes can be read and fit
 * within MAX_INLINE_ATTACHMENT_BYTES they are added as base64 `data` with
 * `encoding: "base64"`; otherwise only the metadata is returned.
 *
 * @param {object} content Inbound content object; may expose an async
 *   `read()` that yields the raw bytes.
 * @returns {Promise<object>} Metadata, optionally with `data` / `encoding`.
 *   Never rejects because of a failed read.
 */
async function normalizeBinaryContent(content) {
  const meta = {
    type: content.type,
    id: content.id ?? null,
    name: content.name ?? null,
    mimeType: content.mimeType ?? null,
    size: typeof content.size === "number" ? content.size : null,
  };
  if (content.type === "voice" && typeof content.duration === "number") {
    meta.duration = content.duration;
  }

  // Read the bytes eagerly and base64-inline them as `data` so the Python
  // adapter can cache the real file (the agent then sees images and can run
  // STT on voice notes). Spectrum content objects may not outlive this stream
  // iteration, so a lazy/on-demand fetch isn't safe. Over-cap content (when
  // size is known up front) is forwarded as metadata only and the adapter falls
  // back to a text marker. A read failure must never break the inbound loop.
  const label = `${content.type} ${meta.name ?? meta.id ?? "(unnamed)"}`;
  if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) {
    console.error(
      `photon-sidecar: ${label} (${meta.size} bytes) ` +
        `exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only`
    );
    return meta;
  }
  if (typeof content.read !== "function") {
    return meta;
  }

  try {
    const raw = await content.read();
    if (raw === null || raw === undefined) {
      // Nothing to inline; say so explicitly instead of relying on
      // Buffer.from() throwing and being reported as a read failure.
      console.error(
        `photon-sidecar: ${label} read returned no bytes; forwarding metadata only`
      );
      return meta;
    }
    // Normalize to a Buffer BEFORE measuring: an ArrayBuffer has `byteLength`
    // but no `length`, and a string's `length` counts UTF-16 units rather
    // than bytes, so checking the raw value could let over-cap payloads
    // through. Reuse an existing Buffer to avoid copying it.
    const bytes = Buffer.isBuffer(raw) ? raw : Buffer.from(raw);
    // Guard the case where size was unknown (or under-reported) but the
    // bytes turn out to be over the cap.
    if (bytes.length > MAX_INLINE_ATTACHMENT_BYTES) {
      console.error(
        `photon-sidecar: ${label} (${bytes.length} bytes) ` +
          `exceeds inline cap after read; forwarding metadata only`
      );
      return meta;
    }
    meta.data = bytes.toString("base64");
    meta.encoding = "base64";
  } catch (e) {
    console.error(
      `photon-sidecar: failed to read ${content.type} bytes ` +
        "(forwarding metadata only): " +
        (e && e.stack ? e.stack : String(e))
    );
  }
  return meta;
}
|
||||
|
||||
async function normalizeContent(content) {
|
||||
if (!content || typeof content !== "object") {
|
||||
return { type: "unknown" };
|
||||
|
|
@ -171,51 +222,8 @@ async function normalizeContent(content) {
|
|||
if (content.type === "text") {
|
||||
return { type: "text", text: content.text || "" };
|
||||
}
|
||||
if (content.type === "attachment") {
|
||||
const meta = {
|
||||
type: "attachment",
|
||||
id: content.id ?? null,
|
||||
name: content.name ?? null,
|
||||
mimeType: content.mimeType ?? null,
|
||||
size: typeof content.size === "number" ? content.size : null,
|
||||
};
|
||||
// Read the bytes eagerly and base64-inline them as `data` so the Python
|
||||
// adapter can cache the real file (the agent then sees the image itself).
|
||||
// The spectrum-ts attachment object may not outlive this stream
|
||||
// iteration, so a lazy/on-demand fetch isn't safe. Over-cap attachments
|
||||
// (when size is known up front) are forwarded as metadata only and the
|
||||
// adapter falls back to a text marker. A read failure must never break
|
||||
// the inbound loop — we just drop `data` and forward metadata.
|
||||
if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) {
|
||||
console.error(
|
||||
`photon-sidecar: attachment ${meta.name ?? meta.id} (${meta.size} bytes) ` +
|
||||
`exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only`
|
||||
);
|
||||
return meta;
|
||||
}
|
||||
if (typeof content.read === "function") {
|
||||
try {
|
||||
const buf = await content.read();
|
||||
// Guard the case where size was unknown but the bytes turn out to be
|
||||
// over the cap.
|
||||
if (buf && buf.length > MAX_INLINE_ATTACHMENT_BYTES) {
|
||||
console.error(
|
||||
`photon-sidecar: attachment ${meta.name ?? meta.id} (${buf.length} bytes) ` +
|
||||
`exceeds inline cap after read; forwarding metadata only`
|
||||
);
|
||||
return meta;
|
||||
}
|
||||
meta.data = Buffer.from(buf).toString("base64");
|
||||
meta.encoding = "base64";
|
||||
} catch (e) {
|
||||
console.error(
|
||||
"photon-sidecar: failed to read attachment bytes " +
|
||||
"(forwarding metadata only): " +
|
||||
(e && e.stack ? e.stack : String(e))
|
||||
);
|
||||
}
|
||||
}
|
||||
return meta;
|
||||
if (content.type === "attachment" || content.type === "voice") {
|
||||
return await normalizeBinaryContent(content);
|
||||
}
|
||||
return { type: content.type || "unknown" };
|
||||
}
|
||||
|
|
|
|||
|
|
@ -101,6 +101,18 @@ def _attachment_event(
|
|||
}
|
||||
|
||||
|
||||
def _voice_event(
|
||||
content: Dict[str, Any], msg_id: str = "spc-msg-voice"
|
||||
) -> Dict[str, Any]:
|
||||
return {
|
||||
"messageId": msg_id,
|
||||
"space": {"id": "+15551234567", "type": "dm", "phone": "+15551234567"},
|
||||
"sender": {"id": "+15551234567"},
|
||||
"content": {"type": "voice", **content},
|
||||
"timestamp": "2026-05-14T19:06:32.000Z",
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_dispatch_attachment_without_bytes_surfaces_marker(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
|
|
@ -156,6 +168,64 @@ async def test_dispatch_attachment_downloads_image(
|
|||
cached.unlink(missing_ok=True)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dispatch_voice_downloads_audio(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Inline voice bytes are cached to disk and surfaced as a VOICE message."""
    adapter = _make_adapter(monkeypatch)
    captured = _capture(adapter, monkeypatch)

    # Minimal Ogg-looking payload: the magic bytes followed by zero padding.
    raw = b"OggS" + bytes(32)
    voice_content = {
        "name": "note.ogg",
        "mimeType": "audio/ogg",
        "duration": 7,
        "size": len(raw),
        "data": base64.b64encode(raw).decode("ascii"),
        "encoding": "base64",
    }
    await adapter._dispatch_inbound(_voice_event(voice_content))

    assert len(captured) == 1
    dispatched = captured[0]
    assert dispatched.message_type == MessageType.VOICE
    assert dispatched.media_types == ["audio/ogg"]
    assert len(dispatched.media_urls) == 1

    cached_path = Path(dispatched.media_urls[0])
    try:
        assert cached_path.is_file()
        assert cached_path.read_bytes() == raw
        assert dispatched.text == "(voice)"
    finally:
        # Always remove the cached file, even when an assertion above fails.
        cached_path.unlink(missing_ok=True)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_dispatch_voice_without_bytes_surfaces_marker(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A voice event carrying only metadata yields a text marker and no media."""
    adapter = _make_adapter(monkeypatch)
    captured = _capture(adapter, monkeypatch)

    # No ``data`` / ``encoding`` keys: the event has metadata but no bytes.
    metadata_only = {
        "name": "note.m4a",
        "mimeType": "audio/mp4",
        "duration": 12,
        "size": 12345,
    }
    await adapter._dispatch_inbound(_voice_event(metadata_only))

    assert len(captured) == 1
    dispatched = captured[0]
    for fragment in ("Photon voice received", "note.m4a", "duration: 12s"):
        assert fragment in dispatched.text
    assert dispatched.message_type == MessageType.VOICE
    assert dispatched.media_urls == []
    assert dispatched.media_types == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_dispatch_attachment_downloads_document(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue