feat(photon): Add voice message support to Photon adapter

Extend the sidecar and Python adapter to handle `voice` content
alongside `attachment`. Voice notes are inlined as base64 (subject to
the same size cap as attachments) and surfaced as `MessageType.VOICE`.
When the bytes are unavailable, the fallback text marker includes the
`duration` if the sidecar reported one.
This commit is contained in:
underthestars-zhy 2026-06-08 21:29:20 -07:00 committed by Teknium
parent 9fb83eaa2f
commit dbf2470d46
4 changed files with 172 additions and 71 deletions

View file

@ -119,14 +119,14 @@ All env vars are documented in `plugin.yaml`. The most important:
## Attachments & limitations
- **Inbound attachments are downloaded.** The sidecar reads the bytes
(`content.read()`) and base64-inlines them on the NDJSON event; the adapter
caches them to the shared media cache and populates `media_urls` /
`media_types`, so the agent sees the real image/file (vision included) —
parity with the BlueBubbles iMessage channel. Attachments larger than
- **Inbound attachments and voice notes are downloaded.** The sidecar reads
the bytes (`content.read()`) and base64-inlines them on the NDJSON event; the
adapter caches them to the shared media cache and populates `media_urls` /
`media_types`, so the agent sees the real image/file or can transcribe the
voice note — parity with the BlueBubbles iMessage channel. Media larger than
`PHOTON_MAX_INLINE_ATTACHMENT_BYTES` (default 20 MB), or any byte read that
fails, fall back to a text marker (`[Photon attachment received: …]`) so the
agent still knows something arrived.
fails, falls back to a text marker (`[Photon attachment received: …]` or
`[Photon voice received: …]`) so the agent still knows something arrived.
- **Outbound attachments are supported.** Images, voice notes, video, and
documents are sent via `space.send(attachment(...))` /
`space.send(voice(...))` through the sidecar's `/send-attachment`

View file

@ -435,13 +435,15 @@ class PhotonAdapter(BasePlatformAdapter):
"space": {"id": "...", "type": "dm"|"group", "phone": "+E164"},
"sender": {"id": "+E164"},
"content": {"type": "text", "text": "..."}
| {"type": "attachment", "id", "name", "mimeType",
"size", "data"?, "encoding"?},
| {"type": "attachment"|"voice", "id", "name",
"mimeType", "size", "duration"?, "data"?,
"encoding"?},
"timestamp": "2026-05-14T19:06:32.000Z"
Attachment content carries the bytes inline as base64 ``data`` (with
``encoding == "base64"``) when the sidecar could read them within its
size cap; otherwise only metadata is present and we surface a marker.
Attachment and voice content carry the bytes inline as base64 ``data``
(with ``encoding == "base64"``) when the sidecar could read them
within its size cap; otherwise only metadata is present and we surface
a marker.
}
"""
space = event.get("space") or {}
@ -476,23 +478,38 @@ class PhotonAdapter(BasePlatformAdapter):
if ctype == "text":
text = content.get("text") or ""
mtype = MessageType.TEXT
elif ctype == "attachment":
name = content.get("name") or "(unnamed)"
elif ctype in {"attachment", "voice"}:
is_voice = ctype == "voice"
name = content.get("name") or ("voice" if is_voice else "(unnamed)")
mime = content.get("mimeType") or ""
mtype = _attachment_message_type(mime)
cached = _cache_inbound_attachment(content, name, mime)
mtype = MessageType.VOICE if is_voice else _attachment_message_type(mime)
cached = _cache_inbound_attachment(
content, name, mime, force_audio=is_voice
)
if cached:
media_urls.append(cached)
media_types.append(mime or "application/octet-stream")
media_types.append(
mime or ("audio/mp4" if is_voice else "application/octet-stream")
)
# The real bytes are attached, so the agent sees the media
# itself — a short marker is enough text, and it keeps group
# mention-gating consistent with plain messages.
text = "(attachment)"
text = "(voice)" if is_voice else "(attachment)"
else:
# No bytes (over the sidecar cap, a failed read, or a caching
# failure) — fall back to a metadata marker so the agent still
# knows something arrived.
text = f"[Photon attachment received: {name} ({mime})]"
label = "voice" if is_voice else "attachment"
duration = content.get("duration")
duration_text = (
f", duration: {duration}s"
if isinstance(duration, (int, float))
else ""
)
text = (
f"[Photon {label} received: {name} "
f"({mime or 'unknown MIME'}{duration_text})]"
)
else:
text = f"[Photon content type not handled: {ctype}]"
mtype = MessageType.TEXT
@ -950,7 +967,11 @@ _AUDIO_EXT_BY_MIME = {
def _cache_inbound_attachment(
content: Dict[str, Any], name: str, mime: str
content: Dict[str, Any],
name: str,
mime: str,
*,
force_audio: bool = False,
) -> Optional[str]:
"""Decode a base64-inlined inbound attachment and cache it locally.
@ -988,8 +1009,10 @@ def _cache_inbound_attachment(
# Bytes don't look like a supported image (e.g. HEIC magic) —
# still deliver them as a document rather than dropping them.
return cache_document_from_bytes(raw, name)
if mime.startswith("audio/"):
ext = suffix or _AUDIO_EXT_BY_MIME.get(mime, ".mp3")
if force_audio or mime.startswith("audio/"):
ext = suffix or _AUDIO_EXT_BY_MIME.get(
mime, ".m4a" if force_audio else ".mp3"
)
return cache_audio_from_bytes(raw, ext)
# Video, application/*, and everything else → document cache.
return cache_document_from_bytes(raw, name)

View file

@ -48,11 +48,11 @@ const port = parseInt(process.env.PHOTON_SIDECAR_PORT || "8789", 10);
const bind = process.env.PHOTON_SIDECAR_BIND || "127.0.0.1";
const sharedToken = process.env.PHOTON_SIDECAR_TOKEN;
// Inbound attachments are read into memory and base64-inlined on the NDJSON
// Inbound binary content is read into memory and base64-inlined on the NDJSON
// event so the Python adapter can cache the real bytes (and the agent can see
// the image). Cap the size we inline — above it we forward metadata only and
// the adapter surfaces a text marker, so one large video can't balloon a
// single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES.
// images / transcribe voice). Cap the size we inline — above it we forward
// metadata only and the adapter surfaces a text marker, so one large clip can't
// balloon a single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES.
const MAX_INLINE_ATTACHMENT_BYTES =
Number(process.env.PHOTON_MAX_INLINE_ATTACHMENT_BYTES) || 20 * 1024 * 1024;
const DM_CHAT_GUID_RE = /^any;-;(\+\d{6,})$/;
@ -164,6 +164,57 @@ async function deliver(line) {
}
}
/**
 * Normalize an inbound `attachment` or `voice` content object into the plain
 * JSON shape forwarded on the NDJSON event.
 *
 * The returned object always carries the metadata (`type`, `id`, `name`,
 * `mimeType`, `size`), plus `duration` for voice content that reports a
 * numeric one. When the bytes can be read and fit within
 * MAX_INLINE_ATTACHMENT_BYTES they are added as base64 `data` together with
 * `encoding: "base64"`; otherwise only the metadata is returned and the
 * Python adapter falls back to a text marker.
 *
 * A failed read is logged and swallowed so it can never break the inbound
 * loop.
 *
 * @param {object} content Content object whose `type` is "attachment" or
 *   "voice"; may expose an async `read()` returning the raw bytes.
 * @returns {Promise<object>} Metadata, optionally with `data` / `encoding`.
 */
async function normalizeBinaryContent(content) {
  const meta = {
    type: content.type,
    id: content.id ?? null,
    name: content.name ?? null,
    mimeType: content.mimeType ?? null,
    size: typeof content.size === "number" ? content.size : null,
  };
  if (content.type === "voice" && typeof content.duration === "number") {
    meta.duration = content.duration;
  }

  // Read the bytes eagerly and base64-inline them as `data` so the Python
  // adapter can cache the real file (the agent then sees images and can run
  // STT on voice notes). Spectrum content objects may not outlive this stream
  // iteration, so a lazy/on-demand fetch isn't safe. Over-cap content (when
  // size is known up front) is forwarded as metadata only and the adapter falls
  // back to a text marker. A read failure must never break the inbound loop.
  const label = `${content.type} ${meta.name ?? meta.id ?? "(unnamed)"}`;
  if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) {
    console.error(
      `photon-sidecar: ${label} (${meta.size} bytes) ` +
        `exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only`
    );
    return meta;
  }

  if (typeof content.read !== "function") {
    return meta;
  }

  try {
    const raw = await content.read();
    // Normalize to a Buffer *before* the cap check: an ArrayBuffer has no
    // `.length`, so checking the raw value would silently skip the cap. The
    // Buffer / typed-array / ArrayBuffer paths below wrap the existing memory
    // instead of copying it, so an over-cap payload is not duplicated.
    let bytes;
    if (Buffer.isBuffer(raw)) {
      bytes = raw;
    } else if (ArrayBuffer.isView(raw)) {
      bytes = Buffer.from(raw.buffer, raw.byteOffset, raw.byteLength);
    } else {
      // ArrayBuffer, array of bytes, or string; anything else (including
      // null/undefined) throws and is handled as a read failure below.
      bytes = Buffer.from(raw);
    }
    // Guard the case where size was unknown but the bytes turn out to be
    // over the cap.
    if (bytes.length > MAX_INLINE_ATTACHMENT_BYTES) {
      console.error(
        `photon-sidecar: ${label} (${bytes.length} bytes) ` +
          `exceeds inline cap after read; forwarding metadata only`
      );
      return meta;
    }
    meta.data = bytes.toString("base64");
    meta.encoding = "base64";
  } catch (e) {
    console.error(
      `photon-sidecar: failed to read ${content.type} bytes ` +
        "(forwarding metadata only): " +
        (e && e.stack ? e.stack : String(e))
    );
  }
  return meta;
}
async function normalizeContent(content) {
if (!content || typeof content !== "object") {
return { type: "unknown" };
@ -171,51 +222,8 @@ async function normalizeContent(content) {
if (content.type === "text") {
return { type: "text", text: content.text || "" };
}
if (content.type === "attachment") {
const meta = {
type: "attachment",
id: content.id ?? null,
name: content.name ?? null,
mimeType: content.mimeType ?? null,
size: typeof content.size === "number" ? content.size : null,
};
// Read the bytes eagerly and base64-inline them as `data` so the Python
// adapter can cache the real file (the agent then sees the image itself).
// The spectrum-ts attachment object may not outlive this stream
// iteration, so a lazy/on-demand fetch isn't safe. Over-cap attachments
// (when size is known up front) are forwarded as metadata only and the
// adapter falls back to a text marker. A read failure must never break
// the inbound loop — we just drop `data` and forward metadata.
if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) {
console.error(
`photon-sidecar: attachment ${meta.name ?? meta.id} (${meta.size} bytes) ` +
`exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only`
);
return meta;
}
if (typeof content.read === "function") {
try {
const buf = await content.read();
// Guard the case where size was unknown but the bytes turn out to be
// over the cap.
if (buf && buf.length > MAX_INLINE_ATTACHMENT_BYTES) {
console.error(
`photon-sidecar: attachment ${meta.name ?? meta.id} (${buf.length} bytes) ` +
`exceeds inline cap after read; forwarding metadata only`
);
return meta;
}
meta.data = Buffer.from(buf).toString("base64");
meta.encoding = "base64";
} catch (e) {
console.error(
"photon-sidecar: failed to read attachment bytes " +
"(forwarding metadata only): " +
(e && e.stack ? e.stack : String(e))
);
}
}
return meta;
if (content.type === "attachment" || content.type === "voice") {
return await normalizeBinaryContent(content);
}
return { type: content.type || "unknown" };
}

View file

@ -101,6 +101,18 @@ def _attachment_event(
}
def _voice_event(
content: Dict[str, Any], msg_id: str = "spc-msg-voice"
) -> Dict[str, Any]:
return {
"messageId": msg_id,
"space": {"id": "+15551234567", "type": "dm", "phone": "+15551234567"},
"sender": {"id": "+15551234567"},
"content": {"type": "voice", **content},
"timestamp": "2026-05-14T19:06:32.000Z",
}
@pytest.mark.asyncio
async def test_dispatch_attachment_without_bytes_surfaces_marker(
monkeypatch: pytest.MonkeyPatch,
@ -156,6 +168,64 @@ async def test_dispatch_attachment_downloads_image(
cached.unlink(missing_ok=True)
@pytest.mark.asyncio
async def test_dispatch_voice_downloads_audio(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A voice event carrying base64 bytes is cached and dispatched as VOICE."""
    adapter = _make_adapter(monkeypatch)
    captured = _capture(adapter, monkeypatch)

    payload = b"OggS" + b"\x00" * 32
    voice_content = {
        "name": "note.ogg",
        "mimeType": "audio/ogg",
        "duration": 7,
        "size": len(payload),
        "data": base64.b64encode(payload).decode("ascii"),
        "encoding": "base64",
    }
    await adapter._dispatch_inbound(_voice_event(voice_content))

    assert len(captured) == 1
    dispatched = captured[0]
    assert dispatched.message_type == MessageType.VOICE
    assert dispatched.media_types == ["audio/ogg"]
    assert len(dispatched.media_urls) == 1

    cache_path = Path(dispatched.media_urls[0])
    try:
        # The cached file must hold exactly the bytes that were inlined.
        assert cache_path.is_file()
        assert cache_path.read_bytes() == payload
        assert dispatched.text == "(voice)"
    finally:
        # Remove the cached file whether or not the assertions passed.
        cache_path.unlink(missing_ok=True)
@pytest.mark.asyncio
async def test_dispatch_voice_without_bytes_surfaces_marker(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A voice event without inline bytes yields a text marker and no media."""
    adapter = _make_adapter(monkeypatch)
    captured = _capture(adapter, monkeypatch)

    # Metadata only: no ``data`` / ``encoding`` keys are present.
    metadata_only = {
        "name": "note.m4a",
        "mimeType": "audio/mp4",
        "duration": 12,
        "size": 12345,
    }
    await adapter._dispatch_inbound(_voice_event(metadata_only))

    assert len(captured) == 1
    dispatched = captured[0]
    marker = dispatched.text
    for fragment in ("Photon voice received", "note.m4a", "duration: 12s"):
        assert fragment in marker
    assert dispatched.message_type == MessageType.VOICE
    assert dispatched.media_urls == []
    assert dispatched.media_types == []
@pytest.mark.asyncio
async def test_dispatch_attachment_downloads_document(
monkeypatch: pytest.MonkeyPatch,