feat(photon): Add voice message support to Photon adapter

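Extend the sidecar and Python adapter to handle inbound `voice` content alongside `attachment`. Voice notes are base64-inlined by the sidecar (subject to the same size cap as attachments) and surfaced by the adapter as `MessageType.VOICE`. When the bytes are unavailable, the adapter falls back to a `[Photon voice received: …]` text marker that includes the note's `duration` when the sidecar reported one.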
2026-06-09 08:21:50 +00:00 · 2026-06-08 21:29:20 -07:00 · 2026-06-08 21:29:20 -07:00 · dbf2470d46
commit dbf2470d46
parent 9fb83eaa2f
4 changed files with 172 additions and 71 deletions
--- a/plugins/platforms/photon/README.md
+++ b/plugins/platforms/photon/README.md
@ -119,14 +119,14 @@ All env vars are documented in `plugin.yaml`. The most important:

 ## Attachments & limitations

- **Inbound attachments are downloaded.** The sidecar reads the bytes
-  (`content.read()`) and base64-inlines them on the NDJSON event; the adapter
-  caches them to the shared media cache and populates `media_urls` /
-  `media_types`, so the agent sees the real image/file (vision included) —
-  parity with the BlueBubbles iMessage channel. Attachments larger than
+- **Inbound attachments and voice notes are downloaded.** The sidecar reads
+  the bytes (`content.read()`) and base64-inlines them on the NDJSON event; the
+  adapter caches them to the shared media cache and populates `media_urls` /
+  `media_types`, so the agent sees the real image/file or can transcribe the
+  voice note — parity with the BlueBubbles iMessage channel. Media larger than
  `PHOTON_MAX_INLINE_ATTACHMENT_BYTES` (default 20 MB), or any byte read that
-  fails, fall back to a text marker (`[Photon attachment received: …]`) so the
-  agent still knows something arrived.
+  fails, falls back to a text marker (`[Photon attachment received: …]` or
+  `[Photon voice received: …]`) so the agent still knows something arrived.
 - **Outbound attachments are supported.** Images, voice notes, video, and
  documents are sent via `space.send(attachment(...))` /
  `space.send(voice(...))` through the sidecar's `/send-attachment`
--- a/plugins/platforms/photon/adapter.py
+++ b/plugins/platforms/photon/adapter.py
@ -435,13 +435,15 @@ class PhotonAdapter(BasePlatformAdapter):
              "space": {"id": "...", "type": "dm"|"group", "phone": "+E164"},
              "sender": {"id": "+E164"},
              "content": {"type": "text", "text": "..."}
-                       | {"type": "attachment", "id", "name", "mimeType",
-                          "size", "data"?, "encoding"?},
+                       | {"type": "attachment"|"voice", "id", "name",
+                          "mimeType", "size", "duration"?, "data"?,
+                          "encoding"?},
              "timestamp": "2026-05-14T19:06:32.000Z"

-        Attachment content carries the bytes inline as base64 ``data`` (with
-        ``encoding == "base64"``) when the sidecar could read them within its
-        size cap; otherwise only metadata is present and we surface a marker.
+        Attachment and voice content carry the bytes inline as base64 ``data``
+        (with ``encoding == "base64"``) when the sidecar could read them
+        within its size cap; otherwise only metadata is present and we surface
+        a marker.
            }
        """
        space = event.get("space") or {}
@ -476,23 +478,38 @@ class PhotonAdapter(BasePlatformAdapter):
        if ctype == "text":
            text = content.get("text") or ""
            mtype = MessageType.TEXT
-        elif ctype == "attachment":
-            name = content.get("name") or "(unnamed)"
+        elif ctype in {"attachment", "voice"}:
+            is_voice = ctype == "voice"
+            name = content.get("name") or ("voice" if is_voice else "(unnamed)")
            mime = content.get("mimeType") or ""
-            mtype = _attachment_message_type(mime)
-            cached = _cache_inbound_attachment(content, name, mime)
+            mtype = MessageType.VOICE if is_voice else _attachment_message_type(mime)
+            cached = _cache_inbound_attachment(
+                content, name, mime, force_audio=is_voice
+            )
            if cached:
                media_urls.append(cached)
-                media_types.append(mime or "application/octet-stream")
+                media_types.append(
+                    mime or ("audio/mp4" if is_voice else "application/octet-stream")
+                )
                # The real bytes are attached, so the agent sees the media
                # itself — a short marker is enough text, and it keeps group
                # mention-gating consistent with plain messages.
-                text = "(attachment)"
+                text = "(voice)" if is_voice else "(attachment)"
            else:
                # No bytes (over the sidecar cap, a failed read, or a caching
                # failure) — fall back to a metadata marker so the agent still
                # knows something arrived.
-                text = f"[Photon attachment received: {name} ({mime})]"
+                label = "voice" if is_voice else "attachment"
+                duration = content.get("duration")
+                duration_text = (
+                    f", duration: {duration}s"
+                    if isinstance(duration, (int, float))
+                    else ""
+                )
+                text = (
+                    f"[Photon {label} received: {name} "
+                    f"({mime or 'unknown MIME'}{duration_text})]"
+                )
        else:
            text = f"[Photon content type not handled: {ctype}]"
            mtype = MessageType.TEXT
@ -950,7 +967,11 @@ _AUDIO_EXT_BY_MIME = {


 def _cache_inbound_attachment(
-    content: Dict[str, Any], name: str, mime: str
+    content: Dict[str, Any],
+    name: str,
+    mime: str,
+    *,
+    force_audio: bool = False,
 ) -> Optional[str]:
    """Decode a base64-inlined inbound attachment and cache it locally.

@ -988,8 +1009,10 @@ def _cache_inbound_attachment(
                # Bytes don't look like a supported image (e.g. HEIC magic) —
                # still deliver them as a document rather than dropping them.
                return cache_document_from_bytes(raw, name)
-        if mime.startswith("audio/"):
-            ext = suffix or _AUDIO_EXT_BY_MIME.get(mime, ".mp3")
+        if force_audio or mime.startswith("audio/"):
+            ext = suffix or _AUDIO_EXT_BY_MIME.get(
+                mime, ".m4a" if force_audio else ".mp3"
+            )
            return cache_audio_from_bytes(raw, ext)
        # Video, application/*, and everything else → document cache.
        return cache_document_from_bytes(raw, name)
--- a/plugins/platforms/photon/sidecar/index.mjs
+++ b/plugins/platforms/photon/sidecar/index.mjs
@ -48,11 +48,11 @@ const port = parseInt(process.env.PHOTON_SIDECAR_PORT || "8789", 10);
 const bind = process.env.PHOTON_SIDECAR_BIND || "127.0.0.1";
 const sharedToken = process.env.PHOTON_SIDECAR_TOKEN;

-// Inbound attachments are read into memory and base64-inlined on the NDJSON
+// Inbound binary content is read into memory and base64-inlined on the NDJSON
 // event so the Python adapter can cache the real bytes (and the agent can see
-// the image). Cap the size we inline — above it we forward metadata only and
-// the adapter surfaces a text marker, so one large video can't balloon a
-// single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES.
+// images / transcribe voice). Cap the size we inline — above it we forward
+// metadata only and the adapter surfaces a text marker, so one large clip can't
+// balloon a single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES.
 const MAX_INLINE_ATTACHMENT_BYTES =
  Number(process.env.PHOTON_MAX_INLINE_ATTACHMENT_BYTES) || 20 * 1024 * 1024;
 const DM_CHAT_GUID_RE = /^any;-;(\+\d{6,})$/;
@ -164,6 +164,57 @@ async function deliver(line) {
  }
 }

+async function normalizeBinaryContent(content) {
+  const meta = {
+    type: content.type,
+    id: content.id ?? null,
+    name: content.name ?? null,
+    mimeType: content.mimeType ?? null,
+    size: typeof content.size === "number" ? content.size : null,
+  };
+  if (content.type === "voice" && typeof content.duration === "number") {
+    meta.duration = content.duration;
+  }
+
+  // Read the bytes eagerly and base64-inline them as `data` so the Python
+  // adapter can cache the real file (the agent then sees images and can run
+  // STT on voice notes). Spectrum content objects may not outlive this stream
+  // iteration, so a lazy/on-demand fetch isn't safe. Over-cap content (when
+  // size is known up front) is forwarded as metadata only and the adapter falls
+  // back to a text marker. A read failure must never break the inbound loop.
+  const label = `${content.type} ${meta.name ?? meta.id ?? "(unnamed)"}`;
+  if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) {
+    console.error(
+      `photon-sidecar: ${label} (${meta.size} bytes) ` +
+        `exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only`
+    );
+    return meta;
+  }
+  if (typeof content.read === "function") {
+    try {
+      const buf = await content.read();
+      // Guard the case where size was unknown but the bytes turn out to be
+      // over the cap.
+      if (buf && buf.length > MAX_INLINE_ATTACHMENT_BYTES) {
+        console.error(
+          `photon-sidecar: ${label} (${buf.length} bytes) ` +
+            `exceeds inline cap after read; forwarding metadata only`
+        );
+        return meta;
+      }
+      meta.data = Buffer.from(buf).toString("base64");
+      meta.encoding = "base64";
+    } catch (e) {
+      console.error(
+        `photon-sidecar: failed to read ${content.type} bytes ` +
+          "(forwarding metadata only): " +
+          (e && e.stack ? e.stack : String(e))
+      );
+    }
+  }
+  return meta;
+}
+
 async function normalizeContent(content) {
  if (!content || typeof content !== "object") {
    return { type: "unknown" };
@ -171,51 +222,8 @@ async function normalizeContent(content) {
  if (content.type === "text") {
    return { type: "text", text: content.text || "" };
  }
-  if (content.type === "attachment") {
-    const meta = {
-      type: "attachment",
-      id: content.id ?? null,
-      name: content.name ?? null,
-      mimeType: content.mimeType ?? null,
-      size: typeof content.size === "number" ? content.size : null,
-    };
-    // Read the bytes eagerly and base64-inline them as `data` so the Python
-    // adapter can cache the real file (the agent then sees the image itself).
-    // The spectrum-ts attachment object may not outlive this stream
-    // iteration, so a lazy/on-demand fetch isn't safe. Over-cap attachments
-    // (when size is known up front) are forwarded as metadata only and the
-    // adapter falls back to a text marker. A read failure must never break
-    // the inbound loop — we just drop `data` and forward metadata.
-    if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) {
-      console.error(
-        `photon-sidecar: attachment ${meta.name ?? meta.id} (${meta.size} bytes) ` +
-          `exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only`
-      );
-      return meta;
-    }
-    if (typeof content.read === "function") {
-      try {
-        const buf = await content.read();
-        // Guard the case where size was unknown but the bytes turn out to be
-        // over the cap.
-        if (buf && buf.length > MAX_INLINE_ATTACHMENT_BYTES) {
-          console.error(
-            `photon-sidecar: attachment ${meta.name ?? meta.id} (${buf.length} bytes) ` +
-              `exceeds inline cap after read; forwarding metadata only`
-          );
-          return meta;
-        }
-        meta.data = Buffer.from(buf).toString("base64");
-        meta.encoding = "base64";
-      } catch (e) {
-        console.error(
-          "photon-sidecar: failed to read attachment bytes " +
-            "(forwarding metadata only): " +
-            (e && e.stack ? e.stack : String(e))
-        );
-      }
-    }
-    return meta;
+  if (content.type === "attachment" || content.type === "voice") {
+    return await normalizeBinaryContent(content);
  }
  return { type: content.type || "unknown" };
 }
--- a/tests/plugins/platforms/photon/test_inbound.py
+++ b/tests/plugins/platforms/photon/test_inbound.py
@ -101,6 +101,18 @@ def _attachment_event(
    }


+def _voice_event(
+    content: Dict[str, Any], msg_id: str = "spc-msg-voice"
+) -> Dict[str, Any]:
+    return {
+        "messageId": msg_id,
+        "space": {"id": "+15551234567", "type": "dm", "phone": "+15551234567"},
+        "sender": {"id": "+15551234567"},
+        "content": {"type": "voice", **content},
+        "timestamp": "2026-05-14T19:06:32.000Z",
+    }
+
+
@pytest.mark.asyncio
 async def test_dispatch_attachment_without_bytes_surfaces_marker(
    monkeypatch: pytest.MonkeyPatch,
@ -156,6 +168,64 @@ async def test_dispatch_attachment_downloads_image(
        cached.unlink(missing_ok=True)


+@pytest.mark.asyncio
+async def test_dispatch_voice_downloads_audio(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Inbound Spectrum voice content is cached and routed to auto-STT."""
+    adapter = _make_adapter(monkeypatch)
+    captured = _capture(adapter, monkeypatch)
+
+    raw = b"OggS" + b"\x00" * 32
+    event = _voice_event(
+        {
+            "name": "note.ogg",
+            "mimeType": "audio/ogg",
+            "duration": 7,
+            "size": len(raw),
+            "data": base64.b64encode(raw).decode("ascii"),
+            "encoding": "base64",
+        }
+    )
+    await adapter._dispatch_inbound(event)
+
+    assert len(captured) == 1
+    ev = captured[0]
+    assert ev.message_type == MessageType.VOICE
+    assert ev.media_types == ["audio/ogg"]
+    assert len(ev.media_urls) == 1
+    cached = Path(ev.media_urls[0])
+    try:
+        assert cached.is_file()
+        assert cached.read_bytes() == raw
+        assert ev.text == "(voice)"
+    finally:
+        cached.unlink(missing_ok=True)
+
+
+@pytest.mark.asyncio
+async def test_dispatch_voice_without_bytes_surfaces_marker(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Metadata-only voice still tells the agent a voice note arrived."""
+    adapter = _make_adapter(monkeypatch)
+    captured = _capture(adapter, monkeypatch)
+
+    event = _voice_event(
+        {"name": "note.m4a", "mimeType": "audio/mp4", "duration": 12, "size": 12345}
+    )
+    await adapter._dispatch_inbound(event)
+
+    assert len(captured) == 1
+    ev = captured[0]
+    assert "Photon voice received" in ev.text
+    assert "note.m4a" in ev.text
+    assert "duration: 12s" in ev.text
+    assert ev.message_type == MessageType.VOICE
+    assert ev.media_urls == []
+    assert ev.media_types == []
+
+
@pytest.mark.asyncio
 async def test_dispatch_attachment_downloads_document(
    monkeypatch: pytest.MonkeyPatch,