diff --git a/plugins/platforms/photon/README.md b/plugins/platforms/photon/README.md index eb307fa94c3..a2cd92ec4a7 100644 --- a/plugins/platforms/photon/README.md +++ b/plugins/platforms/photon/README.md @@ -119,14 +119,14 @@ All env vars are documented in `plugin.yaml`. The most important: ## Attachments & limitations -- **Inbound attachments are downloaded.** The sidecar reads the bytes - (`content.read()`) and base64-inlines them on the NDJSON event; the adapter - caches them to the shared media cache and populates `media_urls` / - `media_types`, so the agent sees the real image/file (vision included) — - parity with the BlueBubbles iMessage channel. Attachments larger than +- **Inbound attachments and voice notes are downloaded.** The sidecar reads + the bytes (`content.read()`) and base64-inlines them on the NDJSON event; the + adapter caches them to the shared media cache and populates `media_urls` / + `media_types`, so the agent sees the real image/file or can transcribe the + voice note — parity with the BlueBubbles iMessage channel. Media larger than `PHOTON_MAX_INLINE_ATTACHMENT_BYTES` (default 20 MB), or any byte read that - fails, fall back to a text marker (`[Photon attachment received: …]`) so the - agent still knows something arrived. + fails, falls back to a text marker (`[Photon attachment received: …]` or + `[Photon voice received: …]`) so the agent still knows something arrived. - **Outbound attachments are supported.** Images, voice notes, video, and documents are sent via `space.send(attachment(...))` / `space.send(voice(...))` through the sidecar's `/send-attachment` diff --git a/plugins/platforms/photon/adapter.py b/plugins/platforms/photon/adapter.py index e6323d1df8a..92c900dbb2f 100644 --- a/plugins/platforms/photon/adapter.py +++ b/plugins/platforms/photon/adapter.py @@ -435,13 +435,15 @@ class PhotonAdapter(BasePlatformAdapter): "space": {"id": "...", "type": "dm"|"group", "phone": "+E164"}, "sender": {"id": "+E164"}, "content": {"type": "text", "text": "..."} - | {"type": "attachment", "id", "name", "mimeType", - "size", "data"?, "encoding"?}, + | {"type": "attachment"|"voice", "id", "name", + "mimeType", "size", "duration"?, "data"?, + "encoding"?}, "timestamp": "2026-05-14T19:06:32.000Z" - Attachment content carries the bytes inline as base64 ``data`` (with - ``encoding == "base64"``) when the sidecar could read them within its - size cap; otherwise only metadata is present and we surface a marker. + Attachment and voice content carry the bytes inline as base64 ``data`` + (with ``encoding == "base64"``) when the sidecar could read them + within its size cap; otherwise only metadata is present and we surface + a marker. } """ space = event.get("space") or {} @@ -476,23 +478,38 @@ class PhotonAdapter(BasePlatformAdapter): if ctype == "text": text = content.get("text") or "" mtype = MessageType.TEXT - elif ctype == "attachment": - name = content.get("name") or "(unnamed)" + elif ctype in {"attachment", "voice"}: + is_voice = ctype == "voice" + name = content.get("name") or ("voice" if is_voice else "(unnamed)") mime = content.get("mimeType") or "" - mtype = _attachment_message_type(mime) - cached = _cache_inbound_attachment(content, name, mime) + mtype = MessageType.VOICE if is_voice else _attachment_message_type(mime) + cached = _cache_inbound_attachment( + content, name, mime, force_audio=is_voice + ) if cached: media_urls.append(cached) - media_types.append(mime or "application/octet-stream") + media_types.append( + mime or ("audio/mp4" if is_voice else "application/octet-stream") + ) # The real bytes are attached, so the agent sees the media # itself — a short marker is enough text, and it keeps group # mention-gating consistent with plain messages. - text = "(attachment)" + text = "(voice)" if is_voice else "(attachment)" else: # No bytes (over the sidecar cap, a failed read, or a caching # failure) — fall back to a metadata marker so the agent still # knows something arrived. - text = f"[Photon attachment received: {name} ({mime})]" + label = "voice" if is_voice else "attachment" + duration = content.get("duration") + duration_text = ( + f", duration: {duration}s" + if isinstance(duration, (int, float)) + else "" + ) + text = ( + f"[Photon {label} received: {name} " + f"({mime or 'unknown MIME'}{duration_text})]" + ) else: text = f"[Photon content type not handled: {ctype}]" mtype = MessageType.TEXT @@ -950,7 +967,11 @@ _AUDIO_EXT_BY_MIME = { def _cache_inbound_attachment( - content: Dict[str, Any], name: str, mime: str + content: Dict[str, Any], + name: str, + mime: str, + *, + force_audio: bool = False, ) -> Optional[str]: """Decode a base64-inlined inbound attachment and cache it locally. @@ -988,8 +1009,10 @@ def _cache_inbound_attachment( # Bytes don't look like a supported image (e.g. HEIC magic) — # still deliver them as a document rather than dropping them. return cache_document_from_bytes(raw, name) - if mime.startswith("audio/"): - ext = suffix or _AUDIO_EXT_BY_MIME.get(mime, ".mp3") + if force_audio or mime.startswith("audio/"): + ext = suffix or _AUDIO_EXT_BY_MIME.get( + mime, ".m4a" if force_audio else ".mp3" + ) return cache_audio_from_bytes(raw, ext) # Video, application/*, and everything else → document cache. return cache_document_from_bytes(raw, name) diff --git a/plugins/platforms/photon/sidecar/index.mjs b/plugins/platforms/photon/sidecar/index.mjs index a84f1390afa..1eceb538a17 100644 --- a/plugins/platforms/photon/sidecar/index.mjs +++ b/plugins/platforms/photon/sidecar/index.mjs @@ -48,11 +48,11 @@ const port = parseInt(process.env.PHOTON_SIDECAR_PORT || "8789", 10); const bind = process.env.PHOTON_SIDECAR_BIND || "127.0.0.1"; const sharedToken = process.env.PHOTON_SIDECAR_TOKEN; -// Inbound attachments are read into memory and base64-inlined on the NDJSON +// Inbound binary content is read into memory and base64-inlined on the NDJSON // event so the Python adapter can cache the real bytes (and the agent can see -// the image). Cap the size we inline — above it we forward metadata only and -// the adapter surfaces a text marker, so one large video can't balloon a -// single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES. +// images / transcribe voice). Cap the size we inline — above it we forward +// metadata only and the adapter surfaces a text marker, so one large clip can't +// balloon a single NDJSON line. Override via PHOTON_MAX_INLINE_ATTACHMENT_BYTES. const MAX_INLINE_ATTACHMENT_BYTES = Number(process.env.PHOTON_MAX_INLINE_ATTACHMENT_BYTES) || 20 * 1024 * 1024; const DM_CHAT_GUID_RE = /^any;-;(\+\d{6,})$/; @@ -164,6 +164,57 @@ async function deliver(line) { } } +async function normalizeBinaryContent(content) { + const meta = { + type: content.type, + id: content.id ?? null, + name: content.name ?? null, + mimeType: content.mimeType ?? null, + size: typeof content.size === "number" ? content.size : null, + }; + if (content.type === "voice" && typeof content.duration === "number") { + meta.duration = content.duration; + } + + // Read the bytes eagerly and base64-inline them as `data` so the Python + // adapter can cache the real file (the agent then sees images and can run + // STT on voice notes). Spectrum content objects may not outlive this stream + // iteration, so a lazy/on-demand fetch isn't safe. Over-cap content (when + // size is known up front) is forwarded as metadata only and the adapter falls + // back to a text marker. A read failure must never break the inbound loop. + const label = `${content.type} ${meta.name ?? meta.id ?? "(unnamed)"}`; + if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) { + console.error( + `photon-sidecar: ${label} (${meta.size} bytes) ` + + `exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only` + ); + return meta; + } + if (typeof content.read === "function") { + try { + const buf = await content.read(); + // Guard the case where size was unknown but the bytes turn out to be + // over the cap. + if (buf && buf.length > MAX_INLINE_ATTACHMENT_BYTES) { + console.error( + `photon-sidecar: ${label} (${buf.length} bytes) ` + + `exceeds inline cap after read; forwarding metadata only` + ); + return meta; + } + meta.data = Buffer.from(buf).toString("base64"); + meta.encoding = "base64"; + } catch (e) { + console.error( + `photon-sidecar: failed to read ${content.type} bytes ` + + "(forwarding metadata only): " + + (e && e.stack ? e.stack : String(e)) + ); + } + } + return meta; +} + async function normalizeContent(content) { if (!content || typeof content !== "object") { return { type: "unknown" }; @@ -171,51 +222,8 @@ async function normalizeContent(content) { if (content.type === "text") { return { type: "text", text: content.text || "" }; } - if (content.type === "attachment") { - const meta = { - type: "attachment", - id: content.id ?? null, - name: content.name ?? null, - mimeType: content.mimeType ?? null, - size: typeof content.size === "number" ? content.size : null, - }; - // Read the bytes eagerly and base64-inline them as `data` so the Python - // adapter can cache the real file (the agent then sees the image itself). - // The spectrum-ts attachment object may not outlive this stream - // iteration, so a lazy/on-demand fetch isn't safe. Over-cap attachments - // (when size is known up front) are forwarded as metadata only and the - // adapter falls back to a text marker. A read failure must never break - // the inbound loop — we just drop `data` and forward metadata. - if (meta.size !== null && meta.size > MAX_INLINE_ATTACHMENT_BYTES) { - console.error( - `photon-sidecar: attachment ${meta.name ?? meta.id} (${meta.size} bytes) ` + - `exceeds inline cap ${MAX_INLINE_ATTACHMENT_BYTES}; forwarding metadata only` - ); - return meta; - } - if (typeof content.read === "function") { - try { - const buf = await content.read(); - // Guard the case where size was unknown but the bytes turn out to be - // over the cap. - if (buf && buf.length > MAX_INLINE_ATTACHMENT_BYTES) { - console.error( - `photon-sidecar: attachment ${meta.name ?? meta.id} (${buf.length} bytes) ` + - `exceeds inline cap after read; forwarding metadata only` - ); - return meta; - } - meta.data = Buffer.from(buf).toString("base64"); - meta.encoding = "base64"; - } catch (e) { - console.error( - "photon-sidecar: failed to read attachment bytes " + - "(forwarding metadata only): " + - (e && e.stack ? e.stack : String(e)) - ); - } - } - return meta; + if (content.type === "attachment" || content.type === "voice") { + return await normalizeBinaryContent(content); } return { type: content.type || "unknown" }; } diff --git a/tests/plugins/platforms/photon/test_inbound.py b/tests/plugins/platforms/photon/test_inbound.py index f3d4bfa328e..a31cfc15d9f 100644 --- a/tests/plugins/platforms/photon/test_inbound.py +++ b/tests/plugins/platforms/photon/test_inbound.py @@ -101,6 +101,18 @@ def _attachment_event( } +def _voice_event( + content: Dict[str, Any], msg_id: str = "spc-msg-voice" +) -> Dict[str, Any]: + return { + "messageId": msg_id, + "space": {"id": "+15551234567", "type": "dm", "phone": "+15551234567"}, + "sender": {"id": "+15551234567"}, + "content": {"type": "voice", **content}, + "timestamp": "2026-05-14T19:06:32.000Z", + } + + @pytest.mark.asyncio async def test_dispatch_attachment_without_bytes_surfaces_marker( monkeypatch: pytest.MonkeyPatch, @@ -156,6 +168,64 @@ async def test_dispatch_attachment_downloads_image( cached.unlink(missing_ok=True) +@pytest.mark.asyncio +async def test_dispatch_voice_downloads_audio( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Inbound Spectrum voice content is cached and routed to auto-STT.""" + adapter = _make_adapter(monkeypatch) + captured = _capture(adapter, monkeypatch) + + raw = b"OggS" + b"\x00" * 32 + event = _voice_event( + { + "name": "note.ogg", + "mimeType": "audio/ogg", + "duration": 7, + "size": len(raw), + "data": base64.b64encode(raw).decode("ascii"), + "encoding": "base64", + } + ) + await adapter._dispatch_inbound(event) + + assert len(captured) == 1 + ev = captured[0] + assert ev.message_type == MessageType.VOICE + assert ev.media_types == ["audio/ogg"] + assert len(ev.media_urls) == 1 + cached = Path(ev.media_urls[0]) + try: + assert cached.is_file() + assert cached.read_bytes() == raw + assert ev.text == "(voice)" + finally: + cached.unlink(missing_ok=True) + + +@pytest.mark.asyncio +async def test_dispatch_voice_without_bytes_surfaces_marker( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Metadata-only voice still tells the agent a voice note arrived.""" + adapter = _make_adapter(monkeypatch) + captured = _capture(adapter, monkeypatch) + + event = _voice_event( + {"name": "note.m4a", "mimeType": "audio/mp4", "duration": 12, "size": 12345} + ) + await adapter._dispatch_inbound(event) + + assert len(captured) == 1 + ev = captured[0] + assert "Photon voice received" in ev.text + assert "note.m4a" in ev.text + assert "duration: 12s" in ev.text + assert ev.message_type == MessageType.VOICE + assert ev.media_urls == [] + assert ev.media_types == [] + + @pytest.mark.asyncio async def test_dispatch_attachment_downloads_document( monkeypatch: pytest.MonkeyPatch,