fix(gateway): classify mixed attachments per-attachment + transcode uncommon image formats

A document attached alongside an image in the same Discord message was swept into the vision pipeline and 400'd the whole turn ("Could not process image"), and was simultaneously never surfaced to the agent as a readable file. Restores the "any file type works" contract for mixed messages and fixes the HTTP 400. Bug 1 — mixed attachments: the inbound routing loop keyed image/audio/video classification off the message-level type (PHOTO/VOICE/AUDIO), so a doc in a PHOTO message landed in image_paths and poisoned the vision call. The document context-note path was gated on message_type == DOCUMENT, so that same doc never reached the agent at all. Now classification is per-attachment (trust each attachment's own MIME; fall back to the message-level type only when MIME is unknown), via shared _event_media_is_* helpers used by both _build_media_placeholder and the main inbound loop. The document note now fires for any non-image/audio/video attachment regardless of message-level type. Bug 2 — uncommon formats: AVIF/HEIC/BMP/TIFF/ICO produced the same generic 400 because providers only accept PNG/JPEG/GIF/WEBP. image_routing now transcodes those to PNG via Pillow before declaring media_type, skipping cleanly (logged) if Pillow/plugins are missing. SVG is vector — Pillow can't rasterize it — so it's skipped rather than transcoded. Closes #25935. Co-authored-by: LeonSGP43 <cine.dreamer.one@gmail.com> Co-authored-by: cypres0099 <74935762+cypres0099@users.noreply.github.com>
2026-07-01 12:02:05 +00:00 · 2026-06-27 19:11:20 -07:00 · 2026-06-27 19:11:20 -07:00 · 505bc27d8d
commit 505bc27d8d
parent 0c372274cd
5 changed files with 371 additions and 15 deletions
--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@ -388,14 +388,98 @@ def _sniff_mime_from_bytes(raw: bytes) -> Optional[str]:
    # BMP: "BM"
    if raw.startswith(b"BM"):
        return "image/bmp"
-    # HEIC/HEIF: ftypheic / ftypheix / ftypmif1 / ftypmsf1 etc.
-    if len(raw) >= 12 and raw[4:8] == b"ftyp" and raw[8:12] in {
-        b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1", b"heim", b"heis",
-    }:
-        return "image/heic"
+    # ISO-BMFF family (HEIC/HEIF/AVIF): bytes 4..8 == 'ftyp', major brand at 8..12
+    if len(raw) >= 12 and raw[4:8] == b"ftyp":
+        brand = raw[8:12]
+        if brand in {b"avif", b"avis"}:
+            return "image/avif"
+        if brand in {
+            b"heic", b"heix", b"hevc", b"hevx",
+            b"mif1", b"msf1", b"heim", b"heis",
+        }:
+            return "image/heic"
+    # TIFF: II*\0 (little-endian) or MM\0* (big-endian)
+    if raw[:4] in {b"II*\x00", b"MM\x00*"}:
+        return "image/tiff"
+    # ICO: 00 00 01 00 (reserved=0, type=1=icon)
+    if raw[:4] == b"\x00\x00\x01\x00":
+        return "image/x-icon"
+    # SVG: text-based, look for an <svg tag near the start (skip BOM/whitespace)
+    head = raw[:512].lstrip().lower()
+    if head.startswith(b"<?xml") or head.startswith(b"<svg"):
+        if b"<svg" in head:
+            return "image/svg+xml"
    return None


+# Formats every major vision provider (Anthropic, OpenAI, Gemini, Bedrock)
+# accepts natively. Anything outside this set has to be transcoded to PNG
+# before we declare media_type, otherwise the provider returns HTTP 400
+# ("Could not process image" / "Unsupported image media type") and the
+# whole turn fails with no salvage path.
+#
+# Discord (and a few other chat platforms) freely accept attachments in
+# formats outside this set -- AVIF screenshots from Chromium, HEIC from
+# iPhones, TIFF from scanners, BMP from old Windows tools, ICO -- so users
+# do hit this in practice. SVG is vector and Pillow cannot rasterize it;
+# it is skipped (logged) rather than transcoded.
+_UNIVERSALLY_SUPPORTED_MIMES = frozenset({
+    "image/png", "image/jpeg", "image/gif", "image/webp",
+})
+
+
+def _transcode_to_png(raw: bytes) -> Optional[bytes]:
+    """Decode arbitrary image bytes with Pillow and re-encode as PNG.
+
+    Returns None if Pillow isn't installed or can't decode the input
+    (rare formats, corrupted bytes, missing optional decoder plugin for
+    HEIC/AVIF, or vector formats like SVG). Caller falls back to skipping
+    the image so the rest of the turn still works.
+
+    HEIC/HEIF and AVIF need optional Pillow plugins; we try to register
+    them on demand and swallow ImportError so a missing plugin just
+    looks like 'Pillow can't decode this' rather than crashing.
+    """
+    try:
+        from PIL import Image
+    except ImportError:
+        logger.info(
+            "image_routing: Pillow not installed; cannot transcode "
+            "non-standard image format to PNG. Install with `pip install Pillow` "
+            "(and `pillow-heif` / `pillow-avif-plugin` for those formats)."
+        )
+        return None
+    # Optional plugin registration. Silent on failure: an unsupported
+    # format will just fall through to Image.open raising below.
+    try:
+        import pillow_heif  # type: ignore
+
+        pillow_heif.register_heif_opener()
+    except Exception:
+        pass
+    try:
+        import pillow_avif  # type: ignore  # noqa: F401  -- registers AVIF on import
+    except Exception:
+        pass
+    try:
+        from io import BytesIO
+
+        with Image.open(BytesIO(raw)) as im:
+            # Pick an output mode PNG can serialise. Anything other than
+            # the standard set gets normalised to RGBA so transparency is
+            # preserved where the source had it.
+            if im.mode not in {"RGB", "RGBA", "L", "LA", "P"}:
+                im = im.convert("RGBA")
+            buf = BytesIO()
+            im.save(buf, format="PNG", optimize=False)
+            return buf.getvalue()
+    except Exception as exc:
+        logger.info(
+            "image_routing: Pillow could not transcode image to PNG -- %s", exc
+        )
+        return None
+
+
 def _guess_mime(path: Path, raw: Optional[bytes] = None) -> str:
    """Return image MIME type for *path*.

@ -431,8 +515,18 @@ def _file_to_data_url(path: Path) -> Optional[str]:
    accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
    quality tax just because one other provider is stricter.

-    Returns None only if the file can't be read (missing, permission
-    denied, etc.); the caller reports those paths in ``skipped``.
+    Format compatibility IS handled here: if the sniffed MIME isn't one
+    of ``_UNIVERSALLY_SUPPORTED_MIMES`` (i.e. it's something like AVIF,
+    HEIC, BMP, TIFF, or ICO that some providers reject outright), we
+    transcode to PNG with Pillow before declaring media_type. This fixes
+    the user-visible "Could not process image" HTTP 400 from Anthropic on
+    Discord-attached AVIF/HEIC/BMP files.
+
+    Returns None if the file can't be read OR if the format isn't
+    universally supported AND Pillow can't transcode it (Pillow missing,
+    HEIC/AVIF plugin missing, vector format like SVG, corrupt bytes). The
+    caller reports those paths in ``skipped`` and the rest of the turn
+    proceeds.
    """
    try:
        raw = path.read_bytes()
@ -440,6 +534,22 @@ def _file_to_data_url(path: Path) -> Optional[str]:
        logger.warning("image_routing: failed to read %s — %s", path, exc)
        return None
    mime = _guess_mime(path, raw=raw)
+    if mime not in _UNIVERSALLY_SUPPORTED_MIMES:
+        transcoded = _transcode_to_png(raw)
+        if transcoded is None:
+            logger.warning(
+                "image_routing: %s is %s which is not accepted by all major "
+                "vision providers and could not be transcoded to PNG; "
+                "skipping this attachment.",
+                path, mime,
+            )
+            return None
+        logger.info(
+            "image_routing: transcoded %s (%s) -> image/png for provider compatibility",
+            path.name, mime,
+        )
+        raw = transcoded
+        mime = "image/png"
    b64 = base64.b64encode(raw).decode("ascii")
    return f"data:{mime};base64,{b64}"

--- a/gateway/run.py
+++ b/gateway/run.py
@ -1911,6 +1911,47 @@ def _try_resolve_fallback_provider() -> dict | None:
    return None


+def _event_media_type_at(event, index: int) -> str:
+    """Return the per-attachment MIME for the attachment at *index*.
+
+    Empty string when the platform didn't populate a per-file MIME for
+    that slot (some adapters only set a message-level type).
+    """
+    media_types = getattr(event, "media_types", None) or []
+    return media_types[index] if index < len(media_types) else ""
+
+
+def _event_media_is_image(event, index: int) -> bool:
+    """True if the attachment at *index* is an image.
+
+    Trust the per-attachment MIME when present. Only fall back to the
+    message-level ``PHOTO`` type when this attachment's MIME is unknown --
+    otherwise a document (or any non-image) uploaded alongside an image in
+    the same message gets mis-routed as an image, base64'd into a vision
+    content part, and the provider 400s ("Could not process image").
+    """
+    mtype = _event_media_type_at(event, index)
+    if mtype:
+        return mtype.startswith("image/")
+    return getattr(event, "message_type", None) == MessageType.PHOTO
+
+
+def _event_media_is_audio(event, index: int) -> bool:
+    """True if the attachment at *index* is audio (per-attachment MIME first)."""
+    mtype = _event_media_type_at(event, index)
+    if mtype:
+        return mtype.startswith("audio/")
+    return getattr(event, "message_type", None) in {MessageType.VOICE, MessageType.AUDIO}
+
+
+def _event_media_is_video(event, index: int) -> bool:
+    """True if the attachment at *index* is video (per-attachment MIME first)."""
+    mtype = _event_media_type_at(event, index)
+    if mtype:
+        return mtype.startswith("video/")
+    return getattr(event, "message_type", None) == MessageType.VIDEO
+
+
 def _build_media_placeholder(event) -> str:
    """Build a text placeholder for media-only events so they aren't dropped.

@ -1921,14 +1962,12 @@ def _build_media_placeholder(event) -> str:
    """
    parts = []
    media_urls = getattr(event, "media_urls", None) or []
-    media_types = getattr(event, "media_types", None) or []
    for i, url in enumerate(media_urls):
-        mtype = media_types[i] if i < len(media_types) else ""
-        if mtype.startswith("image/") or getattr(event, "message_type", None) == MessageType.PHOTO:
+        if _event_media_is_image(event, i):
            parts.append(f"[User sent an image: {url}]")
-        elif mtype.startswith("audio/"):
+        elif _event_media_is_audio(event, i):
            parts.append(f"[User sent audio: {url}]")
-        elif mtype.startswith("video/") or getattr(event, "message_type", None) == MessageType.VIDEO:
+        elif _event_media_is_video(event, i):
            parts.append(f"[User sent a video: {url}]")
        else:
            parts.append(f"[User sent a file: {url}]")
@ -9220,7 +9259,12 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
            audio_paths = []
            for i, path in enumerate(event.media_urls):
                mtype = event.media_types[i] if i < len(event.media_types) else ""
-                if mtype.startswith("image/") or event.message_type == MessageType.PHOTO:
+                # Classify images per-attachment: trust this attachment's own
+                # MIME, and only honour the message-level PHOTO type when the
+                # per-attachment MIME is unknown. Otherwise a document (or any
+                # non-image) sent alongside an image in the same message gets
+                # mis-routed here as an image and the provider 400s.
+                if _event_media_is_image(event, i):
                    image_paths.append(path)
                # MessageType.AUDIO = audio file attachment (e.g. .mp3, .m4a) — never STT
                # MessageType.VOICE = voice message (Opus/OGG) — always STT
@ -9231,7 +9275,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                    and event.message_type not in {MessageType.AUDIO, MessageType.DOCUMENT}
                ):
                    audio_paths.append(path)
-                if mtype.startswith("video/") or event.message_type == MessageType.VIDEO:
+                if mtype.startswith("video/") or (not mtype and event.message_type == MessageType.VIDEO):
                    video_paths.append(path)

            if image_paths:
@ -9350,12 +9394,25 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                )
                message_text = f"{_note}\n\n{message_text}"

-        if event.media_urls and event.message_type == MessageType.DOCUMENT:
+        if event.media_urls:
            import mimetypes as _mimetypes
            from tools.credential_files import to_agent_visible_cache_path

            _TEXT_EXTENSIONS = {".txt", ".md", ".csv", ".log", ".json", ".xml", ".yaml", ".yml", ".toml", ".ini", ".cfg"}
            for i, path in enumerate(event.media_urls):
+                # Per-attachment document handling. Skip anything already routed
+                # as image / audio / video by the buckets above — only genuine
+                # non-media files get a path-pointing context note. This makes a
+                # document mixed into a PHOTO/VOICE message (whole-message type
+                # != DOCUMENT) still reach the agent as a readable cached file,
+                # instead of being silently dropped because the message-level
+                # type wasn't DOCUMENT.
+                if (
+                    _event_media_is_image(event, i)
+                    or _event_media_is_audio(event, i)
+                    or _event_media_is_video(event, i)
+                ):
+                    continue
                mtype = event.media_types[i] if i < len(event.media_types) else ""
                if mtype in {"", "application/octet-stream"}:
                    _ext = os.path.splitext(path)[1].lower()
--- a/scripts/release.py
+++ b/scripts/release.py
@ -1425,6 +1425,7 @@ AUTHOR_MAP = {
    "nouseman666@gmail.com": "nouseman666",  # PR #19088
    "ginwu05@gmail.com": "GinWU05",  # PR #19093
    "shashwatgokhe2@gmail.com": "shashwatgokhe",  # PR #19196
+    "74935762+cypres0099@users.noreply.github.com": "cypres0099",  # PR #25935 salvage (mixed-attachment image routing)
    "stevenchou.ai@gmail.com": "stevenchouai",  # PR #19221
    "leo.gong@phizchat.com": "agilejava",  # PR #19346
    "acc001k@pm.me": "acc001k",  # PR #19358
--- a/tests/agent/test_image_routing.py
+++ b/tests/agent/test_image_routing.py
@ -636,3 +636,109 @@ class TestBuildNativeContentPartsURLs:
        )
        assert parts[0]["type"] == "text"
        assert parts[0]["text"].startswith("What do you see in this image?")
+
+
+# ─── Format compatibility: transcode non-universal formats to PNG ────────────
+
+
+class TestFormatCompatibility:
+    """Some image formats Discord (and other chat platforms) accept aren't
+    accepted by every major vision provider. Anthropic for example returns
+    HTTP 400 'Could not process image' for AVIF/HEIC/BMP/TIFF/ICO/SVG.
+
+    We transcode anything outside the universal-safe set (PNG/JPEG/GIF/WEBP)
+    to PNG with Pillow before declaring media_type so the provider call
+    actually succeeds. Regression coverage for the user-reported Discord
+    'Could not process image' HTTP 400 (issue #25935).
+    """
+
+    def test_avif_sniffed_correctly(self):
+        from agent.image_routing import _sniff_mime_from_bytes
+        avif_header = b"\x00\x00\x00\x20ftypavif\x00\x00\x00\x00"
+        assert _sniff_mime_from_bytes(avif_header) == "image/avif"
+
+    def test_tiff_sniffed_both_endians(self):
+        from agent.image_routing import _sniff_mime_from_bytes
+        assert _sniff_mime_from_bytes(b"II*\x00" + b"\x00" * 16) == "image/tiff"
+        assert _sniff_mime_from_bytes(b"MM\x00*" + b"\x00" * 16) == "image/tiff"
+
+    def test_ico_sniffed_correctly(self):
+        from agent.image_routing import _sniff_mime_from_bytes
+        assert _sniff_mime_from_bytes(b"\x00\x00\x01\x00" + b"\x00" * 16) == "image/x-icon"
+
+    def test_heic_still_sniffed(self):
+        from agent.image_routing import _sniff_mime_from_bytes
+        heic_header = b"\x00\x00\x00\x20ftypheic\x00\x00\x00\x00"
+        assert _sniff_mime_from_bytes(heic_header) == "image/heic"
+
+    def test_svg_sniffed_correctly(self):
+        from agent.image_routing import _sniff_mime_from_bytes
+        assert _sniff_mime_from_bytes(b'<svg xmlns="http://www.w3.org/2000/svg"/>') == "image/svg+xml"
+        assert _sniff_mime_from_bytes(b'<?xml version="1.0"?><svg/>') == "image/svg+xml"
+
+    def test_bmp_transcoded_to_png(self, tmp_path: Path):
+        """BMP file should land as image/png in the data URL, not image/bmp,
+        because not every provider (Anthropic) accepts BMP."""
+        import pytest
+        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed; transcode is best-effort")
+        from agent.image_routing import _file_to_data_url
+
+        img_path = tmp_path / "scan.bmp"
+        Image.new("RGB", (4, 4), (255, 0, 0)).save(img_path, format="BMP")
+        url = _file_to_data_url(img_path)
+        assert url is not None
+        assert url.startswith("data:image/png;base64,"), (
+            f"BMP must be transcoded to PNG for cross-provider compatibility, got: {url[:60]}"
+        )
+
+    def test_tiff_transcoded_to_png(self, tmp_path: Path):
+        import pytest
+        Image = pytest.importorskip("PIL.Image", reason="Pillow not installed; transcode is best-effort")
+        from agent.image_routing import _file_to_data_url
+
+        img_path = tmp_path / "scan.tiff"
+        Image.new("RGB", (4, 4), (0, 255, 0)).save(img_path, format="TIFF")
+        url = _file_to_data_url(img_path)
+        assert url is not None
+        assert url.startswith("data:image/png;base64,")
+
+    def test_png_passes_through_no_transcode(self, tmp_path: Path):
+        """Universal-safe formats must NOT be re-encoded — preserves bytes."""
+        from agent.image_routing import _file_to_data_url
+
+        img_path = tmp_path / "ok.png"
+        img_path.write_bytes(_png_bytes())
+        url = _file_to_data_url(img_path)
+        assert url is not None
+        assert url.startswith("data:image/png;base64,")
+        b64 = url.split(",", 1)[1]
+        assert base64.b64decode(b64) == _png_bytes()
+
+    def test_jpeg_passes_through_no_transcode(self, tmp_path: Path):
+        from agent.image_routing import _file_to_data_url
+
+        img_path = tmp_path / "ok.jpg"
+        img_path.write_bytes(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xd9")
+        url = _file_to_data_url(img_path)
+        assert url is not None
+        assert url.startswith("data:image/jpeg;base64,")
+
+    def test_transcode_failure_is_skipped_not_crashed(self, tmp_path: Path):
+        """If Pillow can't decode (corrupted bytes labeled as a rare format),
+        return None so the caller skips it rather than sending broken data."""
+        from agent.image_routing import _file_to_data_url
+
+        img_path = tmp_path / "corrupt.avif"
+        img_path.write_bytes(b"\x00\x00\x00\x20ftypavif" + b"\x00" * 32)
+        url = _file_to_data_url(img_path)
+        assert url is None
+
+    def test_svg_skipped_not_transcoded(self, tmp_path: Path):
+        """SVG is vector; Pillow can't rasterize it. It must be skipped
+        (None) rather than producing an invalid data URL."""
+        from agent.image_routing import _file_to_data_url
+
+        img_path = tmp_path / "icon.svg"
+        img_path.write_bytes(b'<svg xmlns="http://www.w3.org/2000/svg" width="4" height="4"/>')
+        url = _file_to_data_url(img_path)
+        assert url is None
--- a/tests/gateway/test_mixed_attachment_routing.py
+++ b/tests/gateway/test_mixed_attachment_routing.py
@ -0,0 +1,82 @@
+"""Regression tests for mixed-attachment routing in gateway/run.py.
+
+Issue #25935: when a message mixes a real image with a document (e.g. a .md
+brief), Discord types the whole message MessageType.PHOTO. The per-attachment
+loops must classify each attachment by its OWN mimetype:
+
+  * A document must NOT be swept into image_paths just because the message-level
+    type is PHOTO — mislabelling it as an image sent its bytes to the vision
+    endpoint, which rejected them with a non-retryable HTTP 400 and killed the
+    whole turn ("Could not process image").
+  * That same document must STILL reach the agent as a readable cached file via
+    the document context-note path, even though the message-level type isn't
+    DOCUMENT.
+
+The message-level fallback (PHOTO/VOICE/AUDIO/VIDEO) is preserved only for
+attachments whose per-file mimetype is unknown (empty) — platforms that don't
+populate media_types.
+"""
+
+from types import SimpleNamespace
+
+from gateway.platforms.base import MessageType
+from gateway.run import (
+    _build_media_placeholder,
+    _event_media_is_audio,
+    _event_media_is_image,
+    _event_media_is_video,
+)
+
+
+def _evt(media_urls, media_types, message_type):
+    return SimpleNamespace(
+        media_urls=media_urls,
+        media_types=media_types,
+        message_type=message_type,
+    )
+
+
+# ─── per-attachment classification helpers ───────────────────────────────────
+
+
+def test_image_trusts_own_mime_over_photo_message_type():
+    evt = _evt(["/c/pic.png", "/c/brief.md"], ["image/png", "text/markdown"], MessageType.PHOTO)
+    assert _event_media_is_image(evt, 0) is True
+    # The document must NOT be promoted to an image by the PHOTO fallback.
+    assert _event_media_is_image(evt, 1) is False
+
+
+def test_unknown_mime_falls_back_to_photo_message_type():
+    # Platforms that don't populate media_types rely on the message-level type.
+    evt = _evt(["/c/photo.jpg"], [""], MessageType.PHOTO)
+    assert _event_media_is_image(evt, 0) is True
+
+
+def test_audio_classified_per_attachment():
+    evt = _evt(["/c/clip.ogg", "/c/shot.png"], ["audio/ogg", "image/png"], MessageType.PHOTO)
+    assert _event_media_is_audio(evt, 0) is True
+    assert _event_media_is_audio(evt, 1) is False
+    assert _event_media_is_image(evt, 1) is True
+
+
+def test_video_classified_per_attachment():
+    evt = _evt(["/c/movie.mp4", "/c/notes.md"], ["video/mp4", "text/markdown"], MessageType.PHOTO)
+    assert _event_media_is_video(evt, 0) is True
+    assert _event_media_is_video(evt, 1) is False
+
+
+# ─── _build_media_placeholder ────────────────────────────────────────────────
+
+
+def test_placeholder_document_in_photo_message_is_not_an_image():
+    evt = _evt(["/c/product.png", "/c/brief.md"], ["image/png", "text/markdown"], MessageType.PHOTO)
+    out = _build_media_placeholder(evt)
+    assert "[User sent an image: /c/product.png]" in out
+    assert "[User sent an image: /c/brief.md]" not in out
+    assert "[User sent a file: /c/brief.md]" in out
+
+
+def test_placeholder_image_with_unknown_mime_uses_photo_fallback():
+    evt = _evt(["/c/photo.jpg"], [""], MessageType.PHOTO)
+    out = _build_media_placeholder(evt)
+    assert "[User sent an image: /c/photo.jpg]" in out