diff --git a/agent/image_routing.py b/agent/image_routing.py index c8b3f6640c6..3014acf1cab 100644 --- a/agent/image_routing.py +++ b/agent/image_routing.py @@ -388,14 +388,98 @@ def _sniff_mime_from_bytes(raw: bytes) -> Optional[str]: # BMP: "BM" if raw.startswith(b"BM"): return "image/bmp" - # HEIC/HEIF: ftypheic / ftypheix / ftypmif1 / ftypmsf1 etc. - if len(raw) >= 12 and raw[4:8] == b"ftyp" and raw[8:12] in { - b"heic", b"heix", b"hevc", b"hevx", b"mif1", b"msf1", b"heim", b"heis", - }: - return "image/heic" + # ISO-BMFF family (HEIC/HEIF/AVIF): bytes 4..8 == 'ftyp', major brand at 8..12 + if len(raw) >= 12 and raw[4:8] == b"ftyp": + brand = raw[8:12] + if brand in {b"avif", b"avis"}: + return "image/avif" + if brand in { + b"heic", b"heix", b"hevc", b"hevx", + b"mif1", b"msf1", b"heim", b"heis", + }: + return "image/heic" + # TIFF: II*\0 (little-endian) or MM\0* (big-endian) + if raw[:4] in {b"II*\x00", b"MM\x00*"}: + return "image/tiff" + # ICO: 00 00 01 00 (reserved=0, type=1=icon) + if raw[:4] == b"\x00\x00\x01\x00": + return "image/x-icon" + # SVG: text-based, look for an Optional[bytes]: + """Decode arbitrary image bytes with Pillow and re-encode as PNG. + + Returns None if Pillow isn't installed or can't decode the input + (rare formats, corrupted bytes, missing optional decoder plugin for + HEIC/AVIF, or vector formats like SVG). Caller falls back to skipping + the image so the rest of the turn still works. + + HEIC/HEIF and AVIF need optional Pillow plugins; we try to register + them on demand and swallow ImportError so a missing plugin just + looks like 'Pillow can't decode this' rather than crashing. + """ + try: + from PIL import Image + except ImportError: + logger.info( + "image_routing: Pillow not installed; cannot transcode " + "non-standard image format to PNG. Install with `pip install Pillow` " + "(and `pillow-heif` / `pillow-avif-plugin` for those formats)." + ) + return None + # Optional plugin registration. Silent on failure: an unsupported + # format will just fall through to Image.open raising below. + try: + import pillow_heif # type: ignore + + pillow_heif.register_heif_opener() + except Exception: + pass + try: + import pillow_avif # type: ignore # noqa: F401 -- registers AVIF on import + except Exception: + pass + try: + from io import BytesIO + + with Image.open(BytesIO(raw)) as im: + # Pick an output mode PNG can serialise. Anything other than + # the standard set gets normalised to RGBA so transparency is + # preserved where the source had it. + if im.mode not in {"RGB", "RGBA", "L", "LA", "P"}: + im = im.convert("RGBA") + buf = BytesIO() + im.save(buf, format="PNG", optimize=False) + return buf.getvalue() + except Exception as exc: + logger.info( + "image_routing: Pillow could not transcode image to PNG -- %s", exc + ) + return None + + def _guess_mime(path: Path, raw: Optional[bytes] = None) -> str: """Return image MIME type for *path*. @@ -431,8 +515,18 @@ def _file_to_data_url(path: Path) -> Optional[str]: accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent quality tax just because one other provider is stricter. - Returns None only if the file can't be read (missing, permission - denied, etc.); the caller reports those paths in ``skipped``. + Format compatibility IS handled here: if the sniffed MIME isn't one + of ``_UNIVERSALLY_SUPPORTED_MIMES`` (i.e. it's something like AVIF, + HEIC, BMP, TIFF, or ICO that some providers reject outright), we + transcode to PNG with Pillow before declaring media_type. This fixes + the user-visible "Could not process image" HTTP 400 from Anthropic on + Discord-attached AVIF/HEIC/BMP files. + + Returns None if the file can't be read OR if the format isn't + universally supported AND Pillow can't transcode it (Pillow missing, + HEIC/AVIF plugin missing, vector format like SVG, corrupt bytes). The + caller reports those paths in ``skipped`` and the rest of the turn + proceeds. """ try: raw = path.read_bytes() @@ -440,6 +534,22 @@ def _file_to_data_url(path: Path) -> Optional[str]: logger.warning("image_routing: failed to read %s — %s", path, exc) return None mime = _guess_mime(path, raw=raw) + if mime not in _UNIVERSALLY_SUPPORTED_MIMES: + transcoded = _transcode_to_png(raw) + if transcoded is None: + logger.warning( + "image_routing: %s is %s which is not accepted by all major " + "vision providers and could not be transcoded to PNG; " + "skipping this attachment.", + path, mime, + ) + return None + logger.info( + "image_routing: transcoded %s (%s) -> image/png for provider compatibility", + path.name, mime, + ) + raw = transcoded + mime = "image/png" b64 = base64.b64encode(raw).decode("ascii") return f"data:{mime};base64,{b64}" diff --git a/gateway/run.py b/gateway/run.py index f7b751b0c98..a60de0daa35 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1911,6 +1911,47 @@ def _try_resolve_fallback_provider() -> dict | None: return None +def _event_media_type_at(event, index: int) -> str: + """Return the per-attachment MIME for the attachment at *index*. + + Empty string when the platform didn't populate a per-file MIME for + that slot (some adapters only set a message-level type). + """ + media_types = getattr(event, "media_types", None) or [] + return media_types[index] if index < len(media_types) else "" + + +def _event_media_is_image(event, index: int) -> bool: + """True if the attachment at *index* is an image. + + Trust the per-attachment MIME when present. Only fall back to the + message-level ``PHOTO`` type when this attachment's MIME is unknown -- + otherwise a document (or any non-image) uploaded alongside an image in + the same message gets mis-routed as an image, base64'd into a vision + content part, and the provider 400s ("Could not process image"). + """ + mtype = _event_media_type_at(event, index) + if mtype: + return mtype.startswith("image/") + return getattr(event, "message_type", None) == MessageType.PHOTO + + +def _event_media_is_audio(event, index: int) -> bool: + """True if the attachment at *index* is audio (per-attachment MIME first).""" + mtype = _event_media_type_at(event, index) + if mtype: + return mtype.startswith("audio/") + return getattr(event, "message_type", None) in {MessageType.VOICE, MessageType.AUDIO} + + +def _event_media_is_video(event, index: int) -> bool: + """True if the attachment at *index* is video (per-attachment MIME first).""" + mtype = _event_media_type_at(event, index) + if mtype: + return mtype.startswith("video/") + return getattr(event, "message_type", None) == MessageType.VIDEO + + def _build_media_placeholder(event) -> str: """Build a text placeholder for media-only events so they aren't dropped. @@ -1921,14 +1962,12 @@ def _build_media_placeholder(event) -> str: """ parts = [] media_urls = getattr(event, "media_urls", None) or [] - media_types = getattr(event, "media_types", None) or [] for i, url in enumerate(media_urls): - mtype = media_types[i] if i < len(media_types) else "" - if mtype.startswith("image/") or getattr(event, "message_type", None) == MessageType.PHOTO: + if _event_media_is_image(event, i): parts.append(f"[User sent an image: {url}]") - elif mtype.startswith("audio/"): + elif _event_media_is_audio(event, i): parts.append(f"[User sent audio: {url}]") - elif mtype.startswith("video/") or getattr(event, "message_type", None) == MessageType.VIDEO: + elif _event_media_is_video(event, i): parts.append(f"[User sent a video: {url}]") else: parts.append(f"[User sent a file: {url}]") @@ -9220,7 +9259,12 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew audio_paths = [] for i, path in enumerate(event.media_urls): mtype = event.media_types[i] if i < len(event.media_types) else "" - if mtype.startswith("image/") or event.message_type == MessageType.PHOTO: + # Classify images per-attachment: trust this attachment's own + # MIME, and only honour the message-level PHOTO type when the + # per-attachment MIME is unknown. Otherwise a document (or any + # non-image) sent alongside an image in the same message gets + # mis-routed here as an image and the provider 400s. + if _event_media_is_image(event, i): image_paths.append(path) # MessageType.AUDIO = audio file attachment (e.g. .mp3, .m4a) — never STT # MessageType.VOICE = voice message (Opus/OGG) — always STT @@ -9231,7 +9275,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew and event.message_type not in {MessageType.AUDIO, MessageType.DOCUMENT} ): audio_paths.append(path) - if mtype.startswith("video/") or event.message_type == MessageType.VIDEO: + if mtype.startswith("video/") or (not mtype and event.message_type == MessageType.VIDEO): video_paths.append(path) if image_paths: @@ -9350,12 +9394,25 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew ) message_text = f"{_note}\n\n{message_text}" - if event.media_urls and event.message_type == MessageType.DOCUMENT: + if event.media_urls: import mimetypes as _mimetypes from tools.credential_files import to_agent_visible_cache_path _TEXT_EXTENSIONS = {".txt", ".md", ".csv", ".log", ".json", ".xml", ".yaml", ".yml", ".toml", ".ini", ".cfg"} for i, path in enumerate(event.media_urls): + # Per-attachment document handling. Skip anything already routed + # as image / audio / video by the buckets above — only genuine + # non-media files get a path-pointing context note. This makes a + # document mixed into a PHOTO/VOICE message (whole-message type + # != DOCUMENT) still reach the agent as a readable cached file, + # instead of being silently dropped because the message-level + # type wasn't DOCUMENT. + if ( + _event_media_is_image(event, i) + or _event_media_is_audio(event, i) + or _event_media_is_video(event, i) + ): + continue mtype = event.media_types[i] if i < len(event.media_types) else "" if mtype in {"", "application/octet-stream"}: _ext = os.path.splitext(path)[1].lower() diff --git a/scripts/release.py b/scripts/release.py index e324fb805d0..e19378a0838 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -1425,6 +1425,7 @@ AUTHOR_MAP = { "nouseman666@gmail.com": "nouseman666", # PR #19088 "ginwu05@gmail.com": "GinWU05", # PR #19093 "shashwatgokhe2@gmail.com": "shashwatgokhe", # PR #19196 + "74935762+cypres0099@users.noreply.github.com": "cypres0099", # PR #25935 salvage (mixed-attachment image routing) "stevenchou.ai@gmail.com": "stevenchouai", # PR #19221 "leo.gong@phizchat.com": "agilejava", # PR #19346 "acc001k@pm.me": "acc001k", # PR #19358 diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py index b5a43f1ff0e..2019bc182d0 100644 --- a/tests/agent/test_image_routing.py +++ b/tests/agent/test_image_routing.py @@ -636,3 +636,109 @@ class TestBuildNativeContentPartsURLs: ) assert parts[0]["type"] == "text" assert parts[0]["text"].startswith("What do you see in this image?") + + +# ─── Format compatibility: transcode non-universal formats to PNG ──────────── + + +class TestFormatCompatibility: + """Some image formats Discord (and other chat platforms) accept aren't + accepted by every major vision provider. Anthropic for example returns + HTTP 400 'Could not process image' for AVIF/HEIC/BMP/TIFF/ICO/SVG. + + We transcode anything outside the universal-safe set (PNG/JPEG/GIF/WEBP) + to PNG with Pillow before declaring media_type so the provider call + actually succeeds. Regression coverage for the user-reported Discord + 'Could not process image' HTTP 400 (issue #25935). + """ + + def test_avif_sniffed_correctly(self): + from agent.image_routing import _sniff_mime_from_bytes + avif_header = b"\x00\x00\x00\x20ftypavif\x00\x00\x00\x00" + assert _sniff_mime_from_bytes(avif_header) == "image/avif" + + def test_tiff_sniffed_both_endians(self): + from agent.image_routing import _sniff_mime_from_bytes + assert _sniff_mime_from_bytes(b"II*\x00" + b"\x00" * 16) == "image/tiff" + assert _sniff_mime_from_bytes(b"MM\x00*" + b"\x00" * 16) == "image/tiff" + + def test_ico_sniffed_correctly(self): + from agent.image_routing import _sniff_mime_from_bytes + assert _sniff_mime_from_bytes(b"\x00\x00\x01\x00" + b"\x00" * 16) == "image/x-icon" + + def test_heic_still_sniffed(self): + from agent.image_routing import _sniff_mime_from_bytes + heic_header = b"\x00\x00\x00\x20ftypheic\x00\x00\x00\x00" + assert _sniff_mime_from_bytes(heic_header) == "image/heic" + + def test_svg_sniffed_correctly(self): + from agent.image_routing import _sniff_mime_from_bytes + assert _sniff_mime_from_bytes(b'') == "image/svg+xml" + assert _sniff_mime_from_bytes(b'') == "image/svg+xml" + + def test_bmp_transcoded_to_png(self, tmp_path: Path): + """BMP file should land as image/png in the data URL, not image/bmp, + because not every provider (Anthropic) accepts BMP.""" + import pytest + Image = pytest.importorskip("PIL.Image", reason="Pillow not installed; transcode is best-effort") + from agent.image_routing import _file_to_data_url + + img_path = tmp_path / "scan.bmp" + Image.new("RGB", (4, 4), (255, 0, 0)).save(img_path, format="BMP") + url = _file_to_data_url(img_path) + assert url is not None + assert url.startswith("data:image/png;base64,"), ( + f"BMP must be transcoded to PNG for cross-provider compatibility, got: {url[:60]}" + ) + + def test_tiff_transcoded_to_png(self, tmp_path: Path): + import pytest + Image = pytest.importorskip("PIL.Image", reason="Pillow not installed; transcode is best-effort") + from agent.image_routing import _file_to_data_url + + img_path = tmp_path / "scan.tiff" + Image.new("RGB", (4, 4), (0, 255, 0)).save(img_path, format="TIFF") + url = _file_to_data_url(img_path) + assert url is not None + assert url.startswith("data:image/png;base64,") + + def test_png_passes_through_no_transcode(self, tmp_path: Path): + """Universal-safe formats must NOT be re-encoded — preserves bytes.""" + from agent.image_routing import _file_to_data_url + + img_path = tmp_path / "ok.png" + img_path.write_bytes(_png_bytes()) + url = _file_to_data_url(img_path) + assert url is not None + assert url.startswith("data:image/png;base64,") + b64 = url.split(",", 1)[1] + assert base64.b64decode(b64) == _png_bytes() + + def test_jpeg_passes_through_no_transcode(self, tmp_path: Path): + from agent.image_routing import _file_to_data_url + + img_path = tmp_path / "ok.jpg" + img_path.write_bytes(b"\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xd9") + url = _file_to_data_url(img_path) + assert url is not None + assert url.startswith("data:image/jpeg;base64,") + + def test_transcode_failure_is_skipped_not_crashed(self, tmp_path: Path): + """If Pillow can't decode (corrupted bytes labeled as a rare format), + return None so the caller skips it rather than sending broken data.""" + from agent.image_routing import _file_to_data_url + + img_path = tmp_path / "corrupt.avif" + img_path.write_bytes(b"\x00\x00\x00\x20ftypavif" + b"\x00" * 32) + url = _file_to_data_url(img_path) + assert url is None + + def test_svg_skipped_not_transcoded(self, tmp_path: Path): + """SVG is vector; Pillow can't rasterize it. It must be skipped + (None) rather than producing an invalid data URL.""" + from agent.image_routing import _file_to_data_url + + img_path = tmp_path / "icon.svg" + img_path.write_bytes(b'') + url = _file_to_data_url(img_path) + assert url is None diff --git a/tests/gateway/test_mixed_attachment_routing.py b/tests/gateway/test_mixed_attachment_routing.py new file mode 100644 index 00000000000..dfce8920102 --- /dev/null +++ b/tests/gateway/test_mixed_attachment_routing.py @@ -0,0 +1,82 @@ +"""Regression tests for mixed-attachment routing in gateway/run.py. + +Issue #25935: when a message mixes a real image with a document (e.g. a .md +brief), Discord types the whole message MessageType.PHOTO. The per-attachment +loops must classify each attachment by its OWN mimetype: + + * A document must NOT be swept into image_paths just because the message-level + type is PHOTO — mislabelling it as an image sent its bytes to the vision + endpoint, which rejected them with a non-retryable HTTP 400 and killed the + whole turn ("Could not process image"). + * That same document must STILL reach the agent as a readable cached file via + the document context-note path, even though the message-level type isn't + DOCUMENT. + +The message-level fallback (PHOTO/VOICE/AUDIO/VIDEO) is preserved only for +attachments whose per-file mimetype is unknown (empty) — platforms that don't +populate media_types. +""" + +from types import SimpleNamespace + +from gateway.platforms.base import MessageType +from gateway.run import ( + _build_media_placeholder, + _event_media_is_audio, + _event_media_is_image, + _event_media_is_video, +) + + +def _evt(media_urls, media_types, message_type): + return SimpleNamespace( + media_urls=media_urls, + media_types=media_types, + message_type=message_type, + ) + + +# ─── per-attachment classification helpers ─────────────────────────────────── + + +def test_image_trusts_own_mime_over_photo_message_type(): + evt = _evt(["/c/pic.png", "/c/brief.md"], ["image/png", "text/markdown"], MessageType.PHOTO) + assert _event_media_is_image(evt, 0) is True + # The document must NOT be promoted to an image by the PHOTO fallback. + assert _event_media_is_image(evt, 1) is False + + +def test_unknown_mime_falls_back_to_photo_message_type(): + # Platforms that don't populate media_types rely on the message-level type. + evt = _evt(["/c/photo.jpg"], [""], MessageType.PHOTO) + assert _event_media_is_image(evt, 0) is True + + +def test_audio_classified_per_attachment(): + evt = _evt(["/c/clip.ogg", "/c/shot.png"], ["audio/ogg", "image/png"], MessageType.PHOTO) + assert _event_media_is_audio(evt, 0) is True + assert _event_media_is_audio(evt, 1) is False + assert _event_media_is_image(evt, 1) is True + + +def test_video_classified_per_attachment(): + evt = _evt(["/c/movie.mp4", "/c/notes.md"], ["video/mp4", "text/markdown"], MessageType.PHOTO) + assert _event_media_is_video(evt, 0) is True + assert _event_media_is_video(evt, 1) is False + + +# ─── _build_media_placeholder ──────────────────────────────────────────────── + + +def test_placeholder_document_in_photo_message_is_not_an_image(): + evt = _evt(["/c/product.png", "/c/brief.md"], ["image/png", "text/markdown"], MessageType.PHOTO) + out = _build_media_placeholder(evt) + assert "[User sent an image: /c/product.png]" in out + assert "[User sent an image: /c/brief.md]" not in out + assert "[User sent a file: /c/brief.md]" in out + + +def test_placeholder_image_with_unknown_mime_uses_photo_fallback(): + evt = _evt(["/c/photo.jpg"], [""], MessageType.PHOTO) + out = _build_media_placeholder(evt) + assert "[User sent an image: /c/photo.jpg]" in out