From c8e3e3918509d4c43432ec2cf19ef6a1cfe9cd9c Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 7 May 2026 07:14:16 -0700 Subject: [PATCH] fix(mcp): surface image tool results as MEDIA tags instead of dropping them (#21328) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MCP tool results can include ImageContent blocks (screenshots from Playwright/Blockbench/Puppeteer etc). The tool result handler only extracted block.text, so image blocks were silently dropped and the agent saw an empty or text-only response — losing the actual payload. Add _cache_mcp_image_block() that base64-decodes the block, validates the bytes via gateway.platforms.base.cache_image_from_bytes (which sniffs for PNG/JPEG/WebP signatures and rejects non-images), writes to the shared `~/.hermes/cache/images/` dir, and returns a MEDIA: tag. The handler appends that tag to the result parts so downstream gateway adapters render the image inline. Logs and drops on malformed base64 / non-image payload rather than raising — a single bad block shouldn't kill the tool call. Distilled from #17915 (c3115644151) and #10848 (gnanirahulnutakki), both too stale to cherry-pick (branches diverged enough to revert dozens of unrelated fixes). Went with #10848's approach of plumbing through Hermes' existing MEDIA tag / cache_image_from_bytes infrastructure rather than #17915's raw tempfile path, because it integrates with the remote-backend mount system and messaging adapters that already handle MEDIA tags natively. Co-authored-by: c3115644151 Co-authored-by: gnanirahulnutakki --- tests/tools/test_mcp_image_content.py | 138 ++++++++++++++++++++++++++ tools/mcp_tool.py | 76 +++++++++++++- 2 files changed, 212 insertions(+), 2 deletions(-) create mode 100644 tests/tools/test_mcp_image_content.py diff --git a/tests/tools/test_mcp_image_content.py b/tests/tools/test_mcp_image_content.py new file mode 100644 index 0000000000..ba60fdfecb --- /dev/null +++ b/tests/tools/test_mcp_image_content.py @@ -0,0 +1,138 @@ +"""Regression tests for MCP ImageContent block handling. + +Background +========== +MCP tool results may include ``ImageContent`` blocks (screenshots from +Playwright / Blockbench / Puppeteer / any server that returns renders). +The tool result handler in ``tools/mcp_tool.py`` used to iterate content +blocks looking only for ``block.text`` — image blocks were silently dropped +and the agent saw an empty result. Distilled from @c3115644151's PR #17915 +and @gnanirahulnutakki's PR #10848 (both too stale to cherry-pick); this +test file locks in #10848's approach of plumbing the bytes through +Hermes' existing ``cache_image_from_bytes`` so a ``MEDIA:`` tag +goes back to the agent and through to messaging adapters that render +images natively. +""" + +from __future__ import annotations + +import base64 +from types import SimpleNamespace +from unittest.mock import patch + +import pytest + + +def _png_bytes(): + """Return a minimal valid PNG byte sequence. + + Hermes' ``cache_image_from_bytes`` has a format-sniff guard that rejects + non-image payloads — use a real PNG signature so the test exercises the + full pipeline instead of the reject path. + """ + # 1x1 transparent PNG + return base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" + ) + + +class TestMimeExtension: + def test_maps_jpeg_variants_to_jpg(self): + from tools.mcp_tool import _mcp_image_extension_for_mime_type + assert _mcp_image_extension_for_mime_type("image/jpeg") == ".jpg" + assert _mcp_image_extension_for_mime_type("image/jpg") == ".jpg" + assert _mcp_image_extension_for_mime_type("IMAGE/JPEG") == ".jpg" + assert _mcp_image_extension_for_mime_type("image/jpeg; charset=utf-8") == ".jpg" + + def test_png_falls_through_to_mimetypes(self): + from tools.mcp_tool import _mcp_image_extension_for_mime_type + assert _mcp_image_extension_for_mime_type("image/png") == ".png" + + def test_unknown_defaults_to_png(self): + from tools.mcp_tool import _mcp_image_extension_for_mime_type + assert _mcp_image_extension_for_mime_type("") == ".png" + assert _mcp_image_extension_for_mime_type("image/unheard-of-format") == ".png" + + +class TestCacheMcpImageBlock: + def test_returns_media_tag_for_valid_image_block(self, tmp_path, monkeypatch): + """A well-formed ImageContent block with valid PNG bytes caches + to the image dir and the helper returns a ``MEDIA:`` tag.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + from tools.mcp_tool import _cache_mcp_image_block + + block = SimpleNamespace( + data=base64.b64encode(_png_bytes()).decode("ascii"), + mimeType="image/png", + ) + tag = _cache_mcp_image_block(block) + assert tag.startswith("MEDIA:"), f"expected MEDIA: tag, got {tag!r}" + # The cached file should be in Hermes' image cache dir + from gateway.platforms.base import get_image_cache_dir + cache_dir = str(get_image_cache_dir().resolve()) + assert tag.startswith(f"MEDIA:{cache_dir}"), ( + f"cached file not under HERMES_HOME image cache dir. " + f"tag={tag!r}, cache_dir={cache_dir!r}" + ) + # And it should exist + have the PNG bytes + path = tag[len("MEDIA:"):] + with open(path, "rb") as fh: + assert fh.read() == _png_bytes() + + def test_returns_empty_when_block_is_not_an_image(self, tmp_path, monkeypatch): + """Non-image MIME types shouldn't trigger caching.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + from tools.mcp_tool import _cache_mcp_image_block + + block = SimpleNamespace( + data=base64.b64encode(b"some bytes").decode("ascii"), + mimeType="application/pdf", + ) + assert _cache_mcp_image_block(block) == "" + + def test_returns_empty_when_block_has_no_data(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + from tools.mcp_tool import _cache_mcp_image_block + + block = SimpleNamespace(data=None, mimeType="image/png") + assert _cache_mcp_image_block(block) == "" + + def test_returns_empty_on_malformed_base64(self, tmp_path, monkeypatch): + """A server that sends garbage base64 shouldn't crash the handler — + we log and drop the block, letting any text blocks still come through.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + from tools.mcp_tool import _cache_mcp_image_block + + block = SimpleNamespace( + data="!!!not-base64!!!", + mimeType="image/png", + ) + assert _cache_mcp_image_block(block) == "" + + def test_returns_empty_when_bytes_dont_look_like_an_image(self, tmp_path, monkeypatch): + """``cache_image_from_bytes`` has a format sniff; if the claimed + ``image/png`` is actually an HTML error page, the cache raises and + we log + drop rather than propagate.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + from tools.mcp_tool import _cache_mcp_image_block + + block = SimpleNamespace( + data=base64.b64encode(b"error").decode("ascii"), + mimeType="image/png", + ) + assert _cache_mcp_image_block(block) == "" + + def test_handles_jpeg(self, tmp_path, monkeypatch): + """JPEG signature should also be accepted.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + from tools.mcp_tool import _cache_mcp_image_block + + # minimal JPEG SOI marker + filler + jpeg = b"\xff\xd8\xff\xe0" + b"\x00" * 100 + b"\xff\xd9" + block = SimpleNamespace( + data=base64.b64encode(jpeg).decode("ascii"), + mimeType="image/jpeg", + ) + tag = _cache_mcp_image_block(block) + assert tag.startswith("MEDIA:") + assert tag.endswith(".jpg"), f"expected .jpg extension, got {tag!r}" diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index fcfc5dbadc..95ac400fdb 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -426,6 +426,64 @@ def _resolve_stdio_command(command: str, env: dict) -> tuple[str, dict]: return resolved_command, resolved_env +# --------------------------------------------------------------------------- +# MCP ImageContent block → Hermes MEDIA tag +# --------------------------------------------------------------------------- + + +def _mcp_image_extension_for_mime_type(mime_type: str) -> str: + """Return a reasonable file extension for an MCP image MIME type.""" + import mimetypes + normalized = (mime_type or "").split(";", 1)[0].strip().lower() + if normalized in {"image/jpeg", "image/jpg"}: + return ".jpg" + return mimetypes.guess_extension(normalized) or ".png" + + +def _cache_mcp_image_block(block) -> str: + """Cache an MCP ``ImageContent`` block to the shared image cache and + return a ``MEDIA:`` tag that Hermes gateways know how to render. + + Returns an empty string when *block* is not an image, when the base64 + payload is malformed, or when the cache helper rejects the bytes (e.g. + non-image MIME masquerading as an image). Errors are logged, not raised: + a single bad block shouldn't kill the tool result, and the caller will + fall through to any text blocks that did parse. + """ + import base64 + + data = getattr(block, "data", None) + mime_type = getattr(block, "mimeType", None) + normalized_mime = str(mime_type or "").split(";", 1)[0].strip().lower() + if data is None or not normalized_mime.startswith("image/"): + return "" + + try: + raw_bytes = base64.b64decode(data) + except (TypeError, ValueError) as exc: + logger.warning("MCP image block decode failed (%s): %s", normalized_mime, exc) + return "" + + try: + from gateway.platforms.base import cache_image_from_bytes + + image_path = cache_image_from_bytes( + raw_bytes, + ext=_mcp_image_extension_for_mime_type(normalized_mime), + ) + except ImportError: + # gateway.platforms.base not importable in this process (e.g. cron + # without gateway deps). Fall back to silently dropping — callers + # get any text blocks that did parse. + logger.debug("MCP image caching skipped — gateway.platforms.base unavailable") + return "" + except Exception as exc: + logger.warning("MCP image block cache failed: %s", exc) + return "" + + return f"MEDIA:{image_path}" + + def _format_connect_error(exc: BaseException) -> str: """Render nested MCP connection errors into an actionable short message.""" @@ -2146,11 +2204,25 @@ def _make_tool_handler(server_name: str, tool_name: str, tool_timeout: float): ) }, ensure_ascii=False) - # Collect text from content blocks + # Collect text from content blocks. MCP tool results can also + # include ImageContent blocks (screenshot / Blockbench / Playwright + # etc.); cache those via the gateway's image-cache helper so they + # flow through Hermes' MEDIA: tag convention and out to messaging + # adapters that render images natively. Without this, image blocks + # were silently dropped and the agent got an empty response. + # + # Distilled from #17915 (c3115644151) and #10848 (gnanirahulnutakki), + # both too stale to cherry-pick. #10848's approach (integrate with + # Hermes' MEDIA tag + cache_image_from_bytes) was the cleaner of + # the two — plugs into existing infrastructure. parts: List[str] = [] for block in (result.content or []): - if hasattr(block, "text"): + if hasattr(block, "text") and block.text: parts.append(block.text) + continue + image_tag = _cache_mcp_image_block(block) + if image_tag: + parts.append(image_tag) text_result = "\n".join(parts) if parts else "" # Combine content + structuredContent when both are present.