mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(mcp): surface image tool results as MEDIA tags instead of dropping them (#21328)
MCP tool results can include ImageContent blocks (screenshots from Playwright/Blockbench/Puppeteer etc). The tool result handler only extracted block.text, so image blocks were silently dropped and the agent saw an empty or text-only response — losing the actual payload. Add _cache_mcp_image_block() that base64-decodes the block, validates the bytes via gateway.platforms.base.cache_image_from_bytes (which sniffs for PNG/JPEG/WebP signatures and rejects non-images), writes to the shared `~/.hermes/cache/images/` dir, and returns a MEDIA:<path> tag. The handler appends that tag to the result parts so downstream gateway adapters render the image inline. Logs and drops on malformed base64 / non-image payload rather than raising — a single bad block shouldn't kill the tool call. Distilled from #17915 (c3115644151) and #10848 (gnanirahulnutakki), both too stale to cherry-pick (branches diverged enough to revert dozens of unrelated fixes). Went with #10848's approach of plumbing through Hermes' existing MEDIA tag / cache_image_from_bytes infrastructure rather than #17915's raw tempfile path, because it integrates with the remote-backend mount system and messaging adapters that already handle MEDIA tags natively. Co-authored-by: c3115644151 <c3115644151@users.noreply.github.com> Co-authored-by: gnanirahulnutakki <gnanirahulnutakki@users.noreply.github.com>
This commit is contained in:
parent
dd2dc2bddf
commit
c8e3e39185
2 changed files with 212 additions and 2 deletions
138
tests/tools/test_mcp_image_content.py
Normal file
138
tests/tools/test_mcp_image_content.py
Normal file
|
|
@ -0,0 +1,138 @@
|
||||||
|
"""Regression tests for MCP ImageContent block handling.
|
||||||
|
|
||||||
|
Background
|
||||||
|
==========
|
||||||
|
MCP tool results may include ``ImageContent`` blocks (screenshots from
|
||||||
|
Playwright / Blockbench / Puppeteer / any server that returns renders).
|
||||||
|
The tool result handler in ``tools/mcp_tool.py`` used to iterate content
|
||||||
|
blocks looking only for ``block.text`` — image blocks were silently dropped
|
||||||
|
and the agent saw an empty result. Distilled from @c3115644151's PR #17915
|
||||||
|
and @gnanirahulnutakki's PR #10848 (both too stale to cherry-pick); this
|
||||||
|
test file locks in #10848's approach of plumbing the bytes through
|
||||||
|
Hermes' existing ``cache_image_from_bytes`` so a ``MEDIA:<path>`` tag
|
||||||
|
goes back to the agent and through to messaging adapters that render
|
||||||
|
images natively.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
from types import SimpleNamespace
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def _png_bytes():
|
||||||
|
"""Return a minimal valid PNG byte sequence.
|
||||||
|
|
||||||
|
Hermes' ``cache_image_from_bytes`` has a format-sniff guard that rejects
|
||||||
|
non-image payloads — use a real PNG signature so the test exercises the
|
||||||
|
full pipeline instead of the reject path.
|
||||||
|
"""
|
||||||
|
# 1x1 transparent PNG
|
||||||
|
return base64.b64decode(
|
||||||
|
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII="
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMimeExtension:
|
||||||
|
def test_maps_jpeg_variants_to_jpg(self):
|
||||||
|
from tools.mcp_tool import _mcp_image_extension_for_mime_type
|
||||||
|
assert _mcp_image_extension_for_mime_type("image/jpeg") == ".jpg"
|
||||||
|
assert _mcp_image_extension_for_mime_type("image/jpg") == ".jpg"
|
||||||
|
assert _mcp_image_extension_for_mime_type("IMAGE/JPEG") == ".jpg"
|
||||||
|
assert _mcp_image_extension_for_mime_type("image/jpeg; charset=utf-8") == ".jpg"
|
||||||
|
|
||||||
|
def test_png_falls_through_to_mimetypes(self):
|
||||||
|
from tools.mcp_tool import _mcp_image_extension_for_mime_type
|
||||||
|
assert _mcp_image_extension_for_mime_type("image/png") == ".png"
|
||||||
|
|
||||||
|
def test_unknown_defaults_to_png(self):
|
||||||
|
from tools.mcp_tool import _mcp_image_extension_for_mime_type
|
||||||
|
assert _mcp_image_extension_for_mime_type("") == ".png"
|
||||||
|
assert _mcp_image_extension_for_mime_type("image/unheard-of-format") == ".png"
|
||||||
|
|
||||||
|
|
||||||
|
class TestCacheMcpImageBlock:
|
||||||
|
def test_returns_media_tag_for_valid_image_block(self, tmp_path, monkeypatch):
|
||||||
|
"""A well-formed ImageContent block with valid PNG bytes caches
|
||||||
|
to the image dir and the helper returns a ``MEDIA:<path>`` tag."""
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
from tools.mcp_tool import _cache_mcp_image_block
|
||||||
|
|
||||||
|
block = SimpleNamespace(
|
||||||
|
data=base64.b64encode(_png_bytes()).decode("ascii"),
|
||||||
|
mimeType="image/png",
|
||||||
|
)
|
||||||
|
tag = _cache_mcp_image_block(block)
|
||||||
|
assert tag.startswith("MEDIA:"), f"expected MEDIA: tag, got {tag!r}"
|
||||||
|
# The cached file should be in Hermes' image cache dir
|
||||||
|
from gateway.platforms.base import get_image_cache_dir
|
||||||
|
cache_dir = str(get_image_cache_dir().resolve())
|
||||||
|
assert tag.startswith(f"MEDIA:{cache_dir}"), (
|
||||||
|
f"cached file not under HERMES_HOME image cache dir. "
|
||||||
|
f"tag={tag!r}, cache_dir={cache_dir!r}"
|
||||||
|
)
|
||||||
|
# And it should exist + have the PNG bytes
|
||||||
|
path = tag[len("MEDIA:"):]
|
||||||
|
with open(path, "rb") as fh:
|
||||||
|
assert fh.read() == _png_bytes()
|
||||||
|
|
||||||
|
def test_returns_empty_when_block_is_not_an_image(self, tmp_path, monkeypatch):
|
||||||
|
"""Non-image MIME types shouldn't trigger caching."""
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
from tools.mcp_tool import _cache_mcp_image_block
|
||||||
|
|
||||||
|
block = SimpleNamespace(
|
||||||
|
data=base64.b64encode(b"some bytes").decode("ascii"),
|
||||||
|
mimeType="application/pdf",
|
||||||
|
)
|
||||||
|
assert _cache_mcp_image_block(block) == ""
|
||||||
|
|
||||||
|
def test_returns_empty_when_block_has_no_data(self, tmp_path, monkeypatch):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
from tools.mcp_tool import _cache_mcp_image_block
|
||||||
|
|
||||||
|
block = SimpleNamespace(data=None, mimeType="image/png")
|
||||||
|
assert _cache_mcp_image_block(block) == ""
|
||||||
|
|
||||||
|
def test_returns_empty_on_malformed_base64(self, tmp_path, monkeypatch):
|
||||||
|
"""A server that sends garbage base64 shouldn't crash the handler —
|
||||||
|
we log and drop the block, letting any text blocks still come through."""
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
from tools.mcp_tool import _cache_mcp_image_block
|
||||||
|
|
||||||
|
block = SimpleNamespace(
|
||||||
|
data="!!!not-base64!!!",
|
||||||
|
mimeType="image/png",
|
||||||
|
)
|
||||||
|
assert _cache_mcp_image_block(block) == ""
|
||||||
|
|
||||||
|
def test_returns_empty_when_bytes_dont_look_like_an_image(self, tmp_path, monkeypatch):
|
||||||
|
"""``cache_image_from_bytes`` has a format sniff; if the claimed
|
||||||
|
``image/png`` is actually an HTML error page, the cache raises and
|
||||||
|
we log + drop rather than propagate."""
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
from tools.mcp_tool import _cache_mcp_image_block
|
||||||
|
|
||||||
|
block = SimpleNamespace(
|
||||||
|
data=base64.b64encode(b"<html>error</html>").decode("ascii"),
|
||||||
|
mimeType="image/png",
|
||||||
|
)
|
||||||
|
assert _cache_mcp_image_block(block) == ""
|
||||||
|
|
||||||
|
def test_handles_jpeg(self, tmp_path, monkeypatch):
|
||||||
|
"""JPEG signature should also be accepted."""
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
from tools.mcp_tool import _cache_mcp_image_block
|
||||||
|
|
||||||
|
# minimal JPEG SOI marker + filler
|
||||||
|
jpeg = b"\xff\xd8\xff\xe0" + b"\x00" * 100 + b"\xff\xd9"
|
||||||
|
block = SimpleNamespace(
|
||||||
|
data=base64.b64encode(jpeg).decode("ascii"),
|
||||||
|
mimeType="image/jpeg",
|
||||||
|
)
|
||||||
|
tag = _cache_mcp_image_block(block)
|
||||||
|
assert tag.startswith("MEDIA:")
|
||||||
|
assert tag.endswith(".jpg"), f"expected .jpg extension, got {tag!r}"
|
||||||
|
|
@ -426,6 +426,64 @@ def _resolve_stdio_command(command: str, env: dict) -> tuple[str, dict]:
|
||||||
return resolved_command, resolved_env
|
return resolved_command, resolved_env
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# MCP ImageContent block → Hermes MEDIA tag
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _mcp_image_extension_for_mime_type(mime_type: str) -> str:
|
||||||
|
"""Return a reasonable file extension for an MCP image MIME type."""
|
||||||
|
import mimetypes
|
||||||
|
normalized = (mime_type or "").split(";", 1)[0].strip().lower()
|
||||||
|
if normalized in {"image/jpeg", "image/jpg"}:
|
||||||
|
return ".jpg"
|
||||||
|
return mimetypes.guess_extension(normalized) or ".png"
|
||||||
|
|
||||||
|
|
||||||
|
def _cache_mcp_image_block(block) -> str:
|
||||||
|
"""Cache an MCP ``ImageContent`` block to the shared image cache and
|
||||||
|
return a ``MEDIA:<path>`` tag that Hermes gateways know how to render.
|
||||||
|
|
||||||
|
Returns an empty string when *block* is not an image, when the base64
|
||||||
|
payload is malformed, or when the cache helper rejects the bytes (e.g.
|
||||||
|
non-image MIME masquerading as an image). Errors are logged, not raised:
|
||||||
|
a single bad block shouldn't kill the tool result, and the caller will
|
||||||
|
fall through to any text blocks that did parse.
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
|
||||||
|
data = getattr(block, "data", None)
|
||||||
|
mime_type = getattr(block, "mimeType", None)
|
||||||
|
normalized_mime = str(mime_type or "").split(";", 1)[0].strip().lower()
|
||||||
|
if data is None or not normalized_mime.startswith("image/"):
|
||||||
|
return ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
raw_bytes = base64.b64decode(data)
|
||||||
|
except (TypeError, ValueError) as exc:
|
||||||
|
logger.warning("MCP image block decode failed (%s): %s", normalized_mime, exc)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
from gateway.platforms.base import cache_image_from_bytes
|
||||||
|
|
||||||
|
image_path = cache_image_from_bytes(
|
||||||
|
raw_bytes,
|
||||||
|
ext=_mcp_image_extension_for_mime_type(normalized_mime),
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
# gateway.platforms.base not importable in this process (e.g. cron
|
||||||
|
# without gateway deps). Fall back to silently dropping — callers
|
||||||
|
# get any text blocks that did parse.
|
||||||
|
logger.debug("MCP image caching skipped — gateway.platforms.base unavailable")
|
||||||
|
return ""
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("MCP image block cache failed: %s", exc)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
return f"MEDIA:{image_path}"
|
||||||
|
|
||||||
|
|
||||||
def _format_connect_error(exc: BaseException) -> str:
|
def _format_connect_error(exc: BaseException) -> str:
|
||||||
"""Render nested MCP connection errors into an actionable short message."""
|
"""Render nested MCP connection errors into an actionable short message."""
|
||||||
|
|
||||||
|
|
@ -2146,11 +2204,25 @@ def _make_tool_handler(server_name: str, tool_name: str, tool_timeout: float):
|
||||||
)
|
)
|
||||||
}, ensure_ascii=False)
|
}, ensure_ascii=False)
|
||||||
|
|
||||||
# Collect text from content blocks
|
# Collect text from content blocks. MCP tool results can also
|
||||||
|
# include ImageContent blocks (screenshot / Blockbench / Playwright
|
||||||
|
# etc.); cache those via the gateway's image-cache helper so they
|
||||||
|
# flow through Hermes' MEDIA: tag convention and out to messaging
|
||||||
|
# adapters that render images natively. Without this, image blocks
|
||||||
|
# were silently dropped and the agent got an empty response.
|
||||||
|
#
|
||||||
|
# Distilled from #17915 (c3115644151) and #10848 (gnanirahulnutakki),
|
||||||
|
# both too stale to cherry-pick. #10848's approach (integrate with
|
||||||
|
# Hermes' MEDIA tag + cache_image_from_bytes) was the cleaner of
|
||||||
|
# the two — plugs into existing infrastructure.
|
||||||
parts: List[str] = []
|
parts: List[str] = []
|
||||||
for block in (result.content or []):
|
for block in (result.content or []):
|
||||||
if hasattr(block, "text"):
|
if hasattr(block, "text") and block.text:
|
||||||
parts.append(block.text)
|
parts.append(block.text)
|
||||||
|
continue
|
||||||
|
image_tag = _cache_mcp_image_block(block)
|
||||||
|
if image_tag:
|
||||||
|
parts.append(image_tag)
|
||||||
text_result = "\n".join(parts) if parts else ""
|
text_result = "\n".join(parts) if parts else ""
|
||||||
|
|
||||||
# Combine content + structuredContent when both are present.
|
# Combine content + structuredContent when both are present.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue