mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
In native image mode (vision-capable models like gpt-4o, claude-sonnet-4),
build_native_content_parts() previously emitted only the user's caption
plus image_url parts. The local file path of each attached image never
appeared in the conversation text, so the model could see the pixels but
had no string handle for tools that take image_url: str (custom MCP
tools, vision_analyze on a re-look, attach-to-tracker workflows).
The text-mode path already injects an equivalent hint via
Runner._enrich_message_with_vision ("...vision_analyze using image_url:
<path>..."). This brings native mode to parity by appending one
"[Image attached at: <path>]" line per successfully attached image to
the user-text part of the multimodal turn. Skipped (unreadable) paths
are NOT advertised, so the model is never told a non-existent file is
attached.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
256 lines
12 KiB
Python
256 lines
12 KiB
Python
"""Tests for agent/image_routing.py — the per-turn image input mode decision."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from agent.image_routing import (
|
|
_coerce_mode,
|
|
_explicit_aux_vision_override,
|
|
build_native_content_parts,
|
|
decide_image_input_mode,
|
|
)
|
|
|
|
|
|
# ─── _coerce_mode ────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestCoerceMode:
|
|
def test_valid_modes_pass_through(self):
|
|
assert _coerce_mode("auto") == "auto"
|
|
assert _coerce_mode("native") == "native"
|
|
assert _coerce_mode("text") == "text"
|
|
|
|
def test_case_insensitive(self):
|
|
assert _coerce_mode("NATIVE") == "native"
|
|
assert _coerce_mode("Auto") == "auto"
|
|
|
|
def test_invalid_falls_back_to_auto(self):
|
|
assert _coerce_mode("nonsense") == "auto"
|
|
assert _coerce_mode("") == "auto"
|
|
assert _coerce_mode(None) == "auto"
|
|
assert _coerce_mode(42) == "auto"
|
|
|
|
def test_strips_whitespace(self):
|
|
assert _coerce_mode(" native ") == "native"
|
|
|
|
|
|
# ─── _explicit_aux_vision_override ───────────────────────────────────────────
|
|
|
|
|
|
class TestExplicitAuxVisionOverride:
|
|
def test_none_config(self):
|
|
assert _explicit_aux_vision_override(None) is False
|
|
|
|
def test_empty_config(self):
|
|
assert _explicit_aux_vision_override({}) is False
|
|
|
|
def test_default_auto_is_not_explicit(self):
|
|
cfg = {"auxiliary": {"vision": {"provider": "auto", "model": "", "base_url": ""}}}
|
|
assert _explicit_aux_vision_override(cfg) is False
|
|
|
|
def test_provider_set_is_explicit(self):
|
|
cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": ""}}}
|
|
assert _explicit_aux_vision_override(cfg) is True
|
|
|
|
def test_model_set_is_explicit(self):
|
|
cfg = {"auxiliary": {"vision": {"provider": "auto", "model": "google/gemini-2.5-flash"}}}
|
|
assert _explicit_aux_vision_override(cfg) is True
|
|
|
|
def test_base_url_set_is_explicit(self):
|
|
cfg = {"auxiliary": {"vision": {"provider": "auto", "base_url": "http://localhost:11434"}}}
|
|
assert _explicit_aux_vision_override(cfg) is True
|
|
|
|
|
|
# ─── decide_image_input_mode ─────────────────────────────────────────────────
|
|
|
|
|
|
class TestDecideImageInputMode:
|
|
def test_explicit_native_overrides_everything(self):
|
|
cfg = {"agent": {"image_input_mode": "native"}}
|
|
# Non-vision model, aux-vision explicitly configured: native still wins.
|
|
cfg["auxiliary"] = {"vision": {"provider": "openrouter", "model": "foo"}}
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=False):
|
|
assert decide_image_input_mode("openrouter", "some-non-vision-model", cfg) == "native"
|
|
|
|
def test_explicit_text_overrides_everything(self):
|
|
cfg = {"agent": {"image_input_mode": "text"}}
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
|
|
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text"
|
|
|
|
def test_auto_with_vision_capable_model(self):
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
|
|
assert decide_image_input_mode("anthropic", "claude-sonnet-4", {}) == "native"
|
|
|
|
def test_auto_with_non_vision_model(self):
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=False):
|
|
assert decide_image_input_mode("openrouter", "qwen/qwen3-235b", {}) == "text"
|
|
|
|
def test_auto_with_unknown_model(self):
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=None):
|
|
assert decide_image_input_mode("openrouter", "brand-new-slug", {}) == "text"
|
|
|
|
def test_auto_respects_aux_vision_override_even_for_vision_model(self):
|
|
"""If the user configured a dedicated vision backend, don't bypass it."""
|
|
cfg = {"auxiliary": {"vision": {"provider": "openrouter", "model": "google/gemini-2.5-flash"}}}
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
|
|
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "text"
|
|
|
|
def test_none_config_is_auto(self):
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
|
|
assert decide_image_input_mode("anthropic", "claude-sonnet-4", None) == "native"
|
|
|
|
def test_invalid_mode_coerces_to_auto(self):
|
|
cfg = {"agent": {"image_input_mode": "weird-value"}}
|
|
with patch("agent.image_routing._lookup_supports_vision", return_value=True):
|
|
assert decide_image_input_mode("anthropic", "claude-sonnet-4", cfg) == "native"
|
|
|
|
|
|
# ─── build_native_content_parts ──────────────────────────────────────────────
|
|
|
|
|
|
def _png_bytes() -> bytes:
|
|
"""Return a tiny valid 1x1 transparent PNG."""
|
|
return base64.b64decode(
|
|
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGNgYGBgAAAABQABpfZFQAAAAABJRU5ErkJggg=="
|
|
)
|
|
|
|
|
|
class TestBuildNativeContentParts:
|
|
def test_text_then_image(self, tmp_path: Path):
|
|
img = tmp_path / "cat.png"
|
|
img.write_bytes(_png_bytes())
|
|
parts, skipped = build_native_content_parts("hello", [str(img)])
|
|
assert skipped == []
|
|
assert len(parts) == 2
|
|
assert parts[0]["type"] == "text"
|
|
# User caption is preserved and a per-image path hint is appended so
|
|
# the model can use the local path as a string argument for tools
|
|
# that take ``image_url: str`` (issue #18960).
|
|
assert parts[0]["text"] == f"hello\n\n[Image attached at: {img}]"
|
|
assert parts[1]["type"] == "image_url"
|
|
assert parts[1]["image_url"]["url"].startswith("data:image/png;base64,")
|
|
|
|
def test_empty_text_inserts_default_prompt(self, tmp_path: Path):
|
|
img = tmp_path / "cat.jpg"
|
|
img.write_bytes(_png_bytes())
|
|
parts, skipped = build_native_content_parts("", [str(img)])
|
|
assert skipped == []
|
|
# Even with empty user text, we insert a neutral prompt so the turn
|
|
# isn't just pixels, and the path hint is appended after.
|
|
assert parts[0]["type"] == "text"
|
|
assert parts[0]["text"] == (
|
|
f"What do you see in this image?\n\n[Image attached at: {img}]"
|
|
)
|
|
assert parts[1]["type"] == "image_url"
|
|
|
|
def test_missing_file_is_skipped(self, tmp_path: Path):
|
|
parts, skipped = build_native_content_parts("hi", [str(tmp_path / "missing.png")])
|
|
assert skipped == [str(tmp_path / "missing.png")]
|
|
# Skipped paths are NOT advertised in the path hints — the model
|
|
# would otherwise be told a non-existent file is attached.
|
|
assert parts == [{"type": "text", "text": "hi"}]
|
|
|
|
def test_path_hint_appended(self, tmp_path: Path):
|
|
"""The local path of each attached image is appended to the user
|
|
text part so MCP/skill tools that take ``image_url: str`` can be
|
|
invoked on the same image (issue #18960). Mirrors text-mode
|
|
behaviour (`Runner._enrich_message_with_vision`).
|
|
"""
|
|
img = tmp_path / "scan.png"
|
|
img.write_bytes(_png_bytes())
|
|
parts, _ = build_native_content_parts("attach this", [str(img)])
|
|
text_part = next(p for p in parts if p.get("type") == "text")
|
|
assert "[Image attached at:" in text_part["text"]
|
|
assert str(img) in text_part["text"]
|
|
# User caption is preserved verbatim ahead of the hint.
|
|
assert text_part["text"].startswith("attach this")
|
|
|
|
def test_path_hint_one_per_attached_image(self, tmp_path: Path):
|
|
"""Each successfully attached image gets its own path hint line;
|
|
skipped images do NOT appear in the hints.
|
|
"""
|
|
good = tmp_path / "good.png"
|
|
good.write_bytes(_png_bytes())
|
|
missing = tmp_path / "missing.png" # never created
|
|
parts, skipped = build_native_content_parts(
|
|
"see attached", [str(good), str(missing)]
|
|
)
|
|
assert skipped == [str(missing)]
|
|
text_part = next(p for p in parts if p.get("type") == "text")
|
|
assert text_part["text"].count("[Image attached at:") == 1
|
|
assert str(good) in text_part["text"]
|
|
assert str(missing) not in text_part["text"]
|
|
|
|
def test_multiple_images(self, tmp_path: Path):
|
|
img1 = tmp_path / "a.png"
|
|
img2 = tmp_path / "b.png"
|
|
img1.write_bytes(_png_bytes())
|
|
img2.write_bytes(_png_bytes())
|
|
parts, skipped = build_native_content_parts("compare these", [str(img1), str(img2)])
|
|
assert skipped == []
|
|
image_parts = [p for p in parts if p.get("type") == "image_url"]
|
|
assert len(image_parts) == 2
|
|
# Both paths surface in the text part, one per line.
|
|
text_part = next(p for p in parts if p.get("type") == "text")
|
|
assert text_part["text"].count("[Image attached at:") == 2
|
|
assert str(img1) in text_part["text"]
|
|
assert str(img2) in text_part["text"]
|
|
|
|
def test_mime_inference_jpg(self, tmp_path: Path):
|
|
img = tmp_path / "photo.jpg"
|
|
img.write_bytes(_png_bytes()) # bytes are PNG but extension is jpg
|
|
parts, _ = build_native_content_parts("x", [str(img)])
|
|
url = parts[1]["image_url"]["url"]
|
|
assert url.startswith("data:image/jpeg;base64,")
|
|
|
|
def test_mime_inference_webp(self, tmp_path: Path):
|
|
img = tmp_path / "pic.webp"
|
|
img.write_bytes(_png_bytes())
|
|
parts, _ = build_native_content_parts("", [str(img)])
|
|
url = parts[1]["image_url"]["url"]
|
|
assert url.startswith("data:image/webp;base64,")
|
|
|
|
|
|
# ─── Oversize handling ───────────────────────────────────────────────────────
|
|
|
|
|
|
class TestLargeImageHandling:
|
|
"""Large images attach at native size; shrink is handled reactively at
|
|
retry time in ``run_agent._try_shrink_image_parts_in_messages`` rather
|
|
than proactively here.
|
|
"""
|
|
|
|
def test_large_image_passes_through_unchanged(self, tmp_path: Path):
|
|
"""A multi-MB image is attached as-is — no resize, no skip."""
|
|
from agent import image_routing as _ir
|
|
|
|
img = tmp_path / "medium.png"
|
|
# 200 KB of real bytes; not huge but enough to verify no size gate fires.
|
|
img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"X" * 200_000)
|
|
url = _ir._file_to_data_url(img)
|
|
assert url is not None
|
|
assert url.startswith("data:image/png;base64,")
|
|
# Base64 expansion means output is ~4/3 of input, plus header.
|
|
assert len(url) > 200_000
|
|
|
|
def test_missing_file_returns_none(self, tmp_path: Path):
|
|
from agent import image_routing as _ir
|
|
missing = tmp_path / "does_not_exist.png"
|
|
assert _ir._file_to_data_url(missing) is None
|
|
|
|
def test_build_native_parts_no_provider_kwarg(self, tmp_path: Path):
|
|
"""build_native_content_parts takes text + paths, no provider kwarg."""
|
|
from agent import image_routing as _ir
|
|
|
|
img = tmp_path / "cat.png"
|
|
img.write_bytes(_png_bytes())
|
|
parts, skipped = _ir.build_native_content_parts("hi", [str(img)])
|
|
assert skipped == []
|
|
assert len(parts) == 2
|
|
assert parts[0]["type"] == "text"
|
|
assert parts[1]["type"] == "image_url"
|