mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-20 05:01:30 +00:00
feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)
When the active main model has native vision and the provider supports multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini 3, OpenRouter, Nous), vision_analyze loads the image bytes and returns them to the model as a multimodal tool-result envelope. The model then sees the pixels directly on its next turn instead of receiving a lossy text description from an auxiliary LLM. Falls back to the legacy aux-LLM text path for non-vision models and unverified providers. Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and Cline. All four converge on the same pattern: tool results carry image content blocks for vision-capable provider/model combinations. Changes - tools/vision_tools.py: _vision_analyze_native fast path + provider capability table (_supports_media_in_tool_results). Schema description updated to reflect new behaviour. - agent/codex_responses_adapter.py: function_call_output.output now accepts the array form for multimodal tool results (was string-only). Preflight validates input_text/input_image parts. - agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so tools see the live CLI/gateway override, not the stale config.yaml default. set_runtime_main()/clear_runtime_main() helpers. - run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn start so vision_analyze's fast-path check sees the actual runtime. - tests/conftest.py: clear runtime-main override between tests. Tests - tests/tools/test_vision_native_fast_path.py: provider capability table, envelope shape, fast-path gating (vision-capable model uses fast path; non-vision model falls through to aux). - tests/run_agent/test_codex_multimodal_tool_result.py: list tool content becomes function_call_output.output array; preflight preserves arrays and drops unknown part types. 
Live verified - Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a typed filepath, gets pixels back, reads exact text from images that no aux description could capture (font color irony, multi-line fruit-count list, etc.). PR replaces the closed prior efforts (#16506 shipped the inbound user-attached path; this PR closes the gap for tool-discovered images).
This commit is contained in:
parent
e62250453b
commit
3800972dd0
7 changed files with 757 additions and 10 deletions
|
|
@ -427,6 +427,15 @@ def _reset_module_state():
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
# --- agent.auxiliary_client — runtime main provider/model override ---
|
||||
# Set per-turn by AIAgent.run_conversation; tests that import it must
|
||||
# see a clean state so config.yaml fallback works as expected.
|
||||
try:
|
||||
from agent import auxiliary_client as _aux_mod
|
||||
_aux_mod.clear_runtime_main()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# --- tools.file_tools — per-task read history + file-ops cache ---
|
||||
# _read_tracker accumulates per-task_id read history for loop detection,
|
||||
# capped by _READ_HISTORY_CAP. If entries from a prior test persist, the
|
||||
|
|
|
|||
173
tests/run_agent/test_codex_multimodal_tool_result.py
Normal file
173
tests/run_agent/test_codex_multimodal_tool_result.py
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
"""Tests for codex_responses_adapter multimodal tool-result handling.
|
||||
|
||||
Tool messages can contain a list of OpenAI-style content parts
|
||||
(``[{type:"text"...}, {type:"image_url"...}]``) when the
|
||||
``vision_analyze`` native fast path returns image bytes for the main model.
|
||||
This file verifies the Codex Responses adapter:
|
||||
|
||||
1. Converts that list into ``function_call_output.output`` as an array of
|
||||
``input_text``/``input_image`` items (not a stringified blob).
|
||||
2. Preserves array-shaped output through the preflight validator.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from agent.codex_responses_adapter import (
|
||||
_chat_messages_to_responses_input,
|
||||
_preflight_codex_input_items,
|
||||
)
|
||||
|
||||
|
||||
def _build_messages_with_multimodal_tool_result():
    """Return a three-message chat transcript ending in a multimodal tool result.

    The transcript is: a user question, an assistant turn that issues a single
    ``vision_analyze`` tool call (id ``call_abc``), and the matching tool
    message whose ``content`` is a list of content parts (one text part, one
    ``image_url`` part carrying a data URL) rather than a plain string.
    """
    user_turn = {"role": "user", "content": "What's in /tmp/foo.png?"}

    vision_call = {
        "id": "call_abc",
        "type": "function",
        "function": {
            "name": "vision_analyze",
            "arguments": '{"image_url": "/tmp/foo.png", "question": "describe"}',
        },
    }
    assistant_turn = {
        "role": "assistant",
        "content": "",
        "tool_calls": [vision_call],
    }

    # List-shaped content is the point of this fixture: text part first,
    # then the image part.
    tool_parts = [
        {"type": "text", "text": "Image loaded."},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,XYZ"}},
    ]
    tool_turn = {
        "role": "tool",
        "name": "vision_analyze",
        "tool_call_id": "call_abc",
        "content": tool_parts,
    }

    return [user_turn, assistant_turn, tool_turn]
|
||||
|
||||
|
||||
class TestMultimodalToolResultConversion:
    """Check how ``_chat_messages_to_responses_input`` maps tool-message content.

    List-shaped tool content must come out as an array under
    ``function_call_output.output``; string content must stay a string.
    """

    def test_list_content_becomes_output_array(self):
        converted = _chat_messages_to_responses_input(
            _build_messages_with_multimodal_tool_result()
        )
        # Exactly one function_call_output item is expected for the one tool message.
        call_outputs = [
            item for item in converted if item.get("type") == "function_call_output"
        ]
        assert len(call_outputs) == 1
        call_output = call_outputs[0]
        assert call_output["call_id"] == "call_abc"
        # The output must be the array form, not a stringified blob.
        assert isinstance(call_output["output"], list), (
            f"Expected array output for multimodal tool result, got {type(call_output['output']).__name__}: {call_output['output']!r}"
        )
        part_kinds = [part.get("type") for part in call_output["output"]]
        for expected_kind in ("input_text", "input_image"):
            assert expected_kind in part_kinds

    def test_input_image_preserves_data_url(self):
        converted = _chat_messages_to_responses_input(
            _build_messages_with_multimodal_tool_result()
        )
        call_output = next(
            item for item in converted if item.get("type") == "function_call_output"
        )
        images = [
            part for part in call_output["output"] if part.get("type") == "input_image"
        ]
        assert len(images) == 1
        assert images[0]["image_url"] == "data:image/png;base64,XYZ"

    def test_string_tool_content_still_string_output(self):
        terminal_call = {
            "id": "call_x", "type": "function",
            "function": {"name": "terminal", "arguments": "{}"},
        }
        transcript = [
            {"role": "user", "content": "hi"},
            {"role": "assistant", "content": "", "tool_calls": [terminal_call]},
            {
                "role": "tool", "name": "terminal", "tool_call_id": "call_x",
                "content": "ls output here",
            },
        ]
        converted = _chat_messages_to_responses_input(transcript)
        call_output = next(
            item for item in converted if item.get("type") == "function_call_output"
        )
        assert isinstance(call_output["output"], str)
        assert call_output["output"] == "ls output here"
|
||||
|
||||
|
||||
class TestPreflightAcceptsArrayOutput:
    """Check ``_preflight_codex_input_items`` on ``function_call_output.output``.

    Covers array-shaped output kept as an array, unknown part types removed,
    an array with no valid parts collapsing to ``""``, and string output left
    untouched.
    """

    @staticmethod
    def _call_with_output(call_id, tool_name, output):
        """Build a raw ``function_call`` + ``function_call_output`` item pair."""
        return [
            {
                "type": "function_call",
                "call_id": call_id,
                "name": tool_name,
                "arguments": "{}",
            },
            {
                "type": "function_call_output",
                "call_id": call_id,
                "output": output,
            },
        ]

    @staticmethod
    def _preflighted_output_item(raw_items):
        """Run preflight and return the first ``function_call_output`` item."""
        normalized = _preflight_codex_input_items(raw_items)
        matches = [
            item for item in normalized if item.get("type") == "function_call_output"
        ]
        return matches[0]

    def test_preflight_passes_array_through(self):
        raw = self._call_with_output(
            "call_abc",
            "vision_analyze",
            [
                {"type": "input_text", "text": "Image loaded."},
                {"type": "input_image", "image_url": "data:image/png;base64,ABC"},
            ],
        )
        item = self._preflighted_output_item(raw)
        assert isinstance(item["output"], list)
        assert len(item["output"]) == 2
        image_part = item["output"][1]
        assert image_part["type"] == "input_image"
        assert image_part["image_url"] == "data:image/png;base64,ABC"

    def test_preflight_drops_unknown_part_types(self):
        raw = self._call_with_output(
            "call_abc",
            "vision_analyze",
            [
                {"type": "input_text", "text": "ok"},
                {"type": "garbage", "data": "nope"},  # unknown — should be dropped
                {"type": "input_image", "image_url": "data:image/png;base64,ZZ"},
            ],
        )
        item = self._preflighted_output_item(raw)
        # Only the two recognised parts survive, in their original order.
        surviving_kinds = [part.get("type") for part in item["output"]]
        assert surviving_kinds == ["input_text", "input_image"]

    def test_preflight_empty_array_becomes_empty_string(self):
        # Defensive: an array with no valid parts shouldn't break the API call
        raw = self._call_with_output(
            "call_x",
            "vision_analyze",
            [{"type": "garbage"}],  # all dropped
        )
        item = self._preflighted_output_item(raw)
        assert item["output"] == ""

    def test_preflight_string_output_unchanged(self):
        raw = self._call_with_output("call_x", "terminal", "plain text output")
        item = self._preflighted_output_item(raw)
        assert item["output"] == "plain text output"
|
||||
207
tests/tools/test_vision_native_fast_path.py
Normal file
207
tests/tools/test_vision_native_fast_path.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""Tests for the native-vision fast path inside vision_analyze.
|
||||
|
||||
When the active main model supports native vision AND the provider supports
|
||||
image content inside tool-result messages, ``_handle_vision_analyze`` skips
|
||||
the auxiliary LLM and returns a multimodal envelope so the main model sees
|
||||
the pixels directly on its next turn.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from tools.vision_tools import (
|
||||
_build_native_vision_tool_result,
|
||||
_handle_vision_analyze,
|
||||
_supports_media_in_tool_results,
|
||||
_vision_analyze_native,
|
||||
)
|
||||
|
||||
|
||||
# Raw bytes of a minimal valid 1x1 PNG, decoded once at import time so tests
# can write a real image file without shipping a binary fixture.
_TINY_PNG = base64.b64decode(
    b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk"
    b"YAAAAAYAAjCB0C8AAAAASUVORK5CYII="
)
|
||||
|
||||
|
||||
# ─── _supports_media_in_tool_results ─────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSupportsMediaInToolResults:
    """Pin the provider/model answers of ``_supports_media_in_tool_results``.

    Each test checks one (provider, model) pair against the exact boolean
    the capability check is expected to return.
    """

    def test_anthropic_native_yes(self):
        supported = _supports_media_in_tool_results("anthropic", "claude-opus-4-6")
        assert supported is True

    def test_openrouter_yes(self):
        supported = _supports_media_in_tool_results(
            "openrouter", "anthropic/claude-opus-4.6"
        )
        assert supported is True

    def test_nous_yes(self):
        supported = _supports_media_in_tool_results(
            "nous", "anthropic/claude-sonnet-4.6"
        )
        assert supported is True

    def test_openai_chat_yes(self):
        supported = _supports_media_in_tool_results("openai", "gpt-5.4")
        assert supported is True

    def test_openai_codex_yes(self):
        supported = _supports_media_in_tool_results("openai-codex", "gpt-5-codex")
        assert supported is True

    def test_gemini_3_yes(self):
        supported = _supports_media_in_tool_results("google", "gemini-3-flash-preview")
        assert supported is True

    def test_gemini_2_no(self):
        supported = _supports_media_in_tool_results("google", "gemini-2.5-pro")
        assert supported is False

    def test_unknown_provider_conservative_no(self):
        supported = _supports_media_in_tool_results("brand-new-provider", "any-model")
        assert supported is False

    def test_empty_provider_no(self):
        # Both the empty string and None must be rejected, in that order.
        for missing_provider in ("", None):
            assert _supports_media_in_tool_results(missing_provider, "anything") is False  # type: ignore[arg-type]
|
||||
|
||||
|
||||
# ─── _build_native_vision_tool_result ────────────────────────────────────────
|
||||
|
||||
|
||||
class TestBuildNativeVisionToolResult:
    """Check the envelope produced by ``_build_native_vision_tool_result``."""

    def test_envelope_shape(self):
        envelope = _build_native_vision_tool_result(
            image_url="/tmp/foo.png",
            question="what does it say?",
            image_data_url="data:image/png;base64,XYZ",
            image_size_bytes=1024,
        )
        assert envelope["_multimodal"] is True

        parts = envelope["content"]
        assert isinstance(parts, list)
        assert len(parts) == 2

        # Text part comes first, image part second.
        text_part, image_part = parts
        assert text_part["type"] == "text"
        assert image_part["type"] == "image_url"
        assert image_part["image_url"]["url"] == "data:image/png;base64,XYZ"
        assert "what does it say?" in text_part["text"]
        assert "Image attached natively" in envelope["text_summary"]

    def test_no_question_omits_question_section(self):
        envelope = _build_native_vision_tool_result(
            image_url="/tmp/foo.png",
            question="",
            image_data_url="data:image/png;base64,XYZ",
            image_size_bytes=512,
        )
        leading_text = envelope["content"][0]["text"]
        assert "Question:" not in leading_text
        assert "Image loaded" in leading_text
|
||||
|
||||
|
||||
# ─── _vision_analyze_native ──────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestVisionAnalyzeNative:
    """Exercise the ``_vision_analyze_native`` coroutine directly.

    Success cases expect a dict envelope flagged ``_multimodal``; failure
    cases expect a JSON string with ``success`` false and an ``error`` message.
    """

    @staticmethod
    def _run(coro):
        """Drive *coro* to completion on a private, short-lived event loop.

        ``asyncio.get_event_loop()`` relies on an ambient "current" loop: it is
        deprecated when none is set, raises ``RuntimeError`` on newer Python
        versions, and also fails once any earlier test has called
        ``asyncio.run()``. Creating and closing our own loop makes these tests
        independent of that global state, and does not alter the current loop
        for other tests.

        Returns whatever the coroutine returns; exceptions propagate unchanged.
        """
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(coro)
        finally:
            loop.close()

    def test_local_file_returns_multimodal_envelope(self, tmp_path):
        img = tmp_path / "test.png"
        img.write_bytes(_TINY_PNG)
        result = self._run(_vision_analyze_native(str(img), "what is this?"))
        assert isinstance(result, dict)
        assert result.get("_multimodal") is True
        parts = result["content"]
        assert any(p.get("type") == "image_url" for p in parts)
        assert any(p.get("type") == "text" for p in parts)
        url = next(p["image_url"]["url"] for p in parts if p.get("type") == "image_url")
        assert url.startswith("data:image/")

    def test_missing_file_returns_error_string(self, tmp_path):
        result = self._run(_vision_analyze_native(str(tmp_path / "nope.png"), "?"))
        # tool_error returns a JSON string, not the multimodal envelope
        assert isinstance(result, str)
        parsed = json.loads(result)
        assert parsed.get("success") is False
        assert "Invalid image source" in parsed.get("error", "")

    def test_empty_image_url_returns_error(self):
        result = self._run(_vision_analyze_native("", "?"))
        assert isinstance(result, str)
        parsed = json.loads(result)
        assert parsed.get("success") is False
        assert "image_url is required" in parsed.get("error", "")

    def test_file_url_scheme_resolves(self, tmp_path):
        img = tmp_path / "t.png"
        img.write_bytes(_TINY_PNG)
        result = self._run(_vision_analyze_native(f"file://{img}", "?"))
        assert isinstance(result, dict)
        assert result.get("_multimodal") is True
|
||||
|
||||
|
||||
# ─── _handle_vision_analyze fast-path gating ─────────────────────────────────
|
||||
|
||||
|
||||
class TestHandleVisionAnalyzeFastPath:
    """Verify the dispatcher chooses fast-path vs aux-LLM correctly.

    Each test sets the runtime main provider/model via ``set_runtime_main``,
    calls ``_handle_vision_analyze`` on a real temp PNG, and always restores
    the override with ``clear_runtime_main`` in a ``finally``.
    """

    @staticmethod
    def _run(coro):
        """Drive *coro* to completion on a private, short-lived event loop.

        ``asyncio.get_event_loop()`` relies on an ambient "current" loop: it is
        deprecated when none is set, raises ``RuntimeError`` on newer Python
        versions, and also fails once any earlier test has called
        ``asyncio.run()``. Creating and closing our own loop makes these tests
        independent of that global state, and does not alter the current loop
        for other tests.

        Returns whatever the coroutine returns; exceptions propagate unchanged.
        """
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(coro)
        finally:
            loop.close()

    def test_vision_capable_main_model_uses_fast_path(self, tmp_path, monkeypatch):
        """Main model supports native vision → fast path returns multimodal."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        # Set runtime override so the handler thinks we're on opus@openrouter
        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
        set_runtime_main("openrouter", "anthropic/claude-opus-4.6")
        try:
            coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
            result = self._run(coro)
        finally:
            clear_runtime_main()

        assert isinstance(result, dict), \
            f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
        assert result.get("_multimodal") is True

    def test_non_vision_main_model_falls_through_to_aux(self, tmp_path, monkeypatch):
        """Non-vision main model → fast path skipped, aux LLM path attempted."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        # Stand-in for the aux-LLM entry point so no real model call is made.
        async def _aux_sentinel(*args, **kwargs):
            return '{"sentinel": "aux-path"}'

        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
        set_runtime_main("openrouter", "qwen/qwen3-coder")
        try:
            with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
                coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
                result = self._run(coro)
        finally:
            clear_runtime_main()

        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
            "Fast path fired for non-vision model; should have fallen through to aux LLM"

    def test_fast_path_disabled_for_unsupported_provider(self, tmp_path, monkeypatch):
        """Even with vision-capable model, unknown provider → fall through."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        # Stand-in for the aux-LLM entry point so no real model call is made.
        async def _aux_sentinel(*args, **kwargs):
            return '{"sentinel": "aux-path"}'

        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
        set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6")
        try:
            with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
                coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
                result = self._run(coro)
        finally:
            clear_runtime_main()

        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
            "Fast path fired for unknown provider; should have fallen through"
|
||||
Loading…
Add table
Add a link
Reference in a new issue