hermes-agent/tests/run_agent/test_codex_multimodal_tool_result.py
Teknium 3800972dd0
feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)
When the active main model has native vision and the provider supports
multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini
3, OpenRouter, Nous), vision_analyze loads the image bytes and returns
them to the model as a multimodal tool-result envelope. The model then
sees the pixels directly on its next turn instead of receiving a lossy
text description from an auxiliary LLM.

Falls back to the legacy aux-LLM text path for non-vision models and
unverified providers.

Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and
Cline. All four converge on the same pattern: tool results carry image
content blocks for vision-capable provider/model combinations.

Changes
- tools/vision_tools.py: _vision_analyze_native fast path + provider
  capability table (_supports_media_in_tool_results). Schema description
  updated to reflect new behaviour.
- agent/codex_responses_adapter.py: function_call_output.output now
  accepts the array form for multimodal tool results (was string-only).
  Preflight validates input_text/input_image parts.
- agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so
  tools see the live CLI/gateway override, not the stale config.yaml
  default. set_runtime_main()/clear_runtime_main() helpers.
- run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn
  start so vision_analyze's fast-path check sees the actual runtime.
- tests/conftest.py: clear runtime-main override between tests.

Tests
- tests/tools/test_vision_native_fast_path.py: provider capability
  table, envelope shape, fast-path gating (vision-capable model uses
  fast path; non-vision model falls through to aux).
- tests/run_agent/test_codex_multimodal_tool_result.py: list tool
  content becomes function_call_output.output array; preflight
  preserves arrays and drops unknown part types.

Live verified
- Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a
  typed filepath, gets pixels back, reads exact text from images that
  no aux description could capture (font color irony, multi-line
  fruit-count list, etc.).

PR replaces the closed prior efforts (#16506 shipped the inbound user-
attached path; this PR closes the gap for tool-discovered images).
2026-05-09 21:06:19 -07:00

173 lines
6.6 KiB
Python

"""Tests for codex_responses_adapter multimodal tool-result handling.

Tool messages can contain a list of OpenAI-style content parts
(``[{type:"text"...}, {type:"image_url"...}]``) when the
``vision_analyze`` native fast path returns image bytes for the main model.

This file verifies the Codex Responses adapter:

1. Converts that list into ``function_call_output.output`` as an array of
   ``input_text``/``input_image`` items (not a stringified blob).
2. Preserves array-shaped output through the preflight validator.
3. Drops unknown part types during preflight, and normalises an array with
   no valid parts to an empty string.
"""
from __future__ import annotations
from agent.codex_responses_adapter import (
_chat_messages_to_responses_input,
_preflight_codex_input_items,
)
def _build_messages_with_multimodal_tool_result():
    """Return a three-turn chat transcript ending in a multimodal tool reply.

    The transcript consists of a user question, an assistant turn that issues
    one ``vision_analyze`` tool call (id ``call_abc``), and the matching tool
    message whose ``content`` is a list of content parts: one ``text`` part
    and one ``image_url`` part carrying a data URL.

    A new structure is built on every call, so callers may mutate the result.
    """
    user_turn = {"role": "user", "content": "What's in /tmp/foo.png?"}

    vision_call = {
        "id": "call_abc",
        "type": "function",
        "function": {
            "name": "vision_analyze",
            "arguments": '{"image_url": "/tmp/foo.png", "question": "describe"}',
        },
    }
    assistant_turn = {
        "role": "assistant",
        "content": "",
        "tool_calls": [vision_call],
    }

    # List-shaped content is what distinguishes this fixture from a plain
    # string tool result.
    result_parts = [
        {"type": "text", "text": "Image loaded."},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,XYZ"}},
    ]
    tool_turn = {
        "role": "tool",
        "name": "vision_analyze",
        "tool_call_id": "call_abc",
        "content": result_parts,
    }

    return [user_turn, assistant_turn, tool_turn]
class TestMultimodalToolResultConversion:
    """Chat-message to Responses-input conversion of tool results."""

    def test_list_content_becomes_output_array(self):
        """List-valued tool content is emitted as an array-form ``output``."""
        converted = _chat_messages_to_responses_input(
            _build_messages_with_multimodal_tool_result()
        )

        # Locate the function_call_output item produced for the tool turn.
        call_outputs = [
            item for item in converted
            if item.get("type") == "function_call_output"
        ]
        assert len(call_outputs) == 1

        result = call_outputs[0]
        assert result["call_id"] == "call_abc"

        # The payload must be a list (array form), not a string.
        payload = result["output"]
        assert isinstance(payload, list), (
            f"Expected array output for multimodal tool result, got {type(payload).__name__}: {payload!r}"
        )

        part_types = [part.get("type") for part in payload]
        for expected_type in ("input_text", "input_image"):
            assert expected_type in part_types

    def test_input_image_preserves_data_url(self):
        """The image part keeps the original data URL as ``image_url``."""
        converted = _chat_messages_to_responses_input(
            _build_messages_with_multimodal_tool_result()
        )
        result = next(
            item for item in converted
            if item.get("type") == "function_call_output"
        )

        images = [
            part for part in result["output"]
            if part.get("type") == "input_image"
        ]
        assert len(images) == 1
        assert images[0]["image_url"] == "data:image/png;base64,XYZ"

    def test_string_tool_content_still_string_output(self):
        """Plain string tool content stays a string after conversion."""
        terminal_call = {
            "id": "call_x", "type": "function",
            "function": {"name": "terminal", "arguments": "{}"},
        }
        transcript = [
            {"role": "user", "content": "hi"},
            {"role": "assistant", "content": "", "tool_calls": [terminal_call]},
            {
                "role": "tool", "name": "terminal", "tool_call_id": "call_x",
                "content": "ls output here",
            },
        ]

        converted = _chat_messages_to_responses_input(transcript)
        result = next(
            item for item in converted
            if item.get("type") == "function_call_output"
        )

        payload = result["output"]
        assert isinstance(payload, str)
        assert payload == "ls output here"
class TestPreflightAcceptsArrayOutput:
    """Preflight handling of ``function_call_output.output`` values."""

    @staticmethod
    def _preflight_output(call_id, tool_name, output):
        """Run preflight on one call/output pair and return the ``output``.

        Builds a ``function_call`` item followed by a ``function_call_output``
        item sharing ``call_id``, passes both through
        ``_preflight_codex_input_items``, and returns the ``output`` field of
        the first ``function_call_output`` item in the result.
        """
        raw = [
            {
                "type": "function_call",
                "call_id": call_id,
                "name": tool_name,
                "arguments": "{}",
            },
            {
                "type": "function_call_output",
                "call_id": call_id,
                "output": output,
            },
        ]
        normalized = _preflight_codex_input_items(raw)
        matches = [
            item for item in normalized
            if item.get("type") == "function_call_output"
        ]
        return matches[0]["output"]

    def test_preflight_passes_array_through(self):
        """A valid two-part array keeps its length and image payload."""
        parts = [
            {"type": "input_text", "text": "Image loaded."},
            {"type": "input_image", "image_url": "data:image/png;base64,ABC"},
        ]
        payload = self._preflight_output("call_abc", "vision_analyze", parts)

        assert isinstance(payload, list)
        assert len(payload) == 2
        image_part = payload[1]
        assert image_part["type"] == "input_image"
        assert image_part["image_url"] == "data:image/png;base64,ABC"

    def test_preflight_drops_unknown_part_types(self):
        """Parts with an unrecognised type are removed; valid ones remain."""
        parts = [
            {"type": "input_text", "text": "ok"},
            {"type": "garbage", "data": "nope"},  # unknown — should be dropped
            {"type": "input_image", "image_url": "data:image/png;base64,ZZ"},
        ]
        payload = self._preflight_output("call_abc", "vision_analyze", parts)

        # Only the valid parts remain, in their original order.
        remaining_types = [part.get("type") for part in payload]
        assert remaining_types == ["input_text", "input_image"]

    def test_preflight_empty_array_becomes_empty_string(self):
        """An array whose parts are all dropped is normalised to ``""``."""
        # Defensive: an array with no valid parts shouldn't break the API call.
        parts = [{"type": "garbage"}]  # all dropped
        payload = self._preflight_output("call_x", "vision_analyze", parts)

        assert payload == ""

    def test_preflight_string_output_unchanged(self):
        """A plain string output is returned as-is."""
        payload = self._preflight_output(
            "call_x", "terminal", "plain text output"
        )

        assert payload == "plain text output"