mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-23 05:31:23 +00:00
feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)
When the active main model has native vision and the provider supports multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini 3, OpenRouter, Nous), vision_analyze loads the image bytes and returns them to the model as a multimodal tool-result envelope. The model then sees the pixels directly on its next turn instead of receiving a lossy text description from an auxiliary LLM. Falls back to the legacy aux-LLM text path for non-vision models and unverified providers.

Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and Cline. All four converge on the same pattern: tool results carry image content blocks for vision-capable provider/model combinations.

Changes
- tools/vision_tools.py: _vision_analyze_native fast path + provider capability table (_supports_media_in_tool_results). Schema description updated to reflect new behaviour.
- agent/codex_responses_adapter.py: function_call_output.output now accepts the array form for multimodal tool results (was string-only). Preflight validates input_text/input_image parts.
- agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so tools see the live CLI/gateway override, not the stale config.yaml default. set_runtime_main()/clear_runtime_main() helpers.
- run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn start so vision_analyze's fast-path check sees the actual runtime.
- tests/conftest.py: clear runtime-main override between tests.

Tests
- tests/tools/test_vision_native_fast_path.py: provider capability table, envelope shape, fast-path gating (vision-capable model uses fast path; non-vision model falls through to aux).
- tests/run_agent/test_codex_multimodal_tool_result.py: list tool content becomes function_call_output.output array; preflight preserves arrays and drops unknown part types.
Live verified
- Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a typed filepath, gets pixels back, reads exact text from images that no aux description could capture (font color irony, multi-line fruit-count list, etc.).

PR replaces the closed prior efforts (#16506 shipped the inbound user-attached path; this PR closes the gap for tool-discovered images).
This commit is contained in:
parent
e62250453b
commit
3800972dd0
7 changed files with 757 additions and 10 deletions
|
|
@ -1463,7 +1463,16 @@ def _read_main_model() -> str:
|
||||||
|
|
||||||
config.yaml model.default is the single source of truth for the active
|
config.yaml model.default is the single source of truth for the active
|
||||||
model. Environment variables are no longer consulted.
|
model. Environment variables are no longer consulted.
|
||||||
|
|
||||||
|
Runtime override: when an AIAgent is active with a CLI/gateway-provided
|
||||||
|
model that differs from config.yaml, ``set_runtime_main()`` records the
|
||||||
|
override in a process-local global. This is consulted FIRST so tools
|
||||||
|
that gate on "the active main model" (e.g. ``vision_analyze``'s native
|
||||||
|
fast path) see the live runtime, not the persisted config default.
|
||||||
"""
|
"""
|
||||||
|
override = _RUNTIME_MAIN_MODEL
|
||||||
|
if isinstance(override, str) and override.strip():
|
||||||
|
return override.strip()
|
||||||
try:
|
try:
|
||||||
from hermes_cli.config import load_config
|
from hermes_cli.config import load_config
|
||||||
cfg = load_config()
|
cfg = load_config()
|
||||||
|
|
@ -1484,7 +1493,13 @@ def _read_main_provider() -> str:
|
||||||
|
|
||||||
Returns the lowercase provider id (e.g. "alibaba", "openrouter") or ""
|
Returns the lowercase provider id (e.g. "alibaba", "openrouter") or ""
|
||||||
if not configured.
|
if not configured.
|
||||||
|
|
||||||
|
Runtime override: see ``_read_main_model`` — same mechanism for the
|
||||||
|
provider half of the runtime tuple.
|
||||||
"""
|
"""
|
||||||
|
override = _RUNTIME_MAIN_PROVIDER
|
||||||
|
if isinstance(override, str) and override.strip():
|
||||||
|
return override.strip().lower()
|
||||||
try:
|
try:
|
||||||
from hermes_cli.config import load_config
|
from hermes_cli.config import load_config
|
||||||
cfg = load_config()
|
cfg = load_config()
|
||||||
|
|
@ -1498,6 +1513,32 @@ def _read_main_provider() -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
# Process-local override set by AIAgent at session/turn start. Single-threaded
# per turn — no lock needed. Cleared by ``clear_runtime_main()``.
# An empty string means "no override": the readers then fall back to config.yaml.
_RUNTIME_MAIN_PROVIDER: str = ""
_RUNTIME_MAIN_MODEL: str = ""


def set_runtime_main(provider: str, model: str) -> None:
    """Record the live runtime provider/model for the current AIAgent.

    Called from the AIAgent turn setup in ``run_agent.py`` so that
    ``_read_main_provider`` / ``_read_main_model`` reflect CLI/gateway
    overrides instead of the stale config.yaml default.

    Args:
        provider: Provider id; stored stripped and lower-cased.
        model: Model id; stored stripped (case preserved).

    Anything that is not a string (``None``, enums, objects) is recorded as
    "no override" rather than raising. The caller swallows exceptions, so a
    failure here would otherwise leave the *previous* override silently in
    place and tools would gate on the wrong model.
    """
    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
    # Normalise both halves before assigning so the provider/model pair is
    # always updated together.
    new_provider = provider.strip().lower() if isinstance(provider, str) else ""
    new_model = model.strip() if isinstance(model, str) else ""
    _RUNTIME_MAIN_PROVIDER = new_provider
    _RUNTIME_MAIN_MODEL = new_model


def clear_runtime_main() -> None:
    """Clear the runtime override (e.g. on session end)."""
    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
    _RUNTIME_MAIN_PROVIDER = ""
    _RUNTIME_MAIN_MODEL = ""
|
||||||
|
|
||||||
|
|
||||||
def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
||||||
"""Resolve the active custom/main endpoint the same way the main CLI does.
|
"""Resolve the active custom/main endpoint the same way the main CLI does.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -410,10 +410,29 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
|
||||||
call_id = raw_tool_call_id.strip()
|
call_id = raw_tool_call_id.strip()
|
||||||
if not isinstance(call_id, str) or not call_id.strip():
|
if not isinstance(call_id, str) or not call_id.strip():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Multimodal tool result: convert OpenAI-style content list into
|
||||||
|
# Responses ``function_call_output.output`` array. The Responses
|
||||||
|
# API accepts ``output`` as either a string or an array of
|
||||||
|
# ``input_text``/``input_image`` items. See
|
||||||
|
# https://developers.openai.com/api/reference/python/resources/responses/.
|
||||||
|
tool_content = msg.get("content")
|
||||||
|
output_value: Any
|
||||||
|
if isinstance(tool_content, list):
|
||||||
|
converted = _chat_content_to_responses_parts(
|
||||||
|
tool_content, role="user",
|
||||||
|
)
|
||||||
|
if converted:
|
||||||
|
output_value = converted
|
||||||
|
else:
|
||||||
|
output_value = ""
|
||||||
|
else:
|
||||||
|
output_value = str(tool_content or "")
|
||||||
|
|
||||||
items.append({
|
items.append({
|
||||||
"type": "function_call_output",
|
"type": "function_call_output",
|
||||||
"call_id": call_id,
|
"call_id": call_id,
|
||||||
"output": str(msg.get("content", "") or ""),
|
"output": output_value,
|
||||||
})
|
})
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|
@ -466,6 +485,38 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
|
||||||
output = item.get("output", "")
|
output = item.get("output", "")
|
||||||
if output is None:
|
if output is None:
|
||||||
output = ""
|
output = ""
|
||||||
|
# Output may be a string OR an array of structured content
|
||||||
|
# items (input_text / input_image) for multimodal tool results.
|
||||||
|
# Both shapes are accepted by the Responses API. We preserve
|
||||||
|
# the array form when present.
|
||||||
|
if isinstance(output, list):
|
||||||
|
# Validate each item is a recognised content shape; drop
|
||||||
|
# anything else to avoid 4xx from the API.
|
||||||
|
cleaned: List[Dict[str, Any]] = []
|
||||||
|
for part in output:
|
||||||
|
if not isinstance(part, dict):
|
||||||
|
continue
|
||||||
|
ptype = part.get("type")
|
||||||
|
if ptype == "input_text":
|
||||||
|
text = part.get("text")
|
||||||
|
if isinstance(text, str) and text:
|
||||||
|
cleaned.append({"type": "input_text", "text": text})
|
||||||
|
elif ptype == "input_image":
|
||||||
|
url = part.get("image_url")
|
||||||
|
if isinstance(url, str) and url:
|
||||||
|
entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
|
||||||
|
detail = part.get("detail")
|
||||||
|
if isinstance(detail, str) and detail.strip():
|
||||||
|
entry["detail"] = detail.strip()
|
||||||
|
cleaned.append(entry)
|
||||||
|
normalized.append(
|
||||||
|
{
|
||||||
|
"type": "function_call_output",
|
||||||
|
"call_id": call_id.strip(),
|
||||||
|
"output": cleaned if cleaned else "",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
continue
|
||||||
if not isinstance(output, str):
|
if not isinstance(output, str):
|
||||||
output = str(output)
|
output = str(output)
|
||||||
|
|
||||||
|
|
|
||||||
14
run_agent.py
14
run_agent.py
|
|
@ -11119,6 +11119,20 @@ class AIAgent:
|
||||||
|
|
||||||
self._ensure_db_session()
|
self._ensure_db_session()
|
||||||
|
|
||||||
|
# Tell auxiliary_client what the live main provider/model are for
|
||||||
|
# this turn. Used by tools whose behaviour depends on the active
|
||||||
|
# main model (e.g. vision_analyze's native fast path) so they see
|
||||||
|
# the CLI/gateway override instead of the stale config.yaml
|
||||||
|
# default. Idempotent — fine to call every turn.
|
||||||
|
try:
|
||||||
|
from agent.auxiliary_client import set_runtime_main
|
||||||
|
set_runtime_main(
|
||||||
|
getattr(self, "provider", "") or "",
|
||||||
|
getattr(self, "model", "") or "",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Tag all log records on this thread with the session ID so
|
# Tag all log records on this thread with the session ID so
|
||||||
# ``hermes logs --session <id>`` can filter a single conversation.
|
# ``hermes logs --session <id>`` can filter a single conversation.
|
||||||
from hermes_logging import set_session_context
|
from hermes_logging import set_session_context
|
||||||
|
|
|
||||||
|
|
@ -427,6 +427,15 @@ def _reset_module_state():
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# --- agent.auxiliary_client — runtime main provider/model override ---
|
||||||
|
# Set per-turn by AIAgent.run_conversation; tests that import it must
|
||||||
|
# see a clean state so config.yaml fallback works as expected.
|
||||||
|
try:
|
||||||
|
from agent import auxiliary_client as _aux_mod
|
||||||
|
_aux_mod.clear_runtime_main()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# --- tools.file_tools — per-task read history + file-ops cache ---
|
# --- tools.file_tools — per-task read history + file-ops cache ---
|
||||||
# _read_tracker accumulates per-task_id read history for loop detection,
|
# _read_tracker accumulates per-task_id read history for loop detection,
|
||||||
# capped by _READ_HISTORY_CAP. If entries from a prior test persist, the
|
# capped by _READ_HISTORY_CAP. If entries from a prior test persist, the
|
||||||
|
|
|
||||||
173
tests/run_agent/test_codex_multimodal_tool_result.py
Normal file
173
tests/run_agent/test_codex_multimodal_tool_result.py
Normal file
|
|
@ -0,0 +1,173 @@
|
||||||
|
"""Tests for codex_responses_adapter multimodal tool-result handling.
|
||||||
|
|
||||||
|
Tool messages can contain a list of OpenAI-style content parts
|
||||||
|
(``[{type:"text"...}, {type:"image_url"...}]``) when the
|
||||||
|
``vision_analyze`` native fast path returns image bytes for the main model.
|
||||||
|
This file verifies the Codex Responses adapter:
|
||||||
|
|
||||||
|
1. Converts that list into ``function_call_output.output`` as an array of
|
||||||
|
``input_text``/``input_image`` items (not a stringified blob).
|
||||||
|
2. Preserves array-shaped output through the preflight validator.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from agent.codex_responses_adapter import (
|
||||||
|
_chat_messages_to_responses_input,
|
||||||
|
_preflight_codex_input_items,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_messages_with_multimodal_tool_result():
|
||||||
|
return [
|
||||||
|
{"role": "user", "content": "What's in /tmp/foo.png?"},
|
||||||
|
{
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "",
|
||||||
|
"tool_calls": [{
|
||||||
|
"id": "call_abc",
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "vision_analyze",
|
||||||
|
"arguments": '{"image_url": "/tmp/foo.png", "question": "describe"}',
|
||||||
|
},
|
||||||
|
}],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"role": "tool",
|
||||||
|
"name": "vision_analyze",
|
||||||
|
"tool_call_id": "call_abc",
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "Image loaded."},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,XYZ"}},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class TestMultimodalToolResultConversion:
    """Chat ``tool`` messages → Responses ``function_call_output`` items."""

    def test_list_content_becomes_output_array(self):
        converted = _chat_messages_to_responses_input(
            _build_messages_with_multimodal_tool_result()
        )
        # Exactly one tool result is expected in the converted input.
        tool_outputs = [
            item for item in converted
            if item.get("type") == "function_call_output"
        ]
        assert len(tool_outputs) == 1
        out = tool_outputs[0]
        assert out["call_id"] == "call_abc"
        # The multimodal payload must stay an array, not be stringified.
        assert isinstance(out["output"], list), (
            f"Expected array output for multimodal tool result, "
            f"got {type(out['output']).__name__}: {out['output']!r}"
        )
        part_types = [part.get("type") for part in out["output"]]
        assert "input_text" in part_types
        assert "input_image" in part_types

    def test_input_image_preserves_data_url(self):
        converted = _chat_messages_to_responses_input(
            _build_messages_with_multimodal_tool_result()
        )
        tool_output = next(
            item for item in converted
            if item.get("type") == "function_call_output"
        )
        images = [
            part for part in tool_output["output"]
            if part.get("type") == "input_image"
        ]
        assert len(images) == 1
        assert images[0]["image_url"] == "data:image/png;base64,XYZ"

    def test_string_tool_content_still_string_output(self):
        terminal_call = {
            "id": "call_x", "type": "function",
            "function": {"name": "terminal", "arguments": "{}"},
        }
        transcript = [
            {"role": "user", "content": "hi"},
            {"role": "assistant", "content": "", "tool_calls": [terminal_call]},
            {
                "role": "tool", "name": "terminal", "tool_call_id": "call_x",
                "content": "ls output here",
            },
        ]
        converted = _chat_messages_to_responses_input(transcript)
        tool_output = next(
            item for item in converted
            if item.get("type") == "function_call_output"
        )
        # Plain-string tool results keep the legacy string form.
        assert isinstance(tool_output["output"], str)
        assert tool_output["output"] == "ls output here"
|
||||||
|
|
||||||
|
|
||||||
|
class TestPreflightAcceptsArrayOutput:
    """Preflight validation of ``function_call_output.output`` shapes."""

    @staticmethod
    def _preflight_output_item(call_id, tool_name, output):
        """Run preflight on a call + output pair; return the output item."""
        raw_items = [
            {
                "type": "function_call",
                "call_id": call_id,
                "name": tool_name,
                "arguments": "{}",
            },
            {
                "type": "function_call_output",
                "call_id": call_id,
                "output": output,
            },
        ]
        normalized = _preflight_codex_input_items(raw_items)
        matches = [
            item for item in normalized
            if item.get("type") == "function_call_output"
        ]
        return matches[0]

    def test_preflight_passes_array_through(self):
        item = self._preflight_output_item(
            "call_abc",
            "vision_analyze",
            [
                {"type": "input_text", "text": "Image loaded."},
                {"type": "input_image", "image_url": "data:image/png;base64,ABC"},
            ],
        )
        parts = item["output"]
        assert isinstance(parts, list)
        assert len(parts) == 2
        assert parts[1]["type"] == "input_image"
        assert parts[1]["image_url"] == "data:image/png;base64,ABC"

    def test_preflight_drops_unknown_part_types(self):
        item = self._preflight_output_item(
            "call_abc",
            "vision_analyze",
            [
                {"type": "input_text", "text": "ok"},
                {"type": "garbage", "data": "nope"},  # unknown — should be dropped
                {"type": "input_image", "image_url": "data:image/png;base64,ZZ"},
            ],
        )
        # Only the recognised parts survive, in their original order.
        surviving_types = [part.get("type") for part in item["output"]]
        assert surviving_types == ["input_text", "input_image"]

    def test_preflight_empty_array_becomes_empty_string(self):
        # Defensive: an array with no valid parts shouldn't break the API call
        item = self._preflight_output_item(
            "call_x",
            "vision_analyze",
            [{"type": "garbage"}],  # all dropped
        )
        assert item["output"] == ""

    def test_preflight_string_output_unchanged(self):
        item = self._preflight_output_item(
            "call_x", "terminal", "plain text output",
        )
        assert item["output"] == "plain text output"
|
||||||
207
tests/tools/test_vision_native_fast_path.py
Normal file
207
tests/tools/test_vision_native_fast_path.py
Normal file
|
|
@ -0,0 +1,207 @@
|
||||||
|
"""Tests for the native-vision fast path inside vision_analyze.
|
||||||
|
|
||||||
|
When the active main model supports native vision AND the provider supports
|
||||||
|
image content inside tool-result messages, ``_handle_vision_analyze`` skips
|
||||||
|
the auxiliary LLM and returns a multimodal envelope so the main model sees
|
||||||
|
the pixels directly on its next turn.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tools.vision_tools import (
|
||||||
|
_build_native_vision_tool_result,
|
||||||
|
_handle_vision_analyze,
|
||||||
|
_supports_media_in_tool_results,
|
||||||
|
_vision_analyze_native,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Smallest useful fixture: the bytes of a 1x1 PNG, kept inline as base64 so
# the tests need no binary file on disk.
_TINY_PNG = base64.b64decode(
    b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAA"
    b"C0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── _supports_media_in_tool_results ─────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestSupportsMediaInToolResults:
    """Provider/model capability table for images inside tool results."""

    def test_anthropic_native_yes(self):
        verdict = _supports_media_in_tool_results("anthropic", "claude-opus-4-6")
        assert verdict is True

    def test_openrouter_yes(self):
        verdict = _supports_media_in_tool_results(
            "openrouter", "anthropic/claude-opus-4.6"
        )
        assert verdict is True

    def test_nous_yes(self):
        verdict = _supports_media_in_tool_results(
            "nous", "anthropic/claude-sonnet-4.6"
        )
        assert verdict is True

    def test_openai_chat_yes(self):
        verdict = _supports_media_in_tool_results("openai", "gpt-5.4")
        assert verdict is True

    def test_openai_codex_yes(self):
        verdict = _supports_media_in_tool_results("openai-codex", "gpt-5-codex")
        assert verdict is True

    def test_gemini_3_yes(self):
        verdict = _supports_media_in_tool_results(
            "google", "gemini-3-flash-preview"
        )
        assert verdict is True

    def test_gemini_2_no(self):
        verdict = _supports_media_in_tool_results("google", "gemini-2.5-pro")
        assert verdict is False

    def test_unknown_provider_conservative_no(self):
        verdict = _supports_media_in_tool_results(
            "brand-new-provider", "any-model"
        )
        assert verdict is False

    def test_empty_provider_no(self):
        # Both the empty string and ``None`` count as "no provider".
        empty_verdict = _supports_media_in_tool_results("", "anything")
        assert empty_verdict is False
        none_verdict = _supports_media_in_tool_results(None, "anything")  # type: ignore[arg-type]
        assert none_verdict is False
|
||||||
|
|
||||||
|
|
||||||
|
# ─── _build_native_vision_tool_result ────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuildNativeVisionToolResult:
    """Shape of the multimodal envelope built by the fast path."""

    def test_envelope_shape(self):
        envelope = _build_native_vision_tool_result(
            image_url="/tmp/foo.png",
            question="what does it say?",
            image_data_url="data:image/png;base64,XYZ",
            image_size_bytes=1024,
        )
        assert envelope["_multimodal"] is True
        parts = envelope["content"]
        assert isinstance(parts, list)
        assert len(parts) == 2
        text_part, image_part = parts
        assert text_part["type"] == "text"
        assert image_part["type"] == "image_url"
        assert image_part["image_url"]["url"] == "data:image/png;base64,XYZ"
        # The question is echoed into the text part.
        assert "what does it say?" in text_part["text"]
        assert "Image attached natively" in envelope["text_summary"]

    def test_no_question_omits_question_section(self):
        envelope = _build_native_vision_tool_result(
            image_url="/tmp/foo.png",
            question="",
            image_data_url="data:image/png;base64,XYZ",
            image_size_bytes=512,
        )
        note = envelope["content"][0]["text"]
        assert "Question:" not in note
        assert "Image loaded" in note
|
||||||
|
|
||||||
|
|
||||||
|
# ─── _vision_analyze_native ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestVisionAnalyzeNative:
    """Direct tests of ``_vision_analyze_native``: envelope vs error string."""

    @staticmethod
    def _run(coro):
        """Drive *coro* to completion on a private event loop.

        ``asyncio.get_event_loop()`` is deprecated when no loop is current
        and raises ``RuntimeError`` on newer Python versions, which made
        these tests depend on whatever ran before them. A dedicated loop
        avoids that and leaves the process-wide current loop untouched.
        """
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(coro)
        finally:
            loop.close()

    def test_local_file_returns_multimodal_envelope(self, tmp_path):
        img = tmp_path / "test.png"
        img.write_bytes(_TINY_PNG)
        result = self._run(_vision_analyze_native(str(img), "what is this?"))
        assert isinstance(result, dict)
        assert result.get("_multimodal") is True
        parts = result["content"]
        assert any(p.get("type") == "image_url" for p in parts)
        assert any(p.get("type") == "text" for p in parts)
        url = next(p["image_url"]["url"] for p in parts if p.get("type") == "image_url")
        assert url.startswith("data:image/")

    def test_missing_file_returns_error_string(self, tmp_path):
        result = self._run(_vision_analyze_native(str(tmp_path / "nope.png"), "?"))
        # tool_error returns a JSON string, not the multimodal envelope
        assert isinstance(result, str)
        parsed = json.loads(result)
        assert parsed.get("success") is False
        assert "Invalid image source" in parsed.get("error", "")

    def test_empty_image_url_returns_error(self):
        result = self._run(_vision_analyze_native("", "?"))
        assert isinstance(result, str)
        parsed = json.loads(result)
        assert parsed.get("success") is False
        assert "image_url is required" in parsed.get("error", "")

    def test_file_url_scheme_resolves(self, tmp_path):
        img = tmp_path / "t.png"
        img.write_bytes(_TINY_PNG)
        result = self._run(_vision_analyze_native(f"file://{img}", "?"))
        assert isinstance(result, dict)
        assert result.get("_multimodal") is True
|
||||||
|
|
||||||
|
|
||||||
|
# ─── _handle_vision_analyze fast-path gating ─────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestHandleVisionAnalyzeFastPath:
    """Verify the dispatcher chooses fast-path vs aux-LLM correctly."""

    @staticmethod
    def _run(coro):
        """Drive *coro* to completion on a private event loop.

        Replaces ``asyncio.get_event_loop().run_until_complete``, which is
        deprecated when no loop is current and raises ``RuntimeError`` on
        newer Python versions. The process-wide current loop is not touched.
        """
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(coro)
        finally:
            loop.close()

    @staticmethod
    async def _aux_sentinel(*args, **kwargs):
        """Stand-in for the auxiliary-LLM tool; returns a marker payload."""
        return '{"sentinel": "aux-path"}'

    def _dispatch(self, provider, model, image_path):
        """Call ``_handle_vision_analyze`` with a runtime override in place.

        The override is always cleared afterwards so a failing test cannot
        leak its provider/model into later tests.
        """
        from agent.auxiliary_client import set_runtime_main, clear_runtime_main

        set_runtime_main(provider, model)
        try:
            coro = _handle_vision_analyze({"image_url": str(image_path), "question": "?"})
            return self._run(coro)
        finally:
            clear_runtime_main()

    def test_vision_capable_main_model_uses_fast_path(self, tmp_path):
        """Main model supports native vision → fast path returns multimodal."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        # Set runtime override so the handler thinks we're on opus@openrouter
        result = self._dispatch("openrouter", "anthropic/claude-opus-4.6", img)

        assert isinstance(result, dict), \
            f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
        assert result.get("_multimodal") is True

    def test_non_vision_main_model_falls_through_to_aux(self, tmp_path):
        """Non-vision main model → fast path skipped, aux LLM path attempted."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        with patch("tools.vision_tools.vision_analyze_tool", side_effect=self._aux_sentinel):
            result = self._dispatch("openrouter", "qwen/qwen3-coder", img)

        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
            "Fast path fired for non-vision model; should have fallen through to aux LLM"

    def test_fast_path_disabled_for_unsupported_provider(self, tmp_path):
        """Even with vision-capable model, unknown provider → fall through."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        with patch("tools.vision_tools.vision_analyze_tool", side_effect=self._aux_sentinel):
            result = self._dispatch("brand-new-provider", "anthropic/claude-opus-4.6", img)

        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
            "Fast path fired for unknown provider; should have fallen through"
|
||||||
|
|
@ -403,6 +403,232 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
|
||||||
return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
|
return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Native fast path: short-circuit the auxiliary LLM when the active main model
|
||||||
|
# supports native vision. Instead of asking a separate LLM to describe the
|
||||||
|
# image and returning text, we load the image, base64-encode it, and return a
|
||||||
|
# multimodal tool-result envelope. The agent loop unwraps the envelope into an
|
||||||
|
# OpenAI-style content list on the `tool` role; provider adapters (anthropic,
|
||||||
|
# codex_responses, chat_completions) translate that into Anthropic
|
||||||
|
# tool_result image blocks / Responses input_image / OpenAI image_url tool
|
||||||
|
# content. The main model then "sees" the pixels directly on its next turn.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _supports_media_in_tool_results(provider: str, model: str) -> bool:
|
||||||
|
"""Whether the given provider+model combination accepts image content
|
||||||
|
inside a tool-result message.
|
||||||
|
|
||||||
|
Providers covered today (per spec docs verified Apr-2026):
|
||||||
|
|
||||||
|
* Anthropic Messages API (``anthropic`` provider, plus aggregators that
|
||||||
|
proxy Claude — ``openrouter``, ``nous``, ``vertex``, ``bedrock``):
|
||||||
|
``tool_result`` blocks accept ``image`` content blocks.
|
||||||
|
* OpenAI Chat Completions: tool messages accept array content with
|
||||||
|
``image_url`` parts.
|
||||||
|
* OpenAI Responses (``openai-codex``): ``function_call_output.output``
|
||||||
|
accepts an array of ``input_text``/``input_image`` items.
|
||||||
|
* Gemini 3 (and proxied via aggregators): supports multimodal tool
|
||||||
|
results. Older Gemini does NOT.
|
||||||
|
|
||||||
|
For unknown / legacy providers we conservatively return False — the
|
||||||
|
caller falls back to the legacy aux-LLM text path.
|
||||||
|
"""
|
||||||
|
if not isinstance(provider, str):
|
||||||
|
return False
|
||||||
|
p = provider.strip().lower()
|
||||||
|
if not p:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Aggregators that route to multiple vendors — assume support since
|
||||||
|
# users on these aggregators are typically using vision-capable
|
||||||
|
# frontier models. Falling back to text would be a regression for
|
||||||
|
# them.
|
||||||
|
_AGGREGATORS = {
|
||||||
|
"openrouter", "nous", "vertex", "bedrock", "anthropic-vertex",
|
||||||
|
"google-vertex",
|
||||||
|
}
|
||||||
|
if p in _AGGREGATORS:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Native Anthropic
|
||||||
|
if p in {"anthropic", "claude", "anthropic-direct"}:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# OpenAI Chat Completions and Responses
|
||||||
|
if p in {"openai", "openai-chat", "openai-codex", "azure-openai"}:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Gemini — gate on model name; older Gemini variants did not support
|
||||||
|
# multimodal functionResponse. Gemini 3.x does.
|
||||||
|
if p in {"google", "gemini", "google-gemini", "google-vertex-gemini"}:
|
||||||
|
if not isinstance(model, str):
|
||||||
|
return False
|
||||||
|
m = model.strip().lower()
|
||||||
|
if "gemini-3" in m or "gemini-pro-3" in m or "gemini-flash-3" in m:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Other vision-capable provider stacks. Conservative default: False.
|
||||||
|
# Add explicit entries here as we verify each provider's tool-result
|
||||||
|
# multimodal support empirically.
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _build_native_vision_tool_result(
|
||||||
|
image_url: str,
|
||||||
|
question: str,
|
||||||
|
image_data_url: str,
|
||||||
|
image_size_bytes: int,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Build the multimodal tool-result envelope returned by the fast path.
|
||||||
|
|
||||||
|
Shape:
|
||||||
|
{
|
||||||
|
"_multimodal": True,
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": "<short note + the user's question>"},
|
||||||
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
|
||||||
|
],
|
||||||
|
"text_summary": "<plain-text fallback>",
|
||||||
|
"meta": {"image_url": ..., "size_bytes": N},
|
||||||
|
}
|
||||||
|
|
||||||
|
The text part exists for two reasons: (1) it gives the model an
|
||||||
|
instruction to act on now that the pixels are in context, and
|
||||||
|
(2) providers that don't support multimodal tool results can fall back
|
||||||
|
to ``text_summary``.
|
||||||
|
"""
|
||||||
|
# The tool-result text part is intentionally minimal. The model already
|
||||||
|
# has the user's original question in context; this just acknowledges
|
||||||
|
# the image is now visible and reminds it what it was asked.
|
||||||
|
text_part = (
|
||||||
|
"Image loaded into your context — you can see it natively now. "
|
||||||
|
"Use your built-in vision to answer the user."
|
||||||
|
)
|
||||||
|
if isinstance(question, str) and question.strip():
|
||||||
|
text_part += f"\n\nQuestion: {question.strip()}"
|
||||||
|
|
||||||
|
summary = (
|
||||||
|
f"Image attached natively for the main model "
|
||||||
|
f"({image_size_bytes / 1024:.1f} KB). "
|
||||||
|
"Answer using built-in vision."
|
||||||
|
)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"_multimodal": True,
|
||||||
|
"content": [
|
||||||
|
{"type": "text", "text": text_part},
|
||||||
|
{"type": "image_url", "image_url": {"url": image_data_url}},
|
||||||
|
],
|
||||||
|
"text_summary": summary,
|
||||||
|
"meta": {
|
||||||
|
"image_url": image_url[:200],
|
||||||
|
"size_bytes": image_size_bytes,
|
||||||
|
"native_vision": True,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def _vision_analyze_native(
    image_url: str,
    question: str,
) -> Any:
    """Fast path for vision-capable main models.

    Loads the image (local file OR remote URL), base64-encodes it, and
    returns a multimodal tool-result envelope. The agent loop unwraps it;
    provider adapters serialize it into the right tool-result-with-image
    shape for each backend.

    Args:
        image_url: Local path (optionally prefixed with ``file://``; ``~``
            is expanded) or a URL accepted by ``_validate_image_url``.
        question: Forwarded unchanged to
            ``_build_native_vision_tool_result``.

    Returns:
        A ``_multimodal`` envelope dict on success.
        A JSON error string on failure (matches the existing tool-result
        contract so the agent loop displays errors normally).
    """
    if not isinstance(image_url, str) or not image_url.strip():
        return tool_error("image_url is required", success=False)

    temp_image_path: Optional[Path] = None
    should_cleanup = False
    try:
        from tools.interrupt import is_interrupted
        if is_interrupted():
            return tool_error("Interrupted", success=False)

        # Resolve the image source (mirrors vision_analyze_tool's logic
        # exactly so behaviour is consistent).
        resolved_url = image_url
        if resolved_url.startswith("file://"):
            resolved_url = resolved_url[len("file://"):]
        local_path = Path(os.path.expanduser(resolved_url))

        if local_path.is_file():
            # User-provided file: read it in place and never delete it.
            temp_image_path = local_path
        elif _validate_image_url(image_url):
            blocked = check_website_access(image_url)
            if blocked:
                return tool_error(blocked["message"], success=False)
            temp_dir = get_hermes_dir("cache/vision", "temp_vision_images")
            temp_image_path = temp_dir / f"temp_image_{uuid.uuid4()}.jpg"
            # Arm cleanup BEFORE downloading: if the download fails or is
            # cancelled after writing part of the file, the ``finally``
            # block must still remove it. The cleanup checks ``exists()``,
            # so this is safe when nothing was written.
            should_cleanup = True
            await _download_image(image_url, temp_image_path)
        else:
            return tool_error(
                "Invalid image source. Provide an HTTP/HTTPS URL or a "
                "valid local file path.",
                success=False,
            )

        # NOTE: this is the on-disk size taken before any resize below, so
        # it can differ from the size of the payload actually returned.
        image_size_bytes = temp_image_path.stat().st_size
        detected_mime_type = _detect_image_mime_type(temp_image_path)
        if not detected_mime_type:
            return tool_error(
                "Only real image files are supported for vision analysis.",
                success=False,
            )

        image_data_url = _image_to_base64_data_url(
            temp_image_path, mime_type=detected_mime_type,
        )

        # Honour the same hard cap as the legacy path. Resize if needed.
        if len(image_data_url) > _MAX_BASE64_BYTES:
            image_data_url = _resize_image_for_vision(
                temp_image_path, mime_type=detected_mime_type,
            )
            if len(image_data_url) > _MAX_BASE64_BYTES:
                return tool_error(
                    f"Image too large for vision API: base64 payload is "
                    f"{len(image_data_url) / (1024 * 1024):.1f} MB "
                    f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
                    f"even after resizing. Install Pillow "
                    f"(`pip install Pillow`) for better auto-resize, "
                    f"or compress the image manually.",
                    success=False,
                )

        return _build_native_vision_tool_result(
            image_url=image_url,
            question=question,
            image_data_url=image_data_url,
            image_size_bytes=image_size_bytes,
        )

    except Exception as exc:
        logger.warning("Native vision fast path failed: %s", exc)
        return tool_error(f"Native vision failed: {exc}", success=False)
    finally:
        # Only delete temp files we created — never user-provided paths.
        if should_cleanup and temp_image_path is not None:
            try:
                if temp_image_path.exists():
                    temp_image_path.unlink()
            except Exception as cleanup_exc:
                # Best-effort: a failed cleanup must not mask the result,
                # but leave a trace so leaked cache files can be diagnosed.
                logger.debug(
                    "Could not remove temp vision image %s: %s",
                    temp_image_path, cleanup_exc,
                )
|
||||||
|
|
||||||
|
|
||||||
async def vision_analyze_tool(
|
async def vision_analyze_tool(
|
||||||
image_url: str,
|
image_url: str,
|
||||||
user_prompt: str,
|
user_prompt: str,
|
||||||
|
|
@ -758,24 +984,25 @@ from tools.registry import registry, tool_error
|
||||||
VISION_ANALYZE_SCHEMA = {
|
VISION_ANALYZE_SCHEMA = {
|
||||||
"name": "vision_analyze",
|
"name": "vision_analyze",
|
||||||
"description": (
|
"description": (
|
||||||
"Inspect an image from a URL, file path, or tool output when you need "
|
"Load an image into the conversation so you can see it. Accepts a "
|
||||||
"closer detail than what's visible in the conversation. If the user's "
|
"URL, local file path, or data URL. When your active model has "
|
||||||
"image is already attached to the conversation and you can see it, "
|
"native vision, the image is attached to your context directly "
|
||||||
"just answer directly — only call this tool for images referenced by "
|
"and you read the pixels yourself on the next turn — call this "
|
||||||
"URL/path, images returned inside other tool results (browser "
|
"any time the user references an image (filepath in their message, "
|
||||||
"screenshots, search thumbnails), or when you need a deeper look at "
|
"URL in tool output, screenshot from the browser, etc.). For "
|
||||||
"a specific region the main model's vision may have missed."
|
"non-vision models, falls back to an auxiliary vision model that "
|
||||||
|
"returns a text description."
|
||||||
),
|
),
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"image_url": {
|
"image_url": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "Image URL (http/https) or local file path to analyze."
|
"description": "Image URL (http/https), local file path, or data: URL to load."
|
||||||
},
|
},
|
||||||
"question": {
|
"question": {
|
||||||
"type": "string",
|
"type": "string",
|
||||||
"description": "Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question."
|
"description": "Your specific question or request about the image. Optional context the model uses on the next turn after seeing the image."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": ["image_url", "question"]
|
"required": ["image_url", "question"]
|
||||||
|
|
@ -786,6 +1013,31 @@ VISION_ANALYZE_SCHEMA = {
|
||||||
def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
|
def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
|
||||||
image_url = args.get("image_url", "")
|
image_url = args.get("image_url", "")
|
||||||
question = args.get("question", "")
|
question = args.get("question", "")
|
||||||
|
|
||||||
|
# Fast path: when the active main model supports native vision AND the
|
||||||
|
# provider supports image content inside tool results, short-circuit
|
||||||
|
# the auxiliary LLM and return the image bytes as a multimodal
|
||||||
|
# tool-result envelope. The main model sees the pixels directly on its
|
||||||
|
# next turn — no aux call, no information loss, no extra latency.
|
||||||
|
try:
|
||||||
|
from agent.auxiliary_client import _read_main_provider, _read_main_model
|
||||||
|
from agent.image_routing import decide_image_input_mode
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
|
||||||
|
_provider = _read_main_provider()
|
||||||
|
_model = _read_main_model()
|
||||||
|
_cfg = load_config()
|
||||||
|
_mode = decide_image_input_mode(_provider, _model, _cfg)
|
||||||
|
if _mode == "native" and _supports_media_in_tool_results(_provider, _model):
|
||||||
|
logger.info(
|
||||||
|
"vision_analyze: native fast path (provider=%s, model=%s)",
|
||||||
|
_provider, _model,
|
||||||
|
)
|
||||||
|
return _vision_analyze_native(image_url, question)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug("Native vision fast-path check failed; using aux LLM: %s", exc)
|
||||||
|
|
||||||
|
# Legacy path: aux LLM describes the image and we return its text.
|
||||||
full_prompt = (
|
full_prompt = (
|
||||||
"Fully describe and explain everything about this image, then answer the "
|
"Fully describe and explain everything about this image, then answer the "
|
||||||
f"following question:\n\n{question}"
|
f"following question:\n\n{question}"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue