feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)

When the active main model has native vision and the provider supports
multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini
3, OpenRouter, Nous), vision_analyze loads the image bytes and returns
them to the model as a multimodal tool-result envelope. The model then
sees the pixels directly on its next turn instead of receiving a lossy
text description from an auxiliary LLM.

Falls back to the legacy aux-LLM text path for non-vision models and
unverified providers.

Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and
Cline. All four converge on the same pattern: tool results carry image
content blocks for vision-capable provider/model combinations.

Changes
- tools/vision_tools.py: _vision_analyze_native fast path + provider
  capability table (_supports_media_in_tool_results). Schema description
  updated to reflect new behaviour.
- agent/codex_responses_adapter.py: function_call_output.output now
  accepts the array form for multimodal tool results (was string-only).
  Preflight validates input_text/input_image parts.
- agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so
  tools see the live CLI/gateway override, not the stale config.yaml
  default. set_runtime_main()/clear_runtime_main() helpers.
- run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn
  start so vision_analyze's fast-path check sees the actual runtime.
- tests/conftest.py: clear runtime-main override between tests.

Tests
- tests/tools/test_vision_native_fast_path.py: provider capability
  table, envelope shape, fast-path gating (vision-capable model uses
  fast path; non-vision model falls through to aux).
- tests/run_agent/test_codex_multimodal_tool_result.py: list tool
  content becomes function_call_output.output array; preflight
  preserves arrays and drops unknown part types.

Live verified
- Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a
  typed filepath, gets pixels back, reads exact text from images that
  no aux description could capture (font color irony, multi-line
  fruit-count list, etc.).

This PR supersedes the earlier, now-closed attempts. #16506 already shipped
the inbound user-attached image path; this PR closes the remaining gap for
tool-discovered images.
This commit is contained in:
Teknium 2026-05-09 21:06:19 -07:00 committed by GitHub
parent e62250453b
commit 3800972dd0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 757 additions and 10 deletions

View file

@ -1463,7 +1463,16 @@ def _read_main_model() -> str:
config.yaml model.default is the single source of truth for the active config.yaml model.default is the single source of truth for the active
model. Environment variables are no longer consulted. model. Environment variables are no longer consulted.
Runtime override: when an AIAgent is active with a CLI/gateway-provided
model that differs from config.yaml, ``set_runtime_main()`` records the
override in a process-local global. This is consulted FIRST so tools
that gate on "the active main model" (e.g. ``vision_analyze``'s native
fast path) see the live runtime, not the persisted config default.
""" """
override = _RUNTIME_MAIN_MODEL
if isinstance(override, str) and override.strip():
return override.strip()
try: try:
from hermes_cli.config import load_config from hermes_cli.config import load_config
cfg = load_config() cfg = load_config()
@ -1484,7 +1493,13 @@ def _read_main_provider() -> str:
Returns the lowercase provider id (e.g. "alibaba", "openrouter") or "" Returns the lowercase provider id (e.g. "alibaba", "openrouter") or ""
if not configured. if not configured.
Runtime override: see ``_read_main_model`` — the same mechanism applies to
the provider half of the runtime tuple.
""" """
override = _RUNTIME_MAIN_PROVIDER
if isinstance(override, str) and override.strip():
return override.strip().lower()
try: try:
from hermes_cli.config import load_config from hermes_cli.config import load_config
cfg = load_config() cfg = load_config()
@ -1498,6 +1513,32 @@ def _read_main_provider() -> str:
return "" return ""
# Process-local override set by AIAgent at session/turn start. Single-threaded
# per turn — no lock needed. Cleared by ``clear_runtime_main()``.
# An empty string means "no override": ``_read_main_provider`` and
# ``_read_main_model`` only honour a non-blank value and otherwise fall
# through to config.yaml.
_RUNTIME_MAIN_PROVIDER: str = ""
_RUNTIME_MAIN_MODEL: str = ""
def set_runtime_main(provider: str, model: str) -> None:
    """Remember the provider/model the running agent is actually using.

    ``_read_main_provider`` and ``_read_main_model`` check these module
    globals before they look at config.yaml, so storing the live values
    here lets tools see a CLI/gateway override rather than the persisted
    default. ``AIAgent`` calls this at the start of each turn.

    Args:
        provider: Provider id. Falsy values are stored as ``""``; otherwise
            the value is stripped and lowercased.
        model: Model name. Falsy values are stored as ``""``; otherwise the
            value is stripped (case is preserved).
    """
    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
    provider_text = provider if provider else ""
    model_text = model if model else ""
    _RUNTIME_MAIN_PROVIDER = provider_text.strip().lower()
    _RUNTIME_MAIN_MODEL = model_text.strip()
def clear_runtime_main() -> None:
    """Drop the runtime provider/model override (e.g. on session end).

    Both globals are reset to the empty string, which the readers treat as
    "no override".
    """
    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
    _RUNTIME_MAIN_PROVIDER = _RUNTIME_MAIN_MODEL = ""
def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]: def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]:
"""Resolve the active custom/main endpoint the same way the main CLI does. """Resolve the active custom/main endpoint the same way the main CLI does.

View file

@ -410,10 +410,29 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
call_id = raw_tool_call_id.strip() call_id = raw_tool_call_id.strip()
if not isinstance(call_id, str) or not call_id.strip(): if not isinstance(call_id, str) or not call_id.strip():
continue continue
# Multimodal tool result: convert OpenAI-style content list into
# Responses ``function_call_output.output`` array. The Responses
# API accepts ``output`` as either a string or an array of
# ``input_text``/``input_image`` items. See
# https://developers.openai.com/api/reference/python/resources/responses/.
tool_content = msg.get("content")
output_value: Any
if isinstance(tool_content, list):
converted = _chat_content_to_responses_parts(
tool_content, role="user",
)
if converted:
output_value = converted
else:
output_value = ""
else:
output_value = str(tool_content or "")
items.append({ items.append({
"type": "function_call_output", "type": "function_call_output",
"call_id": call_id, "call_id": call_id,
"output": str(msg.get("content", "") or ""), "output": output_value,
}) })
return items return items
@ -466,6 +485,38 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
output = item.get("output", "") output = item.get("output", "")
if output is None: if output is None:
output = "" output = ""
# Output may be a string OR an array of structured content
# items (input_text / input_image) for multimodal tool results.
# Both shapes are accepted by the Responses API. We preserve
# the array form when present.
if isinstance(output, list):
# Validate each item is a recognised content shape; drop
# anything else to avoid 4xx from the API.
cleaned: List[Dict[str, Any]] = []
for part in output:
if not isinstance(part, dict):
continue
ptype = part.get("type")
if ptype == "input_text":
text = part.get("text")
if isinstance(text, str) and text:
cleaned.append({"type": "input_text", "text": text})
elif ptype == "input_image":
url = part.get("image_url")
if isinstance(url, str) and url:
entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
detail = part.get("detail")
if isinstance(detail, str) and detail.strip():
entry["detail"] = detail.strip()
cleaned.append(entry)
normalized.append(
{
"type": "function_call_output",
"call_id": call_id.strip(),
"output": cleaned if cleaned else "",
}
)
continue
if not isinstance(output, str): if not isinstance(output, str):
output = str(output) output = str(output)

View file

@ -11119,6 +11119,20 @@ class AIAgent:
self._ensure_db_session() self._ensure_db_session()
# Tell auxiliary_client what the live main provider/model are for
# this turn. Used by tools whose behaviour depends on the active
# main model (e.g. vision_analyze's native fast path) so they see
# the CLI/gateway override instead of the stale config.yaml
# default. Idempotent — fine to call every turn.
try:
from agent.auxiliary_client import set_runtime_main
set_runtime_main(
getattr(self, "provider", "") or "",
getattr(self, "model", "") or "",
)
except Exception:
pass
# Tag all log records on this thread with the session ID so # Tag all log records on this thread with the session ID so
# ``hermes logs --session <id>`` can filter a single conversation. # ``hermes logs --session <id>`` can filter a single conversation.
from hermes_logging import set_session_context from hermes_logging import set_session_context

View file

@ -427,6 +427,15 @@ def _reset_module_state():
except Exception: except Exception:
pass pass
# --- agent.auxiliary_client — runtime main provider/model override ---
# Set per-turn by AIAgent.run_conversation; tests that import it must
# see a clean state so config.yaml fallback works as expected.
try:
from agent import auxiliary_client as _aux_mod
_aux_mod.clear_runtime_main()
except Exception:
pass
# --- tools.file_tools — per-task read history + file-ops cache --- # --- tools.file_tools — per-task read history + file-ops cache ---
# _read_tracker accumulates per-task_id read history for loop detection, # _read_tracker accumulates per-task_id read history for loop detection,
# capped by _READ_HISTORY_CAP. If entries from a prior test persist, the # capped by _READ_HISTORY_CAP. If entries from a prior test persist, the

View file

@ -0,0 +1,173 @@
"""Tests for codex_responses_adapter multimodal tool-result handling.
Tool messages can contain a list of OpenAI-style content parts
(``[{type:"text"...}, {type:"image_url"...}]``) when the
``vision_analyze`` native fast path returns image bytes for the main model.
This file verifies the Codex Responses adapter:
1. Converts that list into ``function_call_output.output`` as an array of
``input_text``/``input_image`` items (not a stringified blob).
2. Preserves array-shaped output through the preflight validator.
"""
from __future__ import annotations
from agent.codex_responses_adapter import (
_chat_messages_to_responses_input,
_preflight_codex_input_items,
)
def _build_messages_with_multimodal_tool_result():
return [
{"role": "user", "content": "What's in /tmp/foo.png?"},
{
"role": "assistant",
"content": "",
"tool_calls": [{
"id": "call_abc",
"type": "function",
"function": {
"name": "vision_analyze",
"arguments": '{"image_url": "/tmp/foo.png", "question": "describe"}',
},
}],
},
{
"role": "tool",
"name": "vision_analyze",
"tool_call_id": "call_abc",
"content": [
{"type": "text", "text": "Image loaded."},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,XYZ"}},
],
},
]
class TestMultimodalToolResultConversion:
    """Chat-format tool messages -> Responses ``function_call_output`` items."""

    def test_list_content_becomes_output_array(self):
        """A content list on the tool message must survive as an output array."""
        messages = _build_messages_with_multimodal_tool_result()
        items = _chat_messages_to_responses_input(messages)

        # Exactly one function_call_output item is expected for the one tool turn.
        outputs = list(
            filter(lambda it: it.get("type") == "function_call_output", items)
        )
        assert len(outputs) == 1
        out = outputs[0]
        assert out["call_id"] == "call_abc"

        # The array form must be kept; a stringified blob would hide the image.
        assert isinstance(out["output"], list), \
            f"Expected array output for multimodal tool result, got {type(out['output']).__name__}: {out['output']!r}"
        types = [part.get("type") for part in out["output"]]
        assert "input_text" in types
        assert "input_image" in types

    def test_input_image_preserves_data_url(self):
        """The data URL must reach the ``input_image`` part unchanged."""
        items = _chat_messages_to_responses_input(
            _build_messages_with_multimodal_tool_result()
        )
        out = next(it for it in items if it.get("type") == "function_call_output")
        image_parts = [
            part for part in out["output"] if part.get("type") == "input_image"
        ]
        assert len(image_parts) == 1
        only_image = image_parts[0]
        assert only_image["image_url"] == "data:image/png;base64,XYZ"

    def test_string_tool_content_still_string_output(self):
        """Plain string tool content keeps producing a plain string output."""
        terminal_call = {
            "id": "call_x", "type": "function",
            "function": {"name": "terminal", "arguments": "{}"},
        }
        msgs = [
            {"role": "user", "content": "hi"},
            {"role": "assistant", "content": "", "tool_calls": [terminal_call]},
            {
                "role": "tool", "name": "terminal", "tool_call_id": "call_x",
                "content": "ls output here",
            },
        ]
        items = _chat_messages_to_responses_input(msgs)
        out = next(it for it in items if it.get("type") == "function_call_output")
        assert isinstance(out["output"], str)
        assert out["output"] == "ls output here"
class TestPreflightAcceptsArrayOutput:
    """Preflight handling of ``function_call_output.output`` in both shapes."""

    @staticmethod
    def _preflight_output(tool_name, call_id, output):
        """Run preflight on one call/output pair; return the normalized output item."""
        raw = [
            {
                "type": "function_call",
                "call_id": call_id,
                "name": tool_name,
                "arguments": "{}",
            },
            {
                "type": "function_call_output",
                "call_id": call_id,
                "output": output,
            },
        ]
        normalized = _preflight_codex_input_items(raw)
        return [it for it in normalized if it.get("type") == "function_call_output"][0]

    def test_preflight_passes_array_through(self):
        """A well-formed text + image array is kept as an array."""
        out = self._preflight_output(
            "vision_analyze",
            "call_abc",
            [
                {"type": "input_text", "text": "Image loaded."},
                {"type": "input_image", "image_url": "data:image/png;base64,ABC"},
            ],
        )
        assert isinstance(out["output"], list)
        assert len(out["output"]) == 2
        image_part = out["output"][1]
        assert image_part["type"] == "input_image"
        assert image_part["image_url"] == "data:image/png;base64,ABC"

    def test_preflight_drops_unknown_part_types(self):
        """Unrecognised part types are removed; valid neighbours keep their order."""
        out = self._preflight_output(
            "vision_analyze",
            "call_abc",
            [
                {"type": "input_text", "text": "ok"},
                {"type": "garbage", "data": "nope"},  # unknown — should be dropped
                {"type": "input_image", "image_url": "data:image/png;base64,ZZ"},
            ],
        )
        # The "garbage" part is dropped; valid parts remain
        types = [part.get("type") for part in out["output"]]
        assert types == ["input_text", "input_image"]

    def test_preflight_empty_array_becomes_empty_string(self):
        """An array with no valid parts collapses to an empty string."""
        # Defensive: an array with no valid parts shouldn't break the API call
        out = self._preflight_output(
            "vision_analyze",
            "call_x",
            [{"type": "garbage"}],  # all dropped
        )
        assert out["output"] == ""

    def test_preflight_string_output_unchanged(self):
        """String outputs pass through preflight untouched."""
        out = self._preflight_output("terminal", "call_x", "plain text output")
        assert out["output"] == "plain text output"

View file

@ -0,0 +1,207 @@
"""Tests for the native-vision fast path inside vision_analyze.
When the active main model supports native vision AND the provider supports
image content inside tool-result messages, ``_handle_vision_analyze`` skips
the auxiliary LLM and returns a multimodal envelope so the main model sees
the pixels directly on its next turn.
"""
from __future__ import annotations
import asyncio
import base64
import json
from pathlib import Path
from unittest.mock import patch
import pytest
from tools.vision_tools import (
_build_native_vision_tool_result,
_handle_vision_analyze,
_supports_media_in_tool_results,
_vision_analyze_native,
)
# Minimal valid 1x1 PNG bytes.
_TINY_PNG = base64.b64decode(
b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
)
# ─── _supports_media_in_tool_results ─────────────────────────────────────────
class TestSupportsMediaInToolResults:
    """Provider/model capability table for images inside tool results."""

    def test_anthropic_native_yes(self):
        supported = _supports_media_in_tool_results("anthropic", "claude-opus-4-6")
        assert supported is True

    def test_openrouter_yes(self):
        supported = _supports_media_in_tool_results(
            "openrouter", "anthropic/claude-opus-4.6"
        )
        assert supported is True

    def test_nous_yes(self):
        supported = _supports_media_in_tool_results(
            "nous", "anthropic/claude-sonnet-4.6"
        )
        assert supported is True

    def test_openai_chat_yes(self):
        supported = _supports_media_in_tool_results("openai", "gpt-5.4")
        assert supported is True

    def test_openai_codex_yes(self):
        supported = _supports_media_in_tool_results("openai-codex", "gpt-5-codex")
        assert supported is True

    def test_gemini_3_yes(self):
        supported = _supports_media_in_tool_results(
            "google", "gemini-3-flash-preview"
        )
        assert supported is True

    def test_gemini_2_no(self):
        # Gemini is gated on the model name; 2.x must not qualify.
        supported = _supports_media_in_tool_results("google", "gemini-2.5-pro")
        assert supported is False

    def test_unknown_provider_conservative_no(self):
        supported = _supports_media_in_tool_results(
            "brand-new-provider", "any-model"
        )
        assert supported is False

    def test_empty_provider_no(self):
        blank = _supports_media_in_tool_results("", "anything")
        assert blank is False
        missing = _supports_media_in_tool_results(None, "anything")  # type: ignore[arg-type]
        assert missing is False
# ─── _build_native_vision_tool_result ────────────────────────────────────────
class TestBuildNativeVisionToolResult:
    """Shape of the multimodal envelope produced by the native fast path."""

    def test_envelope_shape(self):
        """Envelope carries a text part, an image part, and a text summary."""
        env = _build_native_vision_tool_result(
            image_url="/tmp/foo.png",
            question="what does it say?",
            image_data_url="data:image/png;base64,XYZ",
            image_size_bytes=1024,
        )
        assert env["_multimodal"] is True

        parts = env["content"]
        assert isinstance(parts, list)
        assert len(parts) == 2
        text_part, image_part = parts
        assert text_part["type"] == "text"
        assert image_part["type"] == "image_url"
        assert image_part["image_url"]["url"] == "data:image/png;base64,XYZ"
        assert "what does it say?" in text_part["text"]
        assert "Image attached natively" in env["text_summary"]

    def test_no_question_omits_question_section(self):
        """An empty question leaves only the acknowledgement text."""
        env = _build_native_vision_tool_result(
            image_url="/tmp/foo.png",
            question="",
            image_data_url="data:image/png;base64,XYZ",
            image_size_bytes=512,
        )
        text = env["content"][0]["text"]
        assert "Question:" not in text
        assert "Image loaded" in text
# ─── _vision_analyze_native ──────────────────────────────────────────────────
class TestVisionAnalyzeNative:
    """Direct tests of the ``_vision_analyze_native`` coroutine."""

    @staticmethod
    def _run(awaitable):
        """Drive *awaitable* to completion on a private event loop.

        ``asyncio.get_event_loop()`` depends on an implicit "current" loop:
        it is deprecated without a running loop, raises ``RuntimeError`` on
        newer Pythons, and fails once any earlier test has called
        ``asyncio.run()``. A fresh loop per call keeps these tests
        independent of suite ordering and leaves global loop state alone.
        """
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(awaitable)
        finally:
            loop.close()

    def test_local_file_returns_multimodal_envelope(self, tmp_path):
        """A readable local image yields the multimodal envelope."""
        img = tmp_path / "test.png"
        img.write_bytes(_TINY_PNG)
        result = self._run(_vision_analyze_native(str(img), "what is this?"))
        assert isinstance(result, dict)
        assert result.get("_multimodal") is True
        parts = result["content"]
        assert any(p.get("type") == "image_url" for p in parts)
        assert any(p.get("type") == "text" for p in parts)
        url = next(p["image_url"]["url"] for p in parts if p.get("type") == "image_url")
        assert url.startswith("data:image/")

    def test_missing_file_returns_error_string(self, tmp_path):
        """A nonexistent path is reported as an invalid image source."""
        result = self._run(_vision_analyze_native(str(tmp_path / "nope.png"), "?"))
        # tool_error returns a JSON string, not the multimodal envelope
        assert isinstance(result, str)
        parsed = json.loads(result)
        assert parsed.get("success") is False
        assert "Invalid image source" in parsed.get("error", "")

    def test_empty_image_url_returns_error(self):
        """An empty ``image_url`` is rejected before any I/O."""
        result = self._run(_vision_analyze_native("", "?"))
        assert isinstance(result, str)
        parsed = json.loads(result)
        assert parsed.get("success") is False
        assert "image_url is required" in parsed.get("error", "")

    def test_file_url_scheme_resolves(self, tmp_path):
        """``file://`` URLs resolve to the underlying local path."""
        img = tmp_path / "t.png"
        img.write_bytes(_TINY_PNG)
        result = self._run(_vision_analyze_native(f"file://{img}", "?"))
        assert isinstance(result, dict)
        assert result.get("_multimodal") is True
# ─── _handle_vision_analyze fast-path gating ─────────────────────────────────
class TestHandleVisionAnalyzeFastPath:
    """Verify the dispatcher chooses fast-path vs aux-LLM correctly."""

    @staticmethod
    def _dispatch(provider, model, image_path, *, stub_aux):
        """Run ``_handle_vision_analyze`` under a given runtime provider/model.

        The runtime override is always cleared afterwards. When *stub_aux*
        is true, ``tools.vision_tools.vision_analyze_tool`` is replaced by a
        sentinel coroutine so the legacy path never reaches a real aux LLM.

        A private event loop is used instead of ``asyncio.get_event_loop()``,
        which is deprecated without a running loop and breaks after any
        earlier ``asyncio.run()`` in the same process.
        """
        from agent.auxiliary_client import set_runtime_main, clear_runtime_main

        async def _aux_sentinel(*args, **kwargs):
            return '{"sentinel": "aux-path"}'

        args = {"image_url": str(image_path), "question": "?"}
        set_runtime_main(provider, model)
        loop = asyncio.new_event_loop()
        try:
            if stub_aux:
                with patch(
                    "tools.vision_tools.vision_analyze_tool",
                    side_effect=_aux_sentinel,
                ):
                    return loop.run_until_complete(_handle_vision_analyze(args))
            return loop.run_until_complete(_handle_vision_analyze(args))
        finally:
            loop.close()
            clear_runtime_main()

    def test_vision_capable_main_model_uses_fast_path(self, tmp_path):
        """Main model supports native vision → fast path returns multimodal."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        # Set runtime override so the handler thinks we're on opus@openrouter
        result = self._dispatch(
            "openrouter", "anthropic/claude-opus-4.6", img, stub_aux=False,
        )

        assert isinstance(result, dict), \
            f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
        assert result.get("_multimodal") is True

    def test_non_vision_main_model_falls_through_to_aux(self, tmp_path):
        """Non-vision main model → fast path skipped, aux LLM path attempted."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        result = self._dispatch(
            "openrouter", "qwen/qwen3-coder", img, stub_aux=True,
        )

        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
            "Fast path fired for non-vision model; should have fallen through to aux LLM"

    def test_fast_path_disabled_for_unsupported_provider(self, tmp_path):
        """Even with vision-capable model, unknown provider → fall through."""
        img = tmp_path / "x.png"
        img.write_bytes(_TINY_PNG)

        result = self._dispatch(
            "brand-new-provider", "anthropic/claude-opus-4.6", img, stub_aux=True,
        )

        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
            "Fast path fired for unknown provider; should have fallen through"

View file

@ -403,6 +403,232 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None,
return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type) return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type)
# ---------------------------------------------------------------------------
# Native fast path: short-circuit the auxiliary LLM when the active main model
# supports native vision. Instead of asking a separate LLM to describe the
# image and returning text, we load the image, base64-encode it, and return a
# multimodal tool-result envelope. The agent loop unwraps the envelope into an
# OpenAI-style content list on the `tool` role; provider adapters (anthropic,
# codex_responses, chat_completions) translate that into Anthropic
# tool_result image blocks / Responses input_image / OpenAI image_url tool
# content. The main model then "sees" the pixels directly on its next turn.
# ---------------------------------------------------------------------------
def _supports_media_in_tool_results(provider: str, model: str) -> bool:
"""Whether the given provider+model combination accepts image content
inside a tool-result message.
Providers covered today (per spec docs verified Apr-2026):
* Anthropic Messages API (``anthropic`` provider, plus aggregators that
proxy Claude ``openrouter``, ``nous``, ``vertex``, ``bedrock``):
``tool_result`` blocks accept ``image`` content blocks.
* OpenAI Chat Completions: tool messages accept array content with
``image_url`` parts.
* OpenAI Responses (``openai-codex``): ``function_call_output.output``
accepts an array of ``input_text``/``input_image`` items.
* Gemini 3 (and proxied via aggregators): supports multimodal tool
results. Older Gemini does NOT.
For unknown / legacy providers we conservatively return False the
caller falls back to the legacy aux-LLM text path.
"""
if not isinstance(provider, str):
return False
p = provider.strip().lower()
if not p:
return False
# Aggregators that route to multiple vendors — assume support since
# users on these aggregators are typically using vision-capable
# frontier models. Falling back to text would be a regression for
# them.
_AGGREGATORS = {
"openrouter", "nous", "vertex", "bedrock", "anthropic-vertex",
"google-vertex",
}
if p in _AGGREGATORS:
return True
# Native Anthropic
if p in {"anthropic", "claude", "anthropic-direct"}:
return True
# OpenAI Chat Completions and Responses
if p in {"openai", "openai-chat", "openai-codex", "azure-openai"}:
return True
# Gemini — gate on model name; older Gemini variants did not support
# multimodal functionResponse. Gemini 3.x does.
if p in {"google", "gemini", "google-gemini", "google-vertex-gemini"}:
if not isinstance(model, str):
return False
m = model.strip().lower()
if "gemini-3" in m or "gemini-pro-3" in m or "gemini-flash-3" in m:
return True
return False
# Other vision-capable provider stacks. Conservative default: False.
# Add explicit entries here as we verify each provider's tool-result
# multimodal support empirically.
return False
def _build_native_vision_tool_result(
image_url: str,
question: str,
image_data_url: str,
image_size_bytes: int,
) -> Dict[str, Any]:
"""Build the multimodal tool-result envelope returned by the fast path.
Shape:
{
"_multimodal": True,
"content": [
{"type": "text", "text": "<short note + the user's question>"},
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}
],
"text_summary": "<plain-text fallback>",
"meta": {"image_url": ..., "size_bytes": N},
}
The text part exists for two reasons: (1) it gives the model an
instruction to act on now that the pixels are in context, and
(2) providers that don't support multimodal tool results can fall back
to ``text_summary``.
"""
# The tool-result text part is intentionally minimal. The model already
# has the user's original question in context; this just acknowledges
# the image is now visible and reminds it what it was asked.
text_part = (
"Image loaded into your context — you can see it natively now. "
"Use your built-in vision to answer the user."
)
if isinstance(question, str) and question.strip():
text_part += f"\n\nQuestion: {question.strip()}"
summary = (
f"Image attached natively for the main model "
f"({image_size_bytes / 1024:.1f} KB). "
"Answer using built-in vision."
)
return {
"_multimodal": True,
"content": [
{"type": "text", "text": text_part},
{"type": "image_url", "image_url": {"url": image_data_url}},
],
"text_summary": summary,
"meta": {
"image_url": image_url[:200],
"size_bytes": image_size_bytes,
"native_vision": True,
},
}
async def _vision_analyze_native(
    image_url: str,
    question: str,
) -> Any:
    """Fast path for vision-capable main models.

    Resolves *image_url* to a file on disk (an existing local path, a
    ``file://`` path, or a download of a URL accepted by
    ``_validate_image_url``), encodes it as a base64 data URL and wraps it
    in a multimodal tool-result envelope.

    Args:
        image_url: Local path, ``file://`` URL, or remote URL of the image.
        question: Optional question forwarded into the envelope's text part.

    Returns:
        A ``_multimodal`` envelope dict on success.
        A JSON error string (from ``tool_error``) on failure, matching the
        existing tool-result contract so the agent loop displays errors
        normally. Exceptions are caught and reported this way rather than
        propagated.
    """
    if not isinstance(image_url, str) or not image_url.strip():
        return tool_error("image_url is required", success=False)

    temp_image_path: Optional[Path] = None
    should_cleanup = False
    try:
        from tools.interrupt import is_interrupted
        if is_interrupted():
            return tool_error("Interrupted", success=False)

        # Resolve the image source (mirrors vision_analyze_tool's logic
        # so behaviour is consistent).
        resolved_url = image_url
        if resolved_url.startswith("file://"):
            resolved_url = resolved_url[len("file://"):]
        local_path = Path(os.path.expanduser(resolved_url))
        if local_path.is_file():
            # User-provided file: read it in place and never delete it.
            temp_image_path = local_path
        elif _validate_image_url(image_url):
            blocked = check_website_access(image_url)
            if blocked:
                return tool_error(blocked["message"], success=False)
            temp_dir = get_hermes_dir("cache/vision", "temp_vision_images")
            temp_image_path = temp_dir / f"temp_image_{uuid.uuid4()}.jpg"
            # Mark the file as ours BEFORE downloading: if the download
            # raises after the file was created, the ``finally`` block must
            # still remove it, otherwise failed downloads accumulate in the
            # cache directory.
            should_cleanup = True
            await _download_image(image_url, temp_image_path)
        else:
            return tool_error(
                "Invalid image source. Provide an HTTP/HTTPS URL or a "
                "valid local file path.",
                success=False,
            )

        # Size of the file as loaded; this is what the envelope reports even
        # if the payload is resized below.
        image_size_bytes = temp_image_path.stat().st_size
        detected_mime_type = _detect_image_mime_type(temp_image_path)
        if not detected_mime_type:
            return tool_error(
                "Only real image files are supported for vision analysis.",
                success=False,
            )

        image_data_url = _image_to_base64_data_url(
            temp_image_path, mime_type=detected_mime_type,
        )
        # Honour the same hard cap as the legacy path. Resize if needed.
        if len(image_data_url) > _MAX_BASE64_BYTES:
            image_data_url = _resize_image_for_vision(
                temp_image_path, mime_type=detected_mime_type,
            )
            if len(image_data_url) > _MAX_BASE64_BYTES:
                return tool_error(
                    f"Image too large for vision API: base64 payload is "
                    f"{len(image_data_url) / (1024 * 1024):.1f} MB "
                    f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) "
                    f"even after resizing. Install Pillow "
                    f"(`pip install Pillow`) for better auto-resize, "
                    f"or compress the image manually.",
                    success=False,
                )

        return _build_native_vision_tool_result(
            image_url=image_url,
            question=question,
            image_data_url=image_data_url,
            image_size_bytes=image_size_bytes,
        )
    except Exception as exc:
        # Boundary handler: the tool contract is "return an error string",
        # so unexpected failures are logged and reported, not raised.
        logger.warning("Native vision fast path failed: %s", exc)
        return tool_error(f"Native vision failed: {exc}", success=False)
    finally:
        # Only delete temp files we created — never user-provided paths.
        if should_cleanup and temp_image_path is not None:
            try:
                if temp_image_path.exists():
                    temp_image_path.unlink()
            except Exception:
                # Best-effort cleanup; a leftover cache file must not mask
                # the real result or error.
                pass
async def vision_analyze_tool( async def vision_analyze_tool(
image_url: str, image_url: str,
user_prompt: str, user_prompt: str,
@ -758,24 +984,25 @@ from tools.registry import registry, tool_error
VISION_ANALYZE_SCHEMA = { VISION_ANALYZE_SCHEMA = {
"name": "vision_analyze", "name": "vision_analyze",
"description": ( "description": (
"Inspect an image from a URL, file path, or tool output when you need " "Load an image into the conversation so you can see it. Accepts a "
"closer detail than what's visible in the conversation. If the user's " "URL, local file path, or data URL. When your active model has "
"image is already attached to the conversation and you can see it, " "native vision, the image is attached to your context directly "
"just answer directly — only call this tool for images referenced by " "and you read the pixels yourself on the next turn — call this "
"URL/path, images returned inside other tool results (browser " "any time the user references an image (filepath in their message, "
"screenshots, search thumbnails), or when you need a deeper look at " "URL in tool output, screenshot from the browser, etc.). For "
"a specific region the main model's vision may have missed." "non-vision models, falls back to an auxiliary vision model that "
"returns a text description."
), ),
"parameters": { "parameters": {
"type": "object", "type": "object",
"properties": { "properties": {
"image_url": { "image_url": {
"type": "string", "type": "string",
"description": "Image URL (http/https) or local file path to analyze." "description": "Image URL (http/https), local file path, or data: URL to load."
}, },
"question": { "question": {
"type": "string", "type": "string",
"description": "Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question." "description": "Your specific question or request about the image. Optional context the model uses on the next turn after seeing the image."
} }
}, },
"required": ["image_url", "question"] "required": ["image_url", "question"]
@ -786,6 +1013,31 @@ VISION_ANALYZE_SCHEMA = {
def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
image_url = args.get("image_url", "") image_url = args.get("image_url", "")
question = args.get("question", "") question = args.get("question", "")
# Fast path: when the active main model supports native vision AND the
# provider supports image content inside tool results, short-circuit
# the auxiliary LLM and return the image bytes as a multimodal
# tool-result envelope. The main model sees the pixels directly on its
# next turn — no aux call, no information loss, no extra latency.
try:
from agent.auxiliary_client import _read_main_provider, _read_main_model
from agent.image_routing import decide_image_input_mode
from hermes_cli.config import load_config
_provider = _read_main_provider()
_model = _read_main_model()
_cfg = load_config()
_mode = decide_image_input_mode(_provider, _model, _cfg)
if _mode == "native" and _supports_media_in_tool_results(_provider, _model):
logger.info(
"vision_analyze: native fast path (provider=%s, model=%s)",
_provider, _model,
)
return _vision_analyze_native(image_url, question)
except Exception as exc:
logger.debug("Native vision fast-path check failed; using aux LLM: %s", exc)
# Legacy path: aux LLM describes the image and we return its text.
full_prompt = ( full_prompt = (
"Fully describe and explain everything about this image, then answer the " "Fully describe and explain everything about this image, then answer the "
f"following question:\n\n{question}" f"following question:\n\n{question}"