From 3800972dd05eabed8d75bfc4c0f5d532d85dafe2 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 9 May 2026 21:06:19 -0700 Subject: [PATCH] feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955) When the active main model has native vision and the provider supports multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini 3, OpenRouter, Nous), vision_analyze loads the image bytes and returns them to the model as a multimodal tool-result envelope. The model then sees the pixels directly on its next turn instead of receiving a lossy text description from an auxiliary LLM. Falls back to the legacy aux-LLM text path for non-vision models and unverified providers. Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and Cline. All four converge on the same pattern: tool results carry image content blocks for vision-capable provider/model combinations. Changes - tools/vision_tools.py: _vision_analyze_native fast path + provider capability table (_supports_media_in_tool_results). Schema description updated to reflect new behaviour. - agent/codex_responses_adapter.py: function_call_output.output now accepts the array form for multimodal tool results (was string-only). Preflight validates input_text/input_image parts. - agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so tools see the live CLI/gateway override, not the stale config.yaml default. set_runtime_main()/clear_runtime_main() helpers. - run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn start so vision_analyze's fast-path check sees the actual runtime. - tests/conftest.py: clear runtime-main override between tests. Tests - tests/tools/test_vision_native_fast_path.py: provider capability table, envelope shape, fast-path gating (vision-capable model uses fast path; non-vision model falls through to aux). 
- tests/run_agent/test_codex_multimodal_tool_result.py: list tool content becomes function_call_output.output array; preflight preserves arrays and drops unknown part types. Live verified - Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a typed filepath, gets pixels back, reads exact text from images that no aux description could capture (font color irony, multi-line fruit-count list, etc.). PR replaces the closed prior efforts (#16506 shipped the inbound user-attached path; this PR closes the gap for tool-discovered images). --- agent/auxiliary_client.py | 41 +++ agent/codex_responses_adapter.py | 53 +++- run_agent.py | 14 + tests/conftest.py | 9 + .../test_codex_multimodal_tool_result.py | 173 +++++++++++ tests/tools/test_vision_native_fast_path.py | 207 ++++++++++++++ tools/vision_tools.py | 270 +++++++++++++++++- 7 files changed, 757 insertions(+), 10 deletions(-) create mode 100644 tests/run_agent/test_codex_multimodal_tool_result.py create mode 100644 tests/tools/test_vision_native_fast_path.py diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 874b7d7b963..fbd79989209 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -1463,7 +1463,16 @@ def _read_main_model() -> str: config.yaml model.default is the single source of truth for the active model. Environment variables are no longer consulted. + + Runtime override: when an AIAgent is active with a CLI/gateway-provided + model that differs from config.yaml, ``set_runtime_main()`` records the + override in a process-local global. This is consulted FIRST so tools + that gate on "the active main model" (e.g. ``vision_analyze``'s native + fast path) see the live runtime, not the persisted config default. 
""" + override = _RUNTIME_MAIN_MODEL + if isinstance(override, str) and override.strip(): + return override.strip() try: from hermes_cli.config import load_config cfg = load_config() @@ -1484,7 +1493,13 @@ def _read_main_provider() -> str: Returns the lowercase provider id (e.g. "alibaba", "openrouter") or "" if not configured. + + Runtime override: see ``_read_main_model`` — same mechanism for the + provider half of the runtime tuple. """ + override = _RUNTIME_MAIN_PROVIDER + if isinstance(override, str) and override.strip(): + return override.strip().lower() try: from hermes_cli.config import load_config cfg = load_config() @@ -1498,6 +1513,32 @@ def _read_main_provider() -> str: return "" +# Process-local override set by AIAgent at session/turn start. Single-threaded +# per turn — no lock needed. Cleared by ``clear_runtime_main()``. +_RUNTIME_MAIN_PROVIDER: str = "" +_RUNTIME_MAIN_MODEL: str = "" + + +def set_runtime_main(provider: str, model: str) -> None: + """Record the live runtime provider/model for the current AIAgent. + + Called by ``run_agent.AIAgent._sync_runtime_main_for_aux_routing`` (or + equivalent setter) at the top of each turn so that + ``_read_main_provider`` / ``_read_main_model`` reflect CLI/gateway + overrides instead of the stale config.yaml default. + """ + global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL + _RUNTIME_MAIN_PROVIDER = (provider or "").strip().lower() + _RUNTIME_MAIN_MODEL = (model or "").strip() + + +def clear_runtime_main() -> None: + """Clear the runtime override (e.g. on session end).""" + global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL + _RUNTIME_MAIN_PROVIDER = "" + _RUNTIME_MAIN_MODEL = "" + + def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]: """Resolve the active custom/main endpoint the same way the main CLI does. 
diff --git a/agent/codex_responses_adapter.py b/agent/codex_responses_adapter.py index c5d6dfcea48..ef4119ceb89 100644 --- a/agent/codex_responses_adapter.py +++ b/agent/codex_responses_adapter.py @@ -410,10 +410,29 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di call_id = raw_tool_call_id.strip() if not isinstance(call_id, str) or not call_id.strip(): continue + + # Multimodal tool result: convert OpenAI-style content list into + # Responses ``function_call_output.output`` array. The Responses + # API accepts ``output`` as either a string or an array of + # ``input_text``/``input_image`` items. See + # https://developers.openai.com/api/reference/python/resources/responses/. + tool_content = msg.get("content") + output_value: Any + if isinstance(tool_content, list): + converted = _chat_content_to_responses_parts( + tool_content, role="user", + ) + if converted: + output_value = converted + else: + output_value = "" + else: + output_value = str(tool_content or "") + items.append({ "type": "function_call_output", "call_id": call_id, - "output": str(msg.get("content", "") or ""), + "output": output_value, }) return items @@ -466,6 +485,38 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]: output = item.get("output", "") if output is None: output = "" + # Output may be a string OR an array of structured content + # items (input_text / input_image) for multimodal tool results. + # Both shapes are accepted by the Responses API. We preserve + # the array form when present. + if isinstance(output, list): + # Validate each item is a recognised content shape; drop + # anything else to avoid 4xx from the API. 
+ cleaned: List[Dict[str, Any]] = [] + for part in output: + if not isinstance(part, dict): + continue + ptype = part.get("type") + if ptype == "input_text": + text = part.get("text") + if isinstance(text, str) and text: + cleaned.append({"type": "input_text", "text": text}) + elif ptype == "input_image": + url = part.get("image_url") + if isinstance(url, str) and url: + entry: Dict[str, Any] = {"type": "input_image", "image_url": url} + detail = part.get("detail") + if isinstance(detail, str) and detail.strip(): + entry["detail"] = detail.strip() + cleaned.append(entry) + normalized.append( + { + "type": "function_call_output", + "call_id": call_id.strip(), + "output": cleaned if cleaned else "", + } + ) + continue if not isinstance(output, str): output = str(output) diff --git a/run_agent.py b/run_agent.py index 8ae39c6faf0..a4c2465d04d 100644 --- a/run_agent.py +++ b/run_agent.py @@ -11119,6 +11119,20 @@ class AIAgent: self._ensure_db_session() + # Tell auxiliary_client what the live main provider/model are for + # this turn. Used by tools whose behaviour depends on the active + # main model (e.g. vision_analyze's native fast path) so they see + # the CLI/gateway override instead of the stale config.yaml + # default. Idempotent — fine to call every turn. + try: + from agent.auxiliary_client import set_runtime_main + set_runtime_main( + getattr(self, "provider", "") or "", + getattr(self, "model", "") or "", + ) + except Exception: + pass + # Tag all log records on this thread with the session ID so # ``hermes logs --session `` can filter a single conversation. 
from hermes_logging import set_session_context diff --git a/tests/conftest.py b/tests/conftest.py index 4fc15fd1e00..651a48b3916 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -427,6 +427,15 @@ def _reset_module_state(): except Exception: pass + # --- agent.auxiliary_client — runtime main provider/model override --- + # Set per-turn by AIAgent.run_conversation; tests that import it must + # see a clean state so config.yaml fallback works as expected. + try: + from agent import auxiliary_client as _aux_mod + _aux_mod.clear_runtime_main() + except Exception: + pass + # --- tools.file_tools — per-task read history + file-ops cache --- # _read_tracker accumulates per-task_id read history for loop detection, # capped by _READ_HISTORY_CAP. If entries from a prior test persist, the diff --git a/tests/run_agent/test_codex_multimodal_tool_result.py b/tests/run_agent/test_codex_multimodal_tool_result.py new file mode 100644 index 00000000000..e02fe1eda77 --- /dev/null +++ b/tests/run_agent/test_codex_multimodal_tool_result.py @@ -0,0 +1,173 @@ +"""Tests for codex_responses_adapter multimodal tool-result handling. + +Tool messages can contain a list of OpenAI-style content parts +(``[{type:"text"...}, {type:"image_url"...}]``) when the +``vision_analyze`` native fast path returns image bytes for the main model. +This file verifies the Codex Responses adapter: + + 1. Converts that list into ``function_call_output.output`` as an array of + ``input_text``/``input_image`` items (not a stringified blob). + 2. Preserves array-shaped output through the preflight validator. 
+""" + +from __future__ import annotations + +from agent.codex_responses_adapter import ( + _chat_messages_to_responses_input, + _preflight_codex_input_items, +) + + +def _build_messages_with_multimodal_tool_result(): + return [ + {"role": "user", "content": "What's in /tmp/foo.png?"}, + { + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_abc", + "type": "function", + "function": { + "name": "vision_analyze", + "arguments": '{"image_url": "/tmp/foo.png", "question": "describe"}', + }, + }], + }, + { + "role": "tool", + "name": "vision_analyze", + "tool_call_id": "call_abc", + "content": [ + {"type": "text", "text": "Image loaded."}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,XYZ"}}, + ], + }, + ] + + +class TestMultimodalToolResultConversion: + def test_list_content_becomes_output_array(self): + items = _chat_messages_to_responses_input( + _build_messages_with_multimodal_tool_result() + ) + # Find the function_call_output item + outputs = [it for it in items if it.get("type") == "function_call_output"] + assert len(outputs) == 1 + out = outputs[0] + assert out["call_id"] == "call_abc" + # Output should be a LIST (array form), not a string + assert isinstance(out["output"], list), \ + f"Expected array output for multimodal tool result, got {type(out['output']).__name__}: {out['output']!r}" + types = [p.get("type") for p in out["output"]] + assert "input_text" in types + assert "input_image" in types + + def test_input_image_preserves_data_url(self): + items = _chat_messages_to_responses_input( + _build_messages_with_multimodal_tool_result() + ) + out = next(it for it in items if it.get("type") == "function_call_output") + image_parts = [p for p in out["output"] if p.get("type") == "input_image"] + assert len(image_parts) == 1 + assert image_parts[0]["image_url"] == "data:image/png;base64,XYZ" + + def test_string_tool_content_still_string_output(self): + msgs = [ + {"role": "user", "content": "hi"}, + { + "role": 
"assistant", "content": "", + "tool_calls": [{ + "id": "call_x", "type": "function", + "function": {"name": "terminal", "arguments": "{}"}, + }], + }, + { + "role": "tool", "name": "terminal", "tool_call_id": "call_x", + "content": "ls output here", + }, + ] + items = _chat_messages_to_responses_input(msgs) + out = next(it for it in items if it.get("type") == "function_call_output") + assert isinstance(out["output"], str) + assert out["output"] == "ls output here" + + +class TestPreflightAcceptsArrayOutput: + def test_preflight_passes_array_through(self): + raw = [ + { + "type": "function_call", + "call_id": "call_abc", + "name": "vision_analyze", + "arguments": "{}", + }, + { + "type": "function_call_output", + "call_id": "call_abc", + "output": [ + {"type": "input_text", "text": "Image loaded."}, + {"type": "input_image", "image_url": "data:image/png;base64,ABC"}, + ], + }, + ] + normalized = _preflight_codex_input_items(raw) + out = [it for it in normalized if it.get("type") == "function_call_output"][0] + assert isinstance(out["output"], list) + assert len(out["output"]) == 2 + assert out["output"][1]["type"] == "input_image" + assert out["output"][1]["image_url"] == "data:image/png;base64,ABC" + + def test_preflight_drops_unknown_part_types(self): + raw = [ + { + "type": "function_call", + "call_id": "call_abc", "name": "vision_analyze", "arguments": "{}", + }, + { + "type": "function_call_output", + "call_id": "call_abc", + "output": [ + {"type": "input_text", "text": "ok"}, + {"type": "garbage", "data": "nope"}, # unknown — should be dropped + {"type": "input_image", "image_url": "data:image/png;base64,ZZ"}, + ], + }, + ] + normalized = _preflight_codex_input_items(raw) + out = [it for it in normalized if it.get("type") == "function_call_output"][0] + # The "garbage" part is dropped; valid parts remain + types = [p.get("type") for p in out["output"]] + assert types == ["input_text", "input_image"] + + def 
test_preflight_empty_array_becomes_empty_string(self): + # Defensive: an array with no valid parts shouldn't break the API call + raw = [ + { + "type": "function_call", + "call_id": "call_x", "name": "vision_analyze", "arguments": "{}", + }, + { + "type": "function_call_output", + "call_id": "call_x", + "output": [{"type": "garbage"}], # all dropped + }, + ] + normalized = _preflight_codex_input_items(raw) + out = [it for it in normalized if it.get("type") == "function_call_output"][0] + assert out["output"] == "" + + def test_preflight_string_output_unchanged(self): + raw = [ + { + "type": "function_call", + "call_id": "call_x", "name": "terminal", "arguments": "{}", + }, + { + "type": "function_call_output", + "call_id": "call_x", + "output": "plain text output", + }, + ] + normalized = _preflight_codex_input_items(raw) + out = [it for it in normalized if it.get("type") == "function_call_output"][0] + assert out["output"] == "plain text output" diff --git a/tests/tools/test_vision_native_fast_path.py b/tests/tools/test_vision_native_fast_path.py new file mode 100644 index 00000000000..fce3772de8e --- /dev/null +++ b/tests/tools/test_vision_native_fast_path.py @@ -0,0 +1,207 @@ +"""Tests for the native-vision fast path inside vision_analyze. + +When the active main model supports native vision AND the provider supports +image content inside tool-result messages, ``_handle_vision_analyze`` skips +the auxiliary LLM and returns a multimodal envelope so the main model sees +the pixels directly on its next turn. +""" + +from __future__ import annotations + +import asyncio +import base64 +import json +from pathlib import Path +from unittest.mock import patch + +import pytest + +from tools.vision_tools import ( + _build_native_vision_tool_result, + _handle_vision_analyze, + _supports_media_in_tool_results, + _vision_analyze_native, +) + + +# Minimal valid 1x1 PNG bytes. 
+_TINY_PNG = base64.b64decode( + b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" +) + + +# ─── _supports_media_in_tool_results ───────────────────────────────────────── + + +class TestSupportsMediaInToolResults: + def test_anthropic_native_yes(self): + assert _supports_media_in_tool_results("anthropic", "claude-opus-4-6") is True + + def test_openrouter_yes(self): + assert _supports_media_in_tool_results("openrouter", "anthropic/claude-opus-4.6") is True + + def test_nous_yes(self): + assert _supports_media_in_tool_results("nous", "anthropic/claude-sonnet-4.6") is True + + def test_openai_chat_yes(self): + assert _supports_media_in_tool_results("openai", "gpt-5.4") is True + + def test_openai_codex_yes(self): + assert _supports_media_in_tool_results("openai-codex", "gpt-5-codex") is True + + def test_gemini_3_yes(self): + assert _supports_media_in_tool_results("google", "gemini-3-flash-preview") is True + + def test_gemini_2_no(self): + assert _supports_media_in_tool_results("google", "gemini-2.5-pro") is False + + def test_unknown_provider_conservative_no(self): + assert _supports_media_in_tool_results("brand-new-provider", "any-model") is False + + def test_empty_provider_no(self): + assert _supports_media_in_tool_results("", "anything") is False + assert _supports_media_in_tool_results(None, "anything") is False # type: ignore[arg-type] + + +# ─── _build_native_vision_tool_result ──────────────────────────────────────── + + +class TestBuildNativeVisionToolResult: + def test_envelope_shape(self): + env = _build_native_vision_tool_result( + image_url="/tmp/foo.png", + question="what does it say?", + image_data_url="data:image/png;base64,XYZ", + image_size_bytes=1024, + ) + assert env["_multimodal"] is True + assert isinstance(env["content"], list) + assert len(env["content"]) == 2 + assert env["content"][0]["type"] == "text" + assert env["content"][1]["type"] == "image_url" + assert 
env["content"][1]["image_url"]["url"] == "data:image/png;base64,XYZ" + assert "what does it say?" in env["content"][0]["text"] + assert "Image attached natively" in env["text_summary"] + + def test_no_question_omits_question_section(self): + env = _build_native_vision_tool_result( + image_url="/tmp/foo.png", + question="", + image_data_url="data:image/png;base64,XYZ", + image_size_bytes=512, + ) + text = env["content"][0]["text"] + assert "Question:" not in text + assert "Image loaded" in text + + +# ─── _vision_analyze_native ────────────────────────────────────────────────── + + +class TestVisionAnalyzeNative: + def test_local_file_returns_multimodal_envelope(self, tmp_path): + img = tmp_path / "test.png" + img.write_bytes(_TINY_PNG) + result = asyncio.get_event_loop().run_until_complete( + _vision_analyze_native(str(img), "what is this?") + ) + assert isinstance(result, dict) + assert result.get("_multimodal") is True + parts = result["content"] + assert any(p.get("type") == "image_url" for p in parts) + assert any(p.get("type") == "text" for p in parts) + url = next(p["image_url"]["url"] for p in parts if p.get("type") == "image_url") + assert url.startswith("data:image/") + + def test_missing_file_returns_error_string(self, tmp_path): + result = asyncio.get_event_loop().run_until_complete( + _vision_analyze_native(str(tmp_path / "nope.png"), "?") + ) + # tool_error returns a JSON string, not the multimodal envelope + assert isinstance(result, str) + parsed = json.loads(result) + assert parsed.get("success") is False + assert "Invalid image source" in parsed.get("error", "") + + def test_empty_image_url_returns_error(self): + result = asyncio.get_event_loop().run_until_complete( + _vision_analyze_native("", "?") + ) + assert isinstance(result, str) + parsed = json.loads(result) + assert parsed.get("success") is False + assert "image_url is required" in parsed.get("error", "") + + def test_file_url_scheme_resolves(self, tmp_path): + img = tmp_path / "t.png" + 
img.write_bytes(_TINY_PNG) + result = asyncio.get_event_loop().run_until_complete( + _vision_analyze_native(f"file://{img}", "?") + ) + assert isinstance(result, dict) + assert result.get("_multimodal") is True + + +# ─── _handle_vision_analyze fast-path gating ───────────────────────────────── + + +class TestHandleVisionAnalyzeFastPath: + """Verify the dispatcher chooses fast-path vs aux-LLM correctly.""" + + def test_vision_capable_main_model_uses_fast_path(self, tmp_path, monkeypatch): + """Main model supports native vision → fast path returns multimodal.""" + img = tmp_path / "x.png" + img.write_bytes(_TINY_PNG) + + # Set runtime override so the handler thinks we're on opus@openrouter + from agent.auxiliary_client import set_runtime_main, clear_runtime_main + set_runtime_main("openrouter", "anthropic/claude-opus-4.6") + try: + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) + finally: + clear_runtime_main() + + assert isinstance(result, dict), \ + f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}" + assert result.get("_multimodal") is True + + def test_non_vision_main_model_falls_through_to_aux(self, tmp_path, monkeypatch): + """Non-vision main model → fast path skipped, aux LLM path attempted.""" + img = tmp_path / "x.png" + img.write_bytes(_TINY_PNG) + + async def _aux_sentinel(*args, **kwargs): + return '{"sentinel": "aux-path"}' + + from agent.auxiliary_client import set_runtime_main, clear_runtime_main + set_runtime_main("openrouter", "qwen/qwen3-coder") + try: + with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel): + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) + finally: + clear_runtime_main() + + assert not (isinstance(result, dict) and result.get("_multimodal") is True), \ + "Fast path fired for non-vision model; should have 
fallen through to aux LLM" + + def test_fast_path_disabled_for_unsupported_provider(self, tmp_path, monkeypatch): + """Even with vision-capable model, unknown provider → fall through.""" + img = tmp_path / "x.png" + img.write_bytes(_TINY_PNG) + + async def _aux_sentinel(*args, **kwargs): + return '{"sentinel": "aux-path"}' + + from agent.auxiliary_client import set_runtime_main, clear_runtime_main + set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6") + try: + with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel): + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) + finally: + clear_runtime_main() + + assert not (isinstance(result, dict) and result.get("_multimodal") is True), \ + "Fast path fired for unknown provider; should have fallen through" diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 611e6bcef60..d8c6f64f021 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -403,6 +403,232 @@ def _resize_image_for_vision(image_path: Path, mime_type: Optional[str] = None, return data_url or _image_to_base64_data_url(image_path, mime_type=mime_type) +# --------------------------------------------------------------------------- +# Native fast path: short-circuit the auxiliary LLM when the active main model +# supports native vision. Instead of asking a separate LLM to describe the +# image and returning text, we load the image, base64-encode it, and return a +# multimodal tool-result envelope. The agent loop unwraps the envelope into an +# OpenAI-style content list on the `tool` role; provider adapters (anthropic, +# codex_responses, chat_completions) translate that into Anthropic +# tool_result image blocks / Responses input_image / OpenAI image_url tool +# content. The main model then "sees" the pixels directly on its next turn. 
+# --------------------------------------------------------------------------- + + +def _supports_media_in_tool_results(provider: str, model: str) -> bool: + """Whether the given provider+model combination accepts image content + inside a tool-result message. + + Providers covered today (per spec docs verified Apr-2026): + + * Anthropic Messages API (``anthropic`` provider, plus aggregators that + proxy Claude — ``openrouter``, ``nous``, ``vertex``, ``bedrock``): + ``tool_result`` blocks accept ``image`` content blocks. + * OpenAI Chat Completions: tool messages accept array content with + ``image_url`` parts. + * OpenAI Responses (``openai-codex``): ``function_call_output.output`` + accepts an array of ``input_text``/``input_image`` items. + * Gemini 3 (and proxied via aggregators): supports multimodal tool + results. Older Gemini does NOT. + + For unknown / legacy providers we conservatively return False — the + caller falls back to the legacy aux-LLM text path. + """ + if not isinstance(provider, str): + return False + p = provider.strip().lower() + if not p: + return False + + # Aggregators that route to multiple vendors — assume support since + # users on these aggregators are typically using vision-capable + # frontier models. Falling back to text would be a regression for + # them. + _AGGREGATORS = { + "openrouter", "nous", "vertex", "bedrock", "anthropic-vertex", + "google-vertex", + } + if p in _AGGREGATORS: + return True + + # Native Anthropic + if p in {"anthropic", "claude", "anthropic-direct"}: + return True + + # OpenAI Chat Completions and Responses + if p in {"openai", "openai-chat", "openai-codex", "azure-openai"}: + return True + + # Gemini — gate on model name; older Gemini variants did not support + # multimodal functionResponse. Gemini 3.x does. 
+ if p in {"google", "gemini", "google-gemini", "google-vertex-gemini"}: + if not isinstance(model, str): + return False + m = model.strip().lower() + if "gemini-3" in m or "gemini-pro-3" in m or "gemini-flash-3" in m: + return True + return False + + # Other vision-capable provider stacks. Conservative default: False. + # Add explicit entries here as we verify each provider's tool-result + # multimodal support empirically. + return False + + +def _build_native_vision_tool_result( + image_url: str, + question: str, + image_data_url: str, + image_size_bytes: int, +) -> Dict[str, Any]: + """Build the multimodal tool-result envelope returned by the fast path. + + Shape: + { + "_multimodal": True, + "content": [ + {"type": "text", "text": ""}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}} + ], + "text_summary": "", + "meta": {"image_url": ..., "size_bytes": N}, + } + + The text part exists for two reasons: (1) it gives the model an + instruction to act on now that the pixels are in context, and + (2) providers that don't support multimodal tool results can fall back + to ``text_summary``. + """ + # The tool-result text part is intentionally minimal. The model already + # has the user's original question in context; this just acknowledges + # the image is now visible and reminds it what it was asked. + text_part = ( + "Image loaded into your context — you can see it natively now. " + "Use your built-in vision to answer the user." + ) + if isinstance(question, str) and question.strip(): + text_part += f"\n\nQuestion: {question.strip()}" + + summary = ( + f"Image attached natively for the main model " + f"({image_size_bytes / 1024:.1f} KB). " + "Answer using built-in vision." 
+ ) + + return { + "_multimodal": True, + "content": [ + {"type": "text", "text": text_part}, + {"type": "image_url", "image_url": {"url": image_data_url}}, + ], + "text_summary": summary, + "meta": { + "image_url": image_url[:200], + "size_bytes": image_size_bytes, + "native_vision": True, + }, + } + + +async def _vision_analyze_native( + image_url: str, + question: str, +) -> Any: + """Fast path for vision-capable main models. + + Loads the image (local file OR remote URL), base64-encodes it, and + returns a multimodal tool-result envelope. The agent loop unwraps it; + provider adapters serialize it into the right tool-result-with-image + shape for each backend. + + Returns: + A ``_multimodal`` envelope dict on success. + A JSON error string on failure (matches the existing tool-result + contract so the agent loop displays errors normally). + """ + if not isinstance(image_url, str) or not image_url.strip(): + return tool_error("image_url is required", success=False) + + temp_image_path: Optional[Path] = None + should_cleanup = False + try: + from tools.interrupt import is_interrupted + if is_interrupted(): + return tool_error("Interrupted", success=False) + + # Resolve the image source (mirrors vision_analyze_tool's logic + # exactly so behaviour is consistent). + resolved_url = image_url + if resolved_url.startswith("file://"): + resolved_url = resolved_url[len("file://"):] + local_path = Path(os.path.expanduser(resolved_url)) + + if local_path.is_file(): + temp_image_path = local_path + should_cleanup = False + elif _validate_image_url(image_url): + blocked = check_website_access(image_url) + if blocked: + return tool_error(blocked["message"], success=False) + temp_dir = get_hermes_dir("cache/vision", "temp_vision_images") + temp_image_path = temp_dir / f"temp_image_{uuid.uuid4()}.jpg" + await _download_image(image_url, temp_image_path) + should_cleanup = True + else: + return tool_error( + "Invalid image source. 
Provide an HTTP/HTTPS URL or a " + "valid local file path.", + success=False, + ) + + image_size_bytes = temp_image_path.stat().st_size + detected_mime_type = _detect_image_mime_type(temp_image_path) + if not detected_mime_type: + return tool_error( + "Only real image files are supported for vision analysis.", + success=False, + ) + + image_data_url = _image_to_base64_data_url( + temp_image_path, mime_type=detected_mime_type, + ) + + # Honour the same hard cap as the legacy path. Resize if needed. + if len(image_data_url) > _MAX_BASE64_BYTES: + image_data_url = _resize_image_for_vision( + temp_image_path, mime_type=detected_mime_type, + ) + if len(image_data_url) > _MAX_BASE64_BYTES: + return tool_error( + f"Image too large for vision API: base64 payload is " + f"{len(image_data_url) / (1024 * 1024):.1f} MB " + f"(limit {_MAX_BASE64_BYTES / (1024 * 1024):.0f} MB) " + f"even after resizing. Install Pillow " + f"(`pip install Pillow`) for better auto-resize, " + f"or compress the image manually.", + success=False, + ) + + return _build_native_vision_tool_result( + image_url=image_url, + question=question, + image_data_url=image_data_url, + image_size_bytes=image_size_bytes, + ) + + except Exception as exc: + logger.warning("Native vision fast path failed: %s", exc) + return tool_error(f"Native vision failed: {exc}", success=False) + finally: + # Only delete temp files we created — never user-provided paths. + if should_cleanup and temp_image_path is not None: + try: + if temp_image_path.exists(): + temp_image_path.unlink() + except Exception: + pass + + async def vision_analyze_tool( image_url: str, user_prompt: str, @@ -758,24 +984,25 @@ from tools.registry import registry, tool_error VISION_ANALYZE_SCHEMA = { "name": "vision_analyze", "description": ( - "Inspect an image from a URL, file path, or tool output when you need " - "closer detail than what's visible in the conversation. 
If the user's " - "image is already attached to the conversation and you can see it, " - "just answer directly — only call this tool for images referenced by " - "URL/path, images returned inside other tool results (browser " - "screenshots, search thumbnails), or when you need a deeper look at " - "a specific region the main model's vision may have missed." + "Load an image into the conversation so you can see it. Accepts a " + "URL, local file path, or data URL. When your active model has " + "native vision, the image is attached to your context directly " + "and you read the pixels yourself on the next turn — call this " + "any time the user references an image (filepath in their message, " + "URL in tool output, screenshot from the browser, etc.). For " + "non-vision models, falls back to an auxiliary vision model that " + "returns a text description." ), "parameters": { "type": "object", "properties": { "image_url": { "type": "string", - "description": "Image URL (http/https) or local file path to analyze." + "description": "Image URL (http/https), local file path, or data: URL to load." }, "question": { "type": "string", - "description": "Your specific question or request about the image to resolve. The AI will automatically provide a complete image description AND answer your specific question." + "description": "Your specific question or request about the image. Optional context the model uses on the next turn after seeing the image." } }, "required": ["image_url", "question"] @@ -786,6 +1013,31 @@ VISION_ANALYZE_SCHEMA = { def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: image_url = args.get("image_url", "") question = args.get("question", "") + + # Fast path: when the active main model supports native vision AND the + # provider supports image content inside tool results, short-circuit + # the auxiliary LLM and return the image bytes as a multimodal + # tool-result envelope. 
The main model sees the pixels directly on its + # next turn — no aux call, no information loss, no extra latency. + try: + from agent.auxiliary_client import _read_main_provider, _read_main_model + from agent.image_routing import decide_image_input_mode + from hermes_cli.config import load_config + + _provider = _read_main_provider() + _model = _read_main_model() + _cfg = load_config() + _mode = decide_image_input_mode(_provider, _model, _cfg) + if _mode == "native" and _supports_media_in_tool_results(_provider, _model): + logger.info( + "vision_analyze: native fast path (provider=%s, model=%s)", + _provider, _model, + ) + return _vision_analyze_native(image_url, question) + except Exception as exc: + logger.debug("Native vision fast-path check failed; using aux LLM: %s", exc) + + # Legacy path: aux LLM describes the image and we return its text. full_prompt = ( "Fully describe and explain everything about this image, then answer the " f"following question:\n\n{question}"