feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)

When the active main model has native vision and the provider supports multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini 3, OpenRouter, Nous), vision_analyze loads the image bytes and returns them to the model as a multimodal tool-result envelope. The model then sees the pixels directly on its next turn instead of receiving a lossy text description from an auxiliary LLM. Falls back to the legacy aux-LLM text path for non-vision models and unverified providers. Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and Cline. All four converge on the same pattern: tool results carry image content blocks for vision-capable provider/model combinations. Changes - tools/vision_tools.py: _vision_analyze_native fast path + provider capability table (_supports_media_in_tool_results). Schema description updated to reflect new behaviour. - agent/codex_responses_adapter.py: function_call_output.output now accepts the array form for multimodal tool results (was string-only). Preflight validates input_text/input_image parts. - agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so tools see the live CLI/gateway override, not the stale config.yaml default. set_runtime_main()/clear_runtime_main() helpers. - run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn start so vision_analyze's fast-path check sees the actual runtime. - tests/conftest.py: clear runtime-main override between tests. Tests - tests/tools/test_vision_native_fast_path.py: provider capability table, envelope shape, fast-path gating (vision-capable model uses fast path; non-vision model falls through to aux). - tests/run_agent/test_codex_multimodal_tool_result.py: list tool content becomes function_call_output.output array; preflight preserves arrays and drops unknown part types. 
Live verified - Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a typed filepath, gets pixels back, reads exact text from images that no aux description could capture (font color irony, multi-line fruit-count list, etc.). PR replaces the closed prior efforts (#16506 shipped the inbound user-attached path; this PR closes the gap for tool-discovered images).
2026-05-20 05:01:30 +00:00 · 2026-05-09 21:06:19 -07:00 · 2026-05-09 21:06:19 -07:00 · 3800972dd0
commit 3800972dd0
parent e62250453b
7 changed files with 757 additions and 10 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -427,6 +427,15 @@ def _reset_module_state():
    except Exception:
        pass

+    # --- agent.auxiliary_client — runtime main provider/model override ---
+    # Set per-turn by AIAgent.run_conversation; tests that import it must
+    # see a clean state so config.yaml fallback works as expected.
+    try:
+        from agent import auxiliary_client as _aux_mod
+        _aux_mod.clear_runtime_main()
+    except Exception:
+        pass
+
    # --- tools.file_tools — per-task read history + file-ops cache ---
    # _read_tracker accumulates per-task_id read history for loop detection,
    # capped by _READ_HISTORY_CAP. If entries from a prior test persist, the
--- a/tests/run_agent/test_codex_multimodal_tool_result.py
+++ b/tests/run_agent/test_codex_multimodal_tool_result.py
@ -0,0 +1,173 @@
+"""Tests for codex_responses_adapter multimodal tool-result handling.
+
+Tool messages can contain a list of OpenAI-style content parts
+(``[{type:"text"...}, {type:"image_url"...}]``) when the
+``vision_analyze`` native fast path returns image bytes for the main model.
+This file verifies the Codex Responses adapter:
+
+  1. Converts that list into ``function_call_output.output`` as an array of
+     ``input_text``/``input_image`` items (not a stringified blob).
+  2. Preserves array-shaped output through the preflight validator.
+"""
+
+from __future__ import annotations
+
+from agent.codex_responses_adapter import (
+    _chat_messages_to_responses_input,
+    _preflight_codex_input_items,
+)
+
+
+def _build_messages_with_multimodal_tool_result():
+    """Return a chat transcript whose tool reply carries list-shaped content.
+
+    The transcript has three messages: a user question, an assistant turn
+    issuing a single ``vision_analyze`` tool call (id ``call_abc``), and the
+    matching tool message whose ``content`` is a list with one text part and
+    one ``image_url`` part rather than a plain string.
+    """
+    user_turn = {"role": "user", "content": "What's in /tmp/foo.png?"}
+    vision_call = {
+        "id": "call_abc",
+        "type": "function",
+        "function": {
+            "name": "vision_analyze",
+            "arguments": '{"image_url": "/tmp/foo.png", "question": "describe"}',
+        },
+    }
+    assistant_turn = {"role": "assistant", "content": "", "tool_calls": [vision_call]}
+    # OpenAI-style content parts: text first, then the image as a data URL.
+    tool_parts = [
+        {"type": "text", "text": "Image loaded."},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,XYZ"}},
+    ]
+    tool_turn = {
+        "role": "tool",
+        "name": "vision_analyze",
+        "tool_call_id": "call_abc",
+        "content": tool_parts,
+    }
+    return [user_turn, assistant_turn, tool_turn]
+
+
+class TestMultimodalToolResultConversion:
+    """Conversion of chat tool messages into ``function_call_output`` items."""
+
+    def test_list_content_becomes_output_array(self):
+        """List-valued tool content yields an array ``output``, not a string."""
+        converted = _chat_messages_to_responses_input(
+            _build_messages_with_multimodal_tool_result()
+        )
+        call_outputs = [
+            item for item in converted if item.get("type") == "function_call_output"
+        ]
+        assert len(call_outputs) == 1
+        result_item = call_outputs[0]
+        assert result_item["call_id"] == "call_abc"
+        payload = result_item["output"]
+        # The payload must stay structured (array form) for the image to survive.
+        assert isinstance(payload, list), \
+            f"Expected array output for multimodal tool result, got {type(payload).__name__}: {payload!r}"
+        part_types = [part.get("type") for part in payload]
+        assert "input_text" in part_types
+        assert "input_image" in part_types
+
+    def test_input_image_preserves_data_url(self):
+        """The data URL from the ``image_url`` part is carried over verbatim."""
+        converted = _chat_messages_to_responses_input(
+            _build_messages_with_multimodal_tool_result()
+        )
+        result_item = next(
+            item for item in converted if item.get("type") == "function_call_output"
+        )
+        images = [
+            part for part in result_item["output"] if part.get("type") == "input_image"
+        ]
+        assert len(images) == 1
+        assert images[0]["image_url"] == "data:image/png;base64,XYZ"
+
+    def test_string_tool_content_still_string_output(self):
+        """Plain-string tool content keeps producing a plain-string ``output``."""
+        terminal_call = {
+            "id": "call_x", "type": "function",
+            "function": {"name": "terminal", "arguments": "{}"},
+        }
+        transcript = [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": "", "tool_calls": [terminal_call]},
+            {
+                "role": "tool", "name": "terminal", "tool_call_id": "call_x",
+                "content": "ls output here",
+            },
+        ]
+        converted = _chat_messages_to_responses_input(transcript)
+        result_item = next(
+            item for item in converted if item.get("type") == "function_call_output"
+        )
+        payload = result_item["output"]
+        assert isinstance(payload, str)
+        assert payload == "ls output here"
+
+
+class TestPreflightAcceptsArrayOutput:
+    """Preflight handling of string and array ``function_call_output.output``."""
+
+    @staticmethod
+    def _first_call_output(raw_items):
+        """Run preflight on ``raw_items`` and return the first ``function_call_output``."""
+        normalized = _preflight_codex_input_items(raw_items)
+        return [it for it in normalized if it.get("type") == "function_call_output"][0]
+
+    def test_preflight_passes_array_through(self):
+        """A two-part array of known part types comes back with both parts intact."""
+        call = {
+            "type": "function_call",
+            "call_id": "call_abc",
+            "name": "vision_analyze",
+            "arguments": "{}",
+        }
+        call_output = {
+            "type": "function_call_output",
+            "call_id": "call_abc",
+            "output": [
+                {"type": "input_text", "text": "Image loaded."},
+                {"type": "input_image", "image_url": "data:image/png;base64,ABC"},
+            ],
+        }
+        parts = self._first_call_output([call, call_output])["output"]
+        assert isinstance(parts, list)
+        assert len(parts) == 2
+        image_part = parts[1]
+        assert image_part["type"] == "input_image"
+        assert image_part["image_url"] == "data:image/png;base64,ABC"
+
+    def test_preflight_drops_unknown_part_types(self):
+        """Parts with an unrecognized ``type`` are removed; valid parts keep their order."""
+        call = {
+            "type": "function_call",
+            "call_id": "call_abc", "name": "vision_analyze", "arguments": "{}",
+        }
+        call_output = {
+            "type": "function_call_output",
+            "call_id": "call_abc",
+            "output": [
+                {"type": "input_text", "text": "ok"},
+                {"type": "garbage", "data": "nope"},  # unknown — should be dropped
+                {"type": "input_image", "image_url": "data:image/png;base64,ZZ"},
+            ],
+        }
+        parts = self._first_call_output([call, call_output])["output"]
+        surviving_types = [part.get("type") for part in parts]
+        assert surviving_types == ["input_text", "input_image"]
+
+    def test_preflight_empty_array_becomes_empty_string(self):
+        """An array left with no valid parts is normalized to ``""``."""
+        # Defensive: an array with no valid parts shouldn't break the API call
+        call = {
+            "type": "function_call",
+            "call_id": "call_x", "name": "vision_analyze", "arguments": "{}",
+        }
+        call_output = {
+            "type": "function_call_output",
+            "call_id": "call_x",
+            "output": [{"type": "garbage"}],  # all dropped
+        }
+        assert self._first_call_output([call, call_output])["output"] == ""
+
+    def test_preflight_string_output_unchanged(self):
+        """String outputs pass through preflight untouched."""
+        call = {
+            "type": "function_call",
+            "call_id": "call_x", "name": "terminal", "arguments": "{}",
+        }
+        call_output = {
+            "type": "function_call_output",
+            "call_id": "call_x",
+            "output": "plain text output",
+        }
+        assert self._first_call_output([call, call_output])["output"] == "plain text output"
--- a/tests/tools/test_vision_native_fast_path.py
+++ b/tests/tools/test_vision_native_fast_path.py
@ -0,0 +1,207 @@
+"""Tests for the native-vision fast path inside vision_analyze.
+
+When the active main model supports native vision AND the provider supports
+image content inside tool-result messages, ``_handle_vision_analyze`` skips
+the auxiliary LLM and returns a multimodal envelope so the main model sees
+the pixels directly on its next turn.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from tools.vision_tools import (
+    _build_native_vision_tool_result,
+    _handle_vision_analyze,
+    _supports_media_in_tool_results,
+    _vision_analyze_native,
+)
+
+
+# Minimal valid 1x1 PNG bytes.
+_TINY_PNG = base64.b64decode(
+    b"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
+)
+
+
+# ─── _supports_media_in_tool_results ─────────────────────────────────────────
+
+
+class TestSupportsMediaInToolResults:
+    """Expected answers of ``_supports_media_in_tool_results`` per provider/model."""
+
+    def test_anthropic_native_yes(self):
+        supported = _supports_media_in_tool_results("anthropic", "claude-opus-4-6")
+        assert supported is True
+
+    def test_openrouter_yes(self):
+        supported = _supports_media_in_tool_results(
+            "openrouter", "anthropic/claude-opus-4.6"
+        )
+        assert supported is True
+
+    def test_nous_yes(self):
+        supported = _supports_media_in_tool_results(
+            "nous", "anthropic/claude-sonnet-4.6"
+        )
+        assert supported is True
+
+    def test_openai_chat_yes(self):
+        supported = _supports_media_in_tool_results("openai", "gpt-5.4")
+        assert supported is True
+
+    def test_openai_codex_yes(self):
+        supported = _supports_media_in_tool_results("openai-codex", "gpt-5-codex")
+        assert supported is True
+
+    def test_gemini_3_yes(self):
+        supported = _supports_media_in_tool_results("google", "gemini-3-flash-preview")
+        assert supported is True
+
+    def test_gemini_2_no(self):
+        supported = _supports_media_in_tool_results("google", "gemini-2.5-pro")
+        assert supported is False
+
+    def test_unknown_provider_conservative_no(self):
+        # A provider name the table does not know must be answered with False.
+        supported = _supports_media_in_tool_results("brand-new-provider", "any-model")
+        assert supported is False
+
+    def test_empty_provider_no(self):
+        # Both the empty string and None as provider must be answered with False.
+        empty_answer = _supports_media_in_tool_results("", "anything")
+        assert empty_answer is False
+        none_answer = _supports_media_in_tool_results(None, "anything")  # type: ignore[arg-type]
+        assert none_answer is False
+
+
+# ─── _build_native_vision_tool_result ────────────────────────────────────────
+
+
+class TestBuildNativeVisionToolResult:
+    """Shape of the envelope produced by ``_build_native_vision_tool_result``."""
+
+    def test_envelope_shape(self):
+        """Envelope is flagged multimodal and holds a text part then an image part."""
+        envelope = _build_native_vision_tool_result(
+            image_url="/tmp/foo.png",
+            question="what does it say?",
+            image_data_url="data:image/png;base64,XYZ",
+            image_size_bytes=1024,
+        )
+        assert envelope["_multimodal"] is True
+        parts = envelope["content"]
+        assert isinstance(parts, list)
+        assert len(parts) == 2
+        text_part, image_part = parts[0], parts[1]
+        assert text_part["type"] == "text"
+        assert image_part["type"] == "image_url"
+        assert image_part["image_url"]["url"] == "data:image/png;base64,XYZ"
+        assert "what does it say?" in text_part["text"]
+        assert "Image attached natively" in envelope["text_summary"]
+
+    def test_no_question_omits_question_section(self):
+        """With an empty question the text part has no ``Question:`` section."""
+        envelope = _build_native_vision_tool_result(
+            image_url="/tmp/foo.png",
+            question="",
+            image_data_url="data:image/png;base64,XYZ",
+            image_size_bytes=512,
+        )
+        body = envelope["content"][0]["text"]
+        assert "Question:" not in body
+        assert "Image loaded" in body
+
+
+# ─── _vision_analyze_native ──────────────────────────────────────────────────
+
+
+class TestVisionAnalyzeNative:
+    """Direct tests of ``_vision_analyze_native`` (no dispatcher gating).
+
+    Each test drives the coroutine with ``asyncio.run`` so it owns a fresh
+    event loop. ``asyncio.get_event_loop()`` is avoided because in a
+    synchronous test it relies on an ambient loop, which is deprecated and
+    breaks once an earlier test has cleared the current loop.
+    """
+
+    def test_local_file_returns_multimodal_envelope(self, tmp_path):
+        """A readable local PNG yields an envelope with text and a data-URL image."""
+        img = tmp_path / "test.png"
+        img.write_bytes(_TINY_PNG)
+        result = asyncio.run(_vision_analyze_native(str(img), "what is this?"))
+        assert isinstance(result, dict)
+        assert result.get("_multimodal") is True
+        parts = result["content"]
+        assert any(p.get("type") == "image_url" for p in parts)
+        assert any(p.get("type") == "text" for p in parts)
+        url = next(p["image_url"]["url"] for p in parts if p.get("type") == "image_url")
+        assert url.startswith("data:image/")
+
+    def test_missing_file_returns_error_string(self, tmp_path):
+        """A nonexistent path yields a JSON error string, not an envelope."""
+        result = asyncio.run(_vision_analyze_native(str(tmp_path / "nope.png"), "?"))
+        # tool_error returns a JSON string, not the multimodal envelope
+        assert isinstance(result, str)
+        parsed = json.loads(result)
+        assert parsed.get("success") is False
+        assert "Invalid image source" in parsed.get("error", "")
+
+    def test_empty_image_url_returns_error(self):
+        """An empty ``image_url`` is rejected with a JSON error string."""
+        result = asyncio.run(_vision_analyze_native("", "?"))
+        assert isinstance(result, str)
+        parsed = json.loads(result)
+        assert parsed.get("success") is False
+        assert "image_url is required" in parsed.get("error", "")
+
+    def test_file_url_scheme_resolves(self, tmp_path):
+        """A ``file://`` URL pointing at a local PNG is accepted."""
+        img = tmp_path / "t.png"
+        img.write_bytes(_TINY_PNG)
+        result = asyncio.run(_vision_analyze_native(f"file://{img}", "?"))
+        assert isinstance(result, dict)
+        assert result.get("_multimodal") is True
+
+
+# ─── _handle_vision_analyze fast-path gating ─────────────────────────────────
+
+
+class TestHandleVisionAnalyzeFastPath:
+    """Verify the dispatcher chooses fast-path vs aux-LLM correctly.
+
+    Each test sets the runtime main provider/model override, invokes
+    ``_handle_vision_analyze`` under ``asyncio.run`` (a fresh event loop per
+    call, so no ambient loop is required), and always clears the override in
+    a ``finally`` so state cannot leak into other tests.
+    """
+
+    def test_vision_capable_main_model_uses_fast_path(self, tmp_path):
+        """Main model supports native vision → fast path returns multimodal."""
+        img = tmp_path / "x.png"
+        img.write_bytes(_TINY_PNG)
+
+        # Set runtime override so the handler thinks we're on opus@openrouter
+        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
+        set_runtime_main("openrouter", "anthropic/claude-opus-4.6")
+        try:
+            result = asyncio.run(
+                _handle_vision_analyze({"image_url": str(img), "question": "?"})
+            )
+        finally:
+            clear_runtime_main()
+
+        assert isinstance(result, dict), \
+            f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
+        assert result.get("_multimodal") is True
+
+    def test_non_vision_main_model_falls_through_to_aux(self, tmp_path):
+        """Non-vision main model → fast path skipped, aux LLM path attempted."""
+        img = tmp_path / "x.png"
+        img.write_bytes(_TINY_PNG)
+
+        async def _aux_sentinel(*args, **kwargs):
+            return '{"sentinel": "aux-path"}'
+
+        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
+        set_runtime_main("openrouter", "qwen/qwen3-coder")
+        try:
+            with patch(
+                "tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel
+            ) as aux_mock:
+                result = asyncio.run(
+                    _handle_vision_analyze({"image_url": str(img), "question": "?"})
+                )
+        finally:
+            clear_runtime_main()
+
+        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
+            "Fast path fired for non-vision model; should have fallen through to aux LLM"
+        # Without this the test would pass on any non-envelope result (e.g. an
+        # unrelated error string); require that the aux entry point was invoked.
+        assert aux_mock.called, \
+            "Aux LLM path was not invoked for non-vision model"
+
+    def test_fast_path_disabled_for_unsupported_provider(self, tmp_path):
+        """Even with vision-capable model, unknown provider → fall through."""
+        img = tmp_path / "x.png"
+        img.write_bytes(_TINY_PNG)
+
+        async def _aux_sentinel(*args, **kwargs):
+            return '{"sentinel": "aux-path"}'
+
+        from agent.auxiliary_client import set_runtime_main, clear_runtime_main
+        set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6")
+        try:
+            with patch(
+                "tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel
+            ) as aux_mock:
+                result = asyncio.run(
+                    _handle_vision_analyze({"image_url": str(img), "question": "?"})
+                )
+        finally:
+            clear_runtime_main()
+
+        assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
+            "Fast path fired for unknown provider; should have fallen through"
+        # Same guard as above: the fall-through must actually reach the aux path.
+        assert aux_mock.called, \
+            "Aux LLM path was not invoked for unknown provider"