feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)

When the active main model has native vision and the provider supports multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini 3, OpenRouter, Nous), vision_analyze loads the image bytes and returns them to the model as a multimodal tool-result envelope. The model then sees the pixels directly on its next turn instead of receiving a lossy text description from an auxiliary LLM. Falls back to the legacy aux-LLM text path for non-vision models and unverified providers.

Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and Cline. All four converge on the same pattern: tool results carry image content blocks for vision-capable provider/model combinations.

Changes
- tools/vision_tools.py: _vision_analyze_native fast path + provider capability table (_supports_media_in_tool_results). Schema description updated to reflect new behaviour.
- agent/codex_responses_adapter.py: function_call_output.output now accepts the array form for multimodal tool results (was string-only). Preflight validates input_text/input_image parts.
- agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so tools see the live CLI/gateway override, not the stale config.yaml default. set_runtime_main()/clear_runtime_main() helpers.
- run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn start so vision_analyze's fast-path check sees the actual runtime.
- tests/conftest.py: clear runtime-main override between tests.

Tests
- tests/tools/test_vision_native_fast_path.py: provider capability table, envelope shape, fast-path gating (vision-capable model uses fast path; non-vision model falls through to aux).
- tests/run_agent/test_codex_multimodal_tool_result.py: list tool content becomes function_call_output.output array; preflight preserves arrays and drops unknown part types.
Live verified
- Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a typed filepath, gets pixels back, reads exact text from images that no aux description could capture (font color irony, multi-line fruit-count list, etc.).

PR replaces the closed prior efforts (#16506 shipped the inbound user-attached path; this PR closes the gap for tool-discovered images).
2026-05-18 04:41:56 +00:00 · 2026-05-09 21:06:19 -07:00 · 2026-05-09 21:06:19 -07:00 · 3800972dd0
commit 3800972dd0
parent e62250453b
7 changed files with 757 additions and 10 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -1463,7 +1463,16 @@ def _read_main_model() -> str:

    config.yaml model.default is the single source of truth for the active
    model. Environment variables are no longer consulted.
+
+    Runtime override: when an AIAgent is active with a CLI/gateway-provided
+    model that differs from config.yaml, ``set_runtime_main()`` records the
+    override in a process-local global. This is consulted FIRST so tools
+    that gate on "the active main model" (e.g. ``vision_analyze``'s native
+    fast path) see the live runtime, not the persisted config default.
    """
+    override = _RUNTIME_MAIN_MODEL
+    if isinstance(override, str) and override.strip():
+        return override.strip()
    try:
        from hermes_cli.config import load_config
        cfg = load_config()
@ -1484,7 +1493,13 @@ def _read_main_provider() -> str:

    Returns the lowercase provider id (e.g. "alibaba", "openrouter") or ""
    if not configured.
+
+    Runtime override: see ``_read_main_model`` — same mechanism for the
+    provider half of the runtime tuple.
    """
+    override = _RUNTIME_MAIN_PROVIDER
+    if isinstance(override, str) and override.strip():
+        return override.strip().lower()
    try:
        from hermes_cli.config import load_config
        cfg = load_config()
@ -1498,6 +1513,32 @@ def _read_main_provider() -> str:
    return ""


+# Process-local override written by ``set_runtime_main()`` (AIAgent, at
+# session/turn start) and reset by ``clear_runtime_main()``. Single-threaded
+_RUNTIME_MAIN_PROVIDER: str = ""  # per turn — no lock needed; empty = no override
+_RUNTIME_MAIN_MODEL: str = ""  # read first by _read_main_provider/_read_main_model
+
+
+def set_runtime_main(provider: str, model: str) -> None:
+    """Record the live runtime provider/model for the current AIAgent.
+
+    Called by ``run_agent.AIAgent._sync_runtime_main_for_aux_routing`` (or
+    equivalent setter) at the top of each turn so ``_read_main_provider`` /
+    ``_read_main_model`` reflect CLI/gateway overrides, not the stale
+    config.yaml default. Both values are stripped (provider also lowercased);
+    a falsy argument is stored as the empty string, i.e. no override.
+    """
+    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
+    _RUNTIME_MAIN_PROVIDER = (provider or "").strip().lower()
+    _RUNTIME_MAIN_MODEL = (model or "").strip()
+
+
+def clear_runtime_main() -> None:
+    """Reset both runtime override globals to empty (e.g. on session end)."""
+    global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
+    _RUNTIME_MAIN_PROVIDER = ""
+    _RUNTIME_MAIN_MODEL = ""
+
+
 def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Resolve the active custom/main endpoint the same way the main CLI does.

--- a/agent/codex_responses_adapter.py
+++ b/agent/codex_responses_adapter.py
@ -410,10 +410,29 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
                    call_id = raw_tool_call_id.strip()
            if not isinstance(call_id, str) or not call_id.strip():
                continue
+
+            # Multimodal tool result: convert OpenAI-style content list into
+            # Responses ``function_call_output.output`` array. The Responses
+            # API accepts ``output`` as either a string or an array of
+            # ``input_text``/``input_image`` items. See
+            # https://developers.openai.com/api/reference/python/resources/responses/.
+            tool_content = msg.get("content")
+            output_value: Any
+            if isinstance(tool_content, list):
+                converted = _chat_content_to_responses_parts(
+                    tool_content, role="user",
+                )
+                if converted:
+                    output_value = converted
+                else:
+                    output_value = ""
+            else:
+                output_value = str(tool_content or "")
+
            items.append({
                "type": "function_call_output",
                "call_id": call_id,
-                "output": str(msg.get("content", "") or ""),
+                "output": output_value,
            })

    return items
@ -466,6 +485,38 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
            output = item.get("output", "")
            if output is None:
                output = ""
+            # Output may be a string OR an array of structured content
+            # items (input_text / input_image) for multimodal tool results.
+            # Both shapes are accepted by the Responses API. We preserve
+            # the array form when present.
+            if isinstance(output, list):
+                # Validate each item is a recognised content shape; drop
+                # anything else to avoid 4xx from the API.
+                cleaned: List[Dict[str, Any]] = []
+                for part in output:
+                    if not isinstance(part, dict):
+                        continue
+                    ptype = part.get("type")
+                    if ptype == "input_text":
+                        text = part.get("text")
+                        if isinstance(text, str) and text:
+                            cleaned.append({"type": "input_text", "text": text})
+                    elif ptype == "input_image":
+                        url = part.get("image_url")
+                        if isinstance(url, str) and url:
+                            entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
+                            detail = part.get("detail")
+                            if isinstance(detail, str) and detail.strip():
+                                entry["detail"] = detail.strip()
+                            cleaned.append(entry)
+                normalized.append(
+                    {
+                        "type": "function_call_output",
+                        "call_id": call_id.strip(),
+                        "output": cleaned if cleaned else "",
+                    }
+                )
+                continue
            if not isinstance(output, str):
                output = str(output)