From 531efe7208f9c813bdd7c00ca09ec0425788769b Mon Sep 17 00:00:00 2001
From: xxxigm <tuancanhnguyen706@gmail.com>
Date: Tue, 12 May 2026 07:13:38 +0700
Subject: [PATCH] fix(computer_use): add helper to decide capture vision
 routing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add tools/computer_use/vision_routing.py with
should_route_capture_to_aux_vision(provider, model, cfg) — a small
policy helper that decides whether a captured screenshot should be
returned as a multimodal envelope (main model has native vision) or
pre-analysed through the auxiliary.vision pipeline so the main model
only sees text.

The decision mirrors agent.image_routing.decide_image_input_mode for
user-attached images, so the capture path and the user-turn path agree
on what counts as an explicit aux vision override:
  * provider/model/base_url under auxiliary.vision => explicit override
    => route through aux vision
  * provider+model accepts multimodal tool results AND main model
    reports supports_vision=True => keep multimodal envelope
  * everything else (no tool-result image support, non-vision model,
    metadata lookup failure) => fail closed and route through aux

No call sites are changed in this commit; the helper is added in
isolation so the routing decision can be unit-tested before it is
plumbed into _capture_response().
---
 tools/computer_use/vision_routing.py | 152 +++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 tools/computer_use/vision_routing.py

diff --git a/tools/computer_use/vision_routing.py b/tools/computer_use/vision_routing.py
new file mode 100644
index 00000000000..3b4be1e15a6
--- /dev/null
+++ b/tools/computer_use/vision_routing.py
@@ -0,0 +1,152 @@
+"""Vision-routing decisions for ``computer_use`` capture results.
+
+Background
+----------
+``computer_use(action='capture', mode='som'|'vision')`` returns a
+``_multimodal`` envelope containing the captured screenshot. That envelope
+is delivered back to the **active session model** as the tool result. When
+the active main model has no vision capability (e.g. text-only or
+text+code-only models), or when the active provider rejects multimodal
+content inside tool-result messages, the screenshot trips a 404 / 400 at
+the provider boundary and the agent loop reports a hard tool failure.
+
+Issue #24015 reports this regression for the ``cua-driver`` backend:
+configuring ``auxiliary.vision`` (a dedicated vision-capable model) in
+``config.yaml`` was silently ignored — the screenshot was still routed to
+the *main* model and failed with HTTP 404 ``No endpoints found that
+support image input`` even though a perfectly good vision backend was
+sitting in config waiting to be used.
+
+This module centralises the small policy decision: should a captured
+screenshot be returned as multimodal content (main model handles vision
+natively) or pre-analysed via the auxiliary vision pipeline so the main
+model only ever sees text?
+
+Behaviour (mirrors ``vision_analyze`` for consistency)
+------------------------------------------------------
+* If the user explicitly configured ``auxiliary.vision`` (any of
+  ``provider``, ``model``, or ``base_url`` non-empty / not ``"auto"``),
+  the screenshot is routed through the aux vision pipeline. Users who
+  pay for a dedicated vision model usually want it used.
+* Otherwise, if the active main model+provider can carry an image inside
+  a tool-result message AND the model reports ``supports_vision=True``
+  in models.dev metadata, return ``False`` (use the multimodal path).
+* In every other case (non-vision main model, provider that does not
+  accept multimodal tool results, lookup failure), route through aux
+  vision so the main model receives a text description it can act on.
+
+The decision intentionally fails *closed* (i.e. towards aux routing) when
+metadata is missing or ambiguous: returning a screenshot to a model that
+cannot read it is a hard tool failure, while routing it through aux costs
+one extra LLM call and yields a usable description.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
+    """Tell whether ``auxiliary.vision`` holds a non-default user setting.
+
+    Walks ``cfg["auxiliary"]["vision"]`` and answers False as soon as any
+    level is not a dict (a missing or falsy level is read as an empty
+    dict). An override is explicit when ``provider`` is anything other
+    than blank / ``"auto"`` (case-insensitive), or when ``model`` or
+    ``base_url`` is non-blank. Intended to mirror
+    ``agent.image_routing._explicit_aux_vision_override`` so the capture
+    path and the user-attached-image path agree on what "explicit" means.
+    """
+    node: Any = cfg
+    for key in ("auxiliary", "vision"):
+        if not isinstance(node, dict):
+            return False
+        node = node.get(key) or {}
+    if not isinstance(node, dict):
+        return False
+
+    # Normalise every field the same way: falsy -> "", then trim whitespace.
+    provider, model, base_url = (
+        str(node.get(field) or "").strip()
+        for field in ("provider", "model", "base_url")
+    )
+    return provider.lower() not in ("", "auto") or bool(model) or bool(base_url)
+
+
+def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]:
+    """Look up models.dev ``supports_vision``; None when it cannot be determined."""
+    if not (provider and model):
+        return None
+    try:
+        from agent.models_dev import get_model_capabilities
+        info = get_model_capabilities(provider, model)
+    except Exception as exc:  # pragma: no cover - defensive
+        # Import or lookup blew up: answer "unknown" rather than raising.
+        logger.debug(
+            "computer_use vision_routing: caps lookup failed for %s:%s — %s",
+            provider, model, exc,
+        )
+        return None
+    # A missing ``supports_vision`` attribute is reported as False, not None.
+    return None if info is None else bool(getattr(info, "supports_vision", False))
+
+
+def _provider_accepts_multimodal_tool_result(provider: str, model: str) -> Optional[bool]:
+    """Return whether *provider*+*model* carries images inside tool-result messages.
+
+    Reuses ``tools.vision_tools._supports_media_in_tool_results`` so the
+    capture-routing decision stays in lockstep with the ``vision_analyze``
+    native fast path. Returns None when *provider* is empty or when the
+    import or the lookup raises, so callers fall back to aux routing.
+    """
+    if not provider:
+        return None
+    try:
+        from tools.vision_tools import _supports_media_in_tool_results
+        return bool(_supports_media_in_tool_results(provider, model))
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.debug(
+            "computer_use vision_routing: tool-result support lookup failed: %s",
+            exc,
+        )
+        return None
+
+
+def should_route_capture_to_aux_vision(
+    provider: str,
+    model: str,
+    cfg: Optional[Dict[str, Any]],
+) -> bool:
+    """Decide whether a captured screenshot goes through aux vision first.
+
+    Checks run in order and the first match wins:
+
+    1. an explicit ``auxiliary.vision`` override in *cfg* => ``True``;
+    2. *provider*/*model* not confirmed to accept images inside tool-result
+       messages (``False`` or unknown) => ``True``;
+    3. the main model confirmed as vision-capable => ``False``;
+    4. anything else => ``True``.
+
+    Args:
+      provider: active inference provider id (e.g. ``"openrouter"``).
+      model:    active main model slug.
+      cfg:      loaded ``config.yaml`` dict (or None).
+
+    Returns:
+      ``True`` to pre-analyse the screenshot via the aux vision pipeline
+      and return text only; ``False`` to keep the multimodal envelope.
+    """
+    if _explicit_aux_vision_override(cfg):
+        return True
+    # Only a definite yes keeps the native path; None/False both fail closed.
+    if _provider_accepts_multimodal_tool_result(provider, model) is not True:
+        return True
+    return _lookup_supports_vision(provider, model) is not True
+
+
+__all__ = [  # public API: only the routing decision is exported
+    "should_route_capture_to_aux_vision",
+]