# Provenance
# ----------
# Module body of tools/computer_use/vision_routing.py as introduced by
#   commit  531efe7208f9c813bdd7c00ca09ec0425788769b
#   author  xxxigm
#   date    Tue, 12 May 2026 07:13:38 +0700
#   subject fix(computer_use): add helper to decide capture vision routing
#
# Commit message:
#   Add tools/computer_use/vision_routing.py with
#   should_route_capture_to_aux_vision(provider, model, cfg) — a small policy
#   helper that decides whether a captured screenshot should be returned as a
#   multimodal envelope (main model has native vision) or pre-analysed through
#   the auxiliary.vision pipeline so the main model only sees text.
#
#   The decision mirrors agent.image_routing.decide_image_input_mode for
#   user-attached images, so the capture path and the user-turn path agree on
#   what counts as an explicit aux vision override:
#
#   * provider/model/base_url under auxiliary.vision => explicit override =>
#     route through aux vision
#   * provider+model accepts multimodal tool results AND main model reports
#     supports_vision=True => keep multimodal envelope
#   * everything else (no tool-result image support, non-vision model,
#     metadata lookup failure) => fail closed and route through aux
#
#   No call sites are changed in this commit; the helper is added in isolation
#   so the routing decision can be unit-tested before it is plumbed into
#   _capture_response().
"""Vision-routing decisions for ``computer_use`` capture results.

Background
----------
``computer_use(action='capture', mode='som'|'vision')`` returns a
``_multimodal`` envelope containing the captured screenshot.  That envelope
is delivered back to the **active session model** as the tool result.  When
the active main model has no vision capability (e.g. text-only or
text+code-only models), or when the active provider rejects multimodal
content inside tool-result messages, the screenshot trips a 404 / 400 at
the provider boundary and the agent loop reports a hard tool failure.

Issue #24015 reports this regression for the ``cua-driver`` backend:
configuring ``auxiliary.vision`` (a dedicated vision-capable model) in
``config.yaml`` was silently ignored — the screenshot was still routed at
the *main* model and failed with HTTP 404 ``No endpoints found that
support image input`` even though a perfectly good vision backend was
sitting in config waiting to be used.

This module centralises the small policy decision: should a captured
screenshot be returned as multimodal content (main model handles vision
natively) or pre-analysed via the auxiliary vision pipeline so the main
model only ever sees text?

Behaviour (mirrors ``vision_analyze`` for consistency)
------------------------------------------------------
* If the user explicitly configured ``auxiliary.vision`` (any of
  ``provider``, ``model``, or ``base_url`` non-empty / not ``"auto"``),
  the screenshot is routed through the aux vision pipeline.  Users who
  pay for a dedicated vision model usually want it used.
* Otherwise, if the active main model+provider can carry an image inside
  a tool-result message AND the model reports ``supports_vision=True``
  in models.dev metadata, return ``False`` (use the multimodal path).
* In every other case (non-vision main model, provider that does not
  accept multimodal tool results, lookup failure), route through aux
  vision so the main model receives a text description it can act on.

The decision intentionally fails *closed* (i.e. towards aux routing) when
metadata is missing or ambiguous: returning a screenshot to a model that
cannot read it is a hard tool failure, while routing it through aux costs
one extra LLM call and yields a usable description.
"""

from __future__ import annotations

import logging
from typing import Any, Dict, Optional

logger = logging.getLogger(__name__)


def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
    """True when ``auxiliary.vision`` carries a non-default user override.

    Mirrors ``agent.image_routing._explicit_aux_vision_override`` so the
    capture path and the user-attached-image path agree on what counts as
    an explicit user request for the aux vision pipeline.  ``provider:
    "auto"``, blank values, or a missing block all count as *not*
    explicit.

    Args:
        cfg: loaded config mapping, or None.  Anything that is not a dict
            at the ``cfg`` / ``auxiliary`` / ``vision`` level is treated
            as "no override" rather than raising.
    """
    if not isinstance(cfg, dict):
        return False
    aux = cfg.get("auxiliary") or {}
    if not isinstance(aux, dict):
        return False
    vision = aux.get("vision") or {}
    if not isinstance(vision, dict):
        return False

    # ``or ""`` folds None / missing keys into the blank case before str().
    provider = str(vision.get("provider") or "").strip().lower()
    model = str(vision.get("model") or "").strip()
    base_url = str(vision.get("base_url") or "").strip()

    # Only the all-defaults combination counts as "not configured"; a model
    # or base_url on its own is already an explicit request.
    if provider in ("", "auto") and not model and not base_url:
        return False
    return True


def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]:
    """Return models.dev ``supports_vision`` for *(provider, model)* or None.

    None means "unknown": blank provider/model, the lookup raised, or it
    returned no capabilities record.
    """
    if not provider or not model:
        return None
    try:
        # Imported lazily so this module has no import-time dependency on it.
        from agent.models_dev import get_model_capabilities
        caps = get_model_capabilities(provider, model)
    except Exception as exc:  # pragma: no cover - defensive
        logger.debug(
            "computer_use vision_routing: caps lookup failed for %s:%s — %s",
            provider, model, exc,
        )
        return None
    if caps is None:
        return None
    return bool(getattr(caps, "supports_vision", False))


def _provider_accepts_multimodal_tool_result(provider: str, model: str) -> Optional[bool]:
    """Return whether *provider*+*model* carries images inside tool-result messages.

    Reuses ``tools.vision_tools._supports_media_in_tool_results`` so the
    capture-routing decision stays in lockstep with the
    ``vision_analyze`` native fast path.  Returns None when the provider is
    blank, the helper cannot be imported, or the helper itself raises, so
    callers fall back to aux routing rather than guessing.
    """
    if not provider:
        return None
    try:
        from tools.vision_tools import _supports_media_in_tool_results

        # The probe runs inside the ``try`` on purpose: an exception from it
        # must degrade to "unknown" (fail closed) instead of escaping into
        # the capture path as a hard tool failure.
        return bool(_supports_media_in_tool_results(provider, model))
    except Exception as exc:  # pragma: no cover - defensive
        logger.debug(
            "computer_use vision_routing: tool-result support lookup failed: %s",
            exc,
        )
        return None


def should_route_capture_to_aux_vision(
    provider: str,
    model: str,
    cfg: Optional[Dict[str, Any]],
) -> bool:
    """Return True iff the captured screenshot should be pre-analysed via aux vision.

    Args:
        provider: active inference provider id (e.g. ``"openrouter"``,
            ``"anthropic"``, ``"openai-codex"``).  Lower-case canonical id.
        model: active main model slug as it would be sent to the provider.
        cfg: loaded ``config.yaml`` dict (or None).

    Returns:
        ``True`` when the caller should hand the screenshot to the aux vision
        pipeline (and surface a text-only tool result).  ``False`` when the
        caller should keep the existing multimodal envelope (main model
        handles vision natively).
    """
    # 1. An explicit auxiliary.vision configuration always wins.
    if _explicit_aux_vision_override(cfg):
        return True

    # 2. Unknown (None) and "no" (False) both mean the image cannot safely
    #    travel inside a tool result, so fail closed towards aux routing.
    if not _provider_accepts_multimodal_tool_result(provider, model):
        return True

    # 3. Keep the multimodal envelope only on a positive vision signal;
    #    False and None (unknown) both route through aux.
    return _lookup_supports_vision(provider, model) is not True


__all__ = [
    "should_route_capture_to_aux_vision",
]