"""Vision-routing decisions for ``computer_use`` capture results. Background ---------- ``computer_use(action='capture', mode='som'|'vision')`` returns a ``_multimodal`` envelope containing the captured screenshot. That envelope is delivered back to the **active session model** as the tool result. When the active main model has no vision capability (e.g. text-only or text+code-only models), or when the active provider rejects multimodal content inside tool-result messages, the screenshot trips a 404 / 400 at the provider boundary and the agent loop reports a hard tool failure. Issue #24015 reports this regression for the ``cua-driver`` backend: configuring ``auxiliary.vision`` (a dedicated vision-capable model) in ``config.yaml`` was silently ignored — the screenshot was still routed at the *main* model and failed with HTTP 404 ``No endpoints found that support image input`` even though a perfectly good vision backend was sitting in config waiting to be used. This module centralises the small policy decision: should a captured screenshot be returned as multimodal content (main model handles vision natively) or pre-analysed via the auxiliary vision pipeline so the main model only ever sees text? Behaviour (mirrors ``vision_analyze`` for consistency) ------------------------------------------------------ * If the user explicitly configured ``auxiliary.vision`` (any of ``provider``, ``model``, or ``base_url`` non-empty / not ``"auto"``), the screenshot is routed through the aux vision pipeline. Users who pay for a dedicated vision model usually want it used. * Otherwise, if the active main model+provider can carry an image inside a tool-result message AND the model reports ``supports_vision=True`` in models.dev metadata, return ``False`` (use the multimodal path). * In every other case (non-vision main model, provider that does not accept multimodal tool results, lookup failure), route through aux vision so the main model receives a text description it can act on. The decision intentionally fails *closed* (i.e. towards aux routing) when metadata is missing or ambiguous: returning a screenshot to a model that cannot read it is a hard tool failure, while routing it through aux costs one extra LLM call and yields a usable description. """ from __future__ import annotations import logging from typing import Any, Dict, Optional logger = logging.getLogger(__name__) def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool: """True when ``auxiliary.vision`` carries a non-default user override. Mirrors ``agent.image_routing._explicit_aux_vision_override`` so the capture path and the user-attached-image path agree on what counts as an explicit user request for the aux vision pipeline. ``provider: "auto"``, blank values, or a missing block all count as *not* explicit. """ if not isinstance(cfg, dict): return False aux = cfg.get("auxiliary") or {} if not isinstance(aux, dict): return False vision = aux.get("vision") or {} if not isinstance(vision, dict): return False provider = str(vision.get("provider") or "").strip().lower() model = str(vision.get("model") or "").strip() base_url = str(vision.get("base_url") or "").strip() if provider in ("", "auto") and not model and not base_url: return False return True def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]: """Return models.dev ``supports_vision`` for *(provider, model)* or None.""" if not provider or not model: return None try: from agent.models_dev import get_model_capabilities caps = get_model_capabilities(provider, model) except Exception as exc: # pragma: no cover - defensive logger.debug( "computer_use vision_routing: caps lookup failed for %s:%s — %s", provider, model, exc, ) return None if caps is None: return None return bool(getattr(caps, "supports_vision", False)) def _provider_accepts_multimodal_tool_result(provider: str, model: str) -> Optional[bool]: """Return whether *provider*+*model* carries images inside tool-result messages. Reuses ``tools.vision_tools._supports_media_in_tool_results`` so the capture-routing decision stays in lockstep with the ``vision_analyze`` native fast path. Returns None on import failure so callers fall back to aux routing rather than guessing. """ if not provider: return None try: from tools.vision_tools import _supports_media_in_tool_results except Exception as exc: # pragma: no cover - defensive logger.debug( "computer_use vision_routing: tool-result support lookup failed: %s", exc, ) return None return bool(_supports_media_in_tool_results(provider, model)) def should_route_capture_to_aux_vision( provider: str, model: str, cfg: Optional[Dict[str, Any]], ) -> bool: """Return True iff the captured screenshot should be pre-analysed via aux vision. Args: provider: active inference provider id (e.g. ``"openrouter"``, ``"anthropic"``, ``"openai-codex"``). Lower-case canonical id. model: active main model slug as it would be sent to the provider. cfg: loaded ``config.yaml`` dict (or None). Returns: ``True`` when the caller should hand the screenshot to the aux vision pipeline (and surface a text-only tool result). ``False`` when the caller should keep the existing multimodal envelope (main model handles vision natively). """ if _explicit_aux_vision_override(cfg): return True accepts_tool_image = _provider_accepts_multimodal_tool_result(provider, model) if accepts_tool_image is None or accepts_tool_image is False: return True supports_vision = _lookup_supports_vision(provider, model) if supports_vision is True: return False return True __all__ = [ "should_route_capture_to_aux_vision", ]