fix(computer_use): honor custom vision routing

This commit is contained in:
helix4u 2026-06-03 21:03:31 -06:00 committed by Teknium
parent ffe665277c
commit 591e6fb8f4
6 changed files with 207 additions and 7 deletions

View file

@ -126,6 +126,45 @@ def _parse_elements_from_tree(markdown: str) -> List[UIElement]:
return elements
def _image_dimensions_from_bytes(raw: bytes) -> Tuple[int, int]:
"""Best-effort PNG/JPEG dimension sniffing without extra dependencies."""
if raw.startswith(b"\x89PNG\r\n\x1a\n") and len(raw) >= 24:
width = int.from_bytes(raw[16:20], "big")
height = int.from_bytes(raw[20:24], "big")
if width > 0 and height > 0:
return width, height
if raw.startswith(b"\xff\xd8"):
i = 2
n = len(raw)
while i + 9 < n:
if raw[i] != 0xFF:
i += 1
continue
marker = raw[i + 1]
i += 2
if marker in {0xD8, 0xD9} or 0xD0 <= marker <= 0xD7:
continue
if i + 2 > n:
break
segment_len = int.from_bytes(raw[i:i + 2], "big")
if segment_len < 2 or i + segment_len > n:
break
if marker in {
0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7,
0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF,
}:
if segment_len >= 7:
height = int.from_bytes(raw[i + 3:i + 5], "big")
width = int.from_bytes(raw[i + 5:i + 7], "big")
if width > 0 and height > 0:
return width, height
break
i += segment_len
return 0, 0
def _split_tree_text(full_text: str) -> Tuple[str, str]:
"""Split get_window_state text into (summary_line, tree_markdown)."""
lines = full_text.split("\n", 1)
@ -491,7 +530,12 @@ class CuaDriverBackend(ComputerUseBackend):
png_bytes_len = 0
if png_b64:
try:
png_bytes_len = len(base64.b64decode(png_b64, validate=False))
raw = base64.b64decode(png_b64, validate=False)
png_bytes_len = len(raw)
detected_width, detected_height = _image_dimensions_from_bytes(raw)
if detected_width and detected_height:
width = detected_width
height = detected_height
except Exception:
png_bytes_len = len(png_b64) * 3 // 4

View file

@ -615,6 +615,7 @@ def _route_capture_through_aux_vision(
# MIME sniffing returns the right content-type.
ext = ".jpg" if cap.png_b64[:8].startswith("/9j/") else ".png"
cache_dir = get_hermes_dir("cache/vision", "temp_vision_images")
cache_dir.mkdir(parents=True, exist_ok=True)
temp_image_path = cache_dir / f"computer_use_{_uuid.uuid4().hex}{ext}"
temp_image_path.write_bytes(raw)

View file

@ -28,6 +28,10 @@ Behaviour (mirrors ``vision_analyze`` for consistency)
``provider``, ``model``, or ``base_url`` non-empty / not ``"auto"``),
the screenshot is routed through the aux vision pipeline. Users who
pay for a dedicated vision model usually want it used.
* Otherwise, if the user explicitly declared the active model's vision
capability via ``model.supports_vision`` / provider model config, honor it:
a declared ``True`` returns ``False`` (use the multimodal path) and a
declared ``False`` returns ``True`` (route through aux vision). This is the
escape hatch for custom/local OpenAI-compatible VLM routes that are absent
from models.dev and provider allowlists.
* Otherwise, if the active main model+provider can carry an image inside
a tool-result message AND the model reports ``supports_vision=True``
in models.dev metadata, return ``False`` (use the multimodal path).
@ -76,10 +80,52 @@ def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
return True
def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]:
"""Return models.dev ``supports_vision`` for *(provider, model)* or None."""
def _lookup_user_declared_supports_vision(
    provider: str,
    model: str,
    cfg: Optional[Dict[str, Any]],
) -> Optional[bool]:
    """Look up the ``supports_vision`` value the user configured for this route.

    Delegates to ``agent.image_routing._supports_vision_override``. Both the
    import and the call are guarded: any failure is logged at debug level and
    reported as ``None`` so the caller can fall through to its other checks.

    Args:
        provider: Active provider identifier, passed through to the helper.
        model: Active model identifier, passed through to the helper.
        cfg: Loaded configuration mapping, or ``None``.

    Returns:
        Whatever the override helper returns, or ``None`` when the helper
        could not be imported or raised.
    """
    # Imported at call time so an unavailable module degrades to "no answer".
    try:
        from agent.image_routing import _supports_vision_override
    except Exception as import_error:  # pragma: no cover - defensive
        logger.debug(
            "computer_use vision_routing: config override lookup import failed: %s",
            import_error,
        )
        return None

    declared: Optional[bool] = None
    try:
        declared = _supports_vision_override(cfg, provider, model)
    except Exception as lookup_error:  # pragma: no cover - defensive
        logger.debug(
            "computer_use vision_routing: config override lookup failed: %s",
            lookup_error,
        )
    return declared
def _lookup_supports_vision(
provider: str,
model: str,
cfg: Optional[Dict[str, Any]] = None,
) -> Optional[bool]:
"""Return config/models.dev ``supports_vision`` for *(provider, model)*."""
if not provider or not model:
return None
try:
from agent.image_routing import _lookup_supports_vision as _lookup_image_supports
except Exception:
_lookup_image_supports = None
if _lookup_image_supports is not None:
try:
return _lookup_image_supports(provider, model, cfg)
except Exception as exc: # pragma: no cover - defensive
logger.debug(
"computer_use vision_routing: image-routing caps lookup failed "
"for %s:%s%s",
provider, model, exc,
)
return None
try:
from agent.models_dev import get_model_capabilities
caps = get_model_capabilities(provider, model)
@ -137,11 +183,17 @@ def should_route_capture_to_aux_vision(
if _explicit_aux_vision_override(cfg):
return True
user_declared = _lookup_user_declared_supports_vision(provider, model, cfg)
if user_declared is True:
return False
if user_declared is False:
return True
accepts_tool_image = _provider_accepts_multimodal_tool_result(provider, model)
if accepts_tool_image is None or accepts_tool_image is False:
return True
supports_vision = _lookup_supports_vision(provider, model)
supports_vision = _lookup_supports_vision(provider, model, cfg)
if supports_vision is True:
return False
return True