fix(computer_use): honor custom vision routing

2026-07-29 18:46:59 +00:00 · 2026-06-03 21:03:31 -06:00 · 2026-06-03 21:03:31 -06:00 · 591e6fb8f4
commit 591e6fb8f4
parent ffe665277c
6 changed files with 207 additions and 7 deletions
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@ -2,6 +2,7 @@

 from __future__ import annotations

+import base64
 import json
 import os
 import sys
@ -360,7 +361,9 @@ class TestCaptureResponse:
            def focus_app(self, app, raise_window=False): ...

        cu_tool.reset_backend_for_tests()
-        with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()):
+        with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()), \
+             patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=False):
            out = cu_tool.handle_computer_use({"action": "capture", "mode": "vision"})

        assert isinstance(out, dict)
@ -398,7 +401,9 @@ class TestCaptureResponse:
            def focus_app(self, app, raise_window=False): ...

        cu_tool.reset_backend_for_tests()
-        with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()):
+        with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()), \
+             patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=False):
            out = cu_tool.handle_computer_use({"action": "capture", "mode": "som"})
        assert isinstance(out, dict)
        text_part = next(p for p in out["content"] if p.get("type") == "text")
@ -436,6 +441,7 @@ class TestCaptureResponse:

        return FakeBackend()

+
    def test_capture_ax_caps_elements_at_default_for_dense_trees(self):
        """Regression for #22865: an Electron-style 600-element AX tree must
        not emit the entire array verbatim into the tool result.
@ -582,7 +588,9 @@ class TestCaptureResponse:
            def focus_app(self, app, raise_window=False): ...

        cu_tool.reset_backend_for_tests()
-        with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()):
+        with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()), \
+             patch.object(cu_tool, "_should_route_through_aux_vision",
+                          return_value=False):
            out = cu_tool.handle_computer_use({"action": "capture", "mode": "som"})

        assert isinstance(out, dict) and out["_multimodal"] is True
@ -594,6 +602,32 @@ class TestCaptureResponse:
        assert "truncated to" not in out["text_summary"]


+class TestCuaCaptureImageDimensions:
+    def test_png_dimensions_are_sniffed_from_image_bytes(self):
+        from tools.computer_use.cua_backend import _image_dimensions_from_bytes
+
+        raw_png = base64.b64decode(
+            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m"
+            "NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=",
+            validate=False,
+        )
+        assert _image_dimensions_from_bytes(raw_png) == (1, 1)
+
+    def test_jpeg_dimensions_are_sniffed_from_sof_segment(self):
+        from tools.computer_use.cua_backend import _image_dimensions_from_bytes
+
+        raw_jpeg = (
+            b"\xff\xd8" +
+            b"\xff\xe0\x00\x10" + (b"0" * 14)
+            + b"\xff\xc0\x00\x11\x08"
+            + b"\x01\x2c"  # height: 300
+            + b"\x01\x90"  # width: 400
+            + b"\x03\x01\x11\x00\x02\x11\x00\x03\x11\x00"
+            + b"\xff\xd9"
+        )
+        assert _image_dimensions_from_bytes(raw_jpeg) == (400, 300)
+
+
 # ---------------------------------------------------------------------------
 # Anthropic adapter: multimodal tool-result conversion
 # ---------------------------------------------------------------------------