From 5250335863eea92b589066a4ba1a1a57acc3f7b7 Mon Sep 17 00:00:00 2001
From: jeeves-assistant <jeevesassistant00@gmail.com>
Date: Mon, 22 Jun 2026 12:19:54 -0700
Subject: [PATCH 1/2] fix(computer-use): route CuaDriver vision capture via
 get_window_state

cua-driver 0.6.x removed the standalone screenshot MCP tool, so
capture(mode='vision') failed with 'Unknown tool: screenshot' and
returned a 0x0 image with no PNG, while som/ax (which already use
get_window_state) kept working. Route vision through
get_window_state(capture_mode='vision') instead.

Salvaged from PR #50771; same fix submitted earlier as #39262 by
@Tranquil-Flow.
---
 scripts/release.py                |  1 +
 tests/tools/test_computer_use.py  | 44 +++++++++++++++++++++++++++++++
 tools/computer_use/cua_backend.py | 11 +++++---
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/scripts/release.py b/scripts/release.py
index 7cea21ce9b6..d60400e1883 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
 
 # Auto-extracted from noreply emails + manual overrides
 AUTHOR_MAP = {
+    "jeevesassistant00@gmail.com": "jeeves-assistant",  # PR #50771 (computer-use CuaDriver vision capture routing)
     "21178861+ScotterMonk@users.noreply.github.com": "ScotterMonk",  # PR #50145 salvage (cron output truncation: adapter-aware chunking, #50126)
     "rrandqua@gmail.com": "TutkuEroglu",  # PR #50481 salvage (AGENTS.md stale token-lock adapter path)
     "f@trycua.com": "f-trycua",  # PR #50507 salvage (cross-platform computer_use; supersedes #44221/#30660)
diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py
index c75d87c8513..b22f918154d 100644
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@@ -2139,6 +2139,50 @@ class TestStructuredElementsConsumption:
         # Markdown surface doesn't carry bounds — lossy by design.
         assert cap.elements[0].bounds == (0, 0, 0, 0)
 
+    def test_vision_capture_uses_get_window_state_not_removed_screenshot_tool(self):
+        """cua-driver 0.6.x returns vision screenshots from
+        get_window_state(capture_mode="vision"); the old standalone
+        screenshot tool is no longer available."""
+        from tools.computer_use.cua_backend import CuaDriverBackend
+
+        backend = CuaDriverBackend()
+        backend._session = MagicMock()
+
+        windows_payload = {
+            "windows": [{
+                "app_name": "Demo", "pid": 9, "window_id": 1,
+                "is_on_screen": True, "title": "Demo", "z_index": 0,
+            }],
+        }
+        png_b64 = (
+            "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m"
+            "NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
+        )
+
+        def fake_call_tool(name, args):
+            if name == "list_windows":
+                return {"data": "", "images": [], "image_mime_types": [],
+                        "structuredContent": windows_payload, "isError": False}
+            if name == "get_window_state":
+                assert args["capture_mode"] == "vision"
+                return {"data": "", "images": [png_b64],
+                        "image_mime_types": ["image/png"],
+                        "structuredContent": None, "isError": False}
+            if name == "screenshot":
+                raise AssertionError("vision capture must not call removed screenshot tool")
+            return {"data": "", "images": [], "image_mime_types": [],
+                    "structuredContent": None, "isError": False}
+
+        backend._session.call_tool.side_effect = fake_call_tool
+        cap = backend.capture(mode="vision")
+
+        tool_names = [call.args[0] for call in backend._session.call_tool.call_args_list]
+        assert tool_names == ["list_windows", "get_window_state"]
+        assert cap.png_b64 == png_b64
+        assert cap.image_mime_type == "image/png"
+        assert cap.width == 1
+        assert cap.height == 1
+
 
 class TestCapabilityDiscovery:
     """Surface 4 (NousResearch/hermes-agent#47072): the wrapper learns
diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py
index b46785d2e95..af0bb9fc392 100644
--- a/tools/computer_use/cua_backend.py
+++ b/tools/computer_use/cua_backend.py
@@ -1003,13 +1003,16 @@ class CuaDriverBackend(ComputerUseBackend):
         window_title = ""
 
         if mode == "vision":
-            # screenshot tool: just the PNG, no AX walk.
+            # cua-driver 0.6.x removed the standalone `screenshot` MCP tool.
+            # Request a screenshot-only capture via get_window_state with
+            # capture_mode="vision" instead; this keeps vision mode working
+            # while avoiding the AX walk used by som/ax captures.
             sc_out = self._session.call_tool(
-                "screenshot",
+                "get_window_state",
                 {
+                    "pid": self._active_pid,
                     "window_id": self._active_window_id,
-                    "format": "jpeg",
-                    "quality": 85,
+                    "capture_mode": "vision",
                     "session": self._session_id,
                 },
             )

From 30e5d0092dacc35fb0a09d537077e93f495bb90a Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 22 Jun 2026 12:21:48 -0700
Subject: [PATCH 2/2] feat(computer-use): add whole-screen/desktop capture
 target
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

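capture(app='screen'|'desktop') now resolves to the OS shell/desktop
window (Windows Progman/WorkerW desktop or Shell_TrayWnd taskbar, macOS
Finder/Dock) so 'show me my screen' and 'click the taskbar' work; the
aliases 'fullscreen', 'full screen' and 'all' are accepted too. Shell
windows are found by a case-insensitive substring match on
app_name/title, and the desktop backdrop is preferred over the taskbar
when both exist. Known caveat: the substring match also hits lookalike
windows (e.g. 'dock' and 'desktop' both match 'Docker Desktop').
Previously capture() only matched application windows, and the schema
advertised 'or the whole screen' without any code path delivering it.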

cua-driver is window-oriented (no virtual-desktop or per-monitor MCP
tool), so a single image still cannot span multiple monitors — the
schema now states this and the no-desktop-window path returns a clear
message instead of silently grabbing the frontmost app.
---
 tests/tools/test_computer_use.py  | 68 +++++++++++++++++++++++++++++++
 tools/computer_use/cua_backend.py | 61 ++++++++++++++++++++++++++-
 tools/computer_use/schema.py      | 11 +++--
 3 files changed, 136 insertions(+), 4 deletions(-)

diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py
index b22f918154d..673ad8a29c1 100644
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@@ -2183,6 +2183,74 @@ class TestStructuredElementsConsumption:
         assert cap.width == 1
         assert cap.height == 1
 
+    def test_capture_app_screen_targets_desktop_window(self):
+        """capture(app='screen') resolves to the OS shell/desktop window
+        (Windows Progman) rather than an application window, so 'show me my
+        screen' works on cua-driver's window-oriented capture surface."""
+        from tools.computer_use.cua_backend import CuaDriverBackend
+
+        backend = CuaDriverBackend()
+        backend._session = MagicMock()
+
+        windows_payload = {
+            "windows": [
+                {"app_name": "Code", "pid": 11, "window_id": 1,
+                 "is_on_screen": True, "title": "editor", "z_index": 0},
+                {"app_name": "Progman", "pid": 4, "window_id": 99,
+                 "is_on_screen": True, "title": "Program Manager", "z_index": 5},
+                {"app_name": "Shell_TrayWnd", "pid": 4, "window_id": 50,
+                 "is_on_screen": True, "title": "Taskbar", "z_index": 4},
+            ],
+        }
+
+        def fake_call_tool(name, args):
+            if name == "list_windows":
+                return {"data": "", "images": [], "image_mime_types": [],
+                        "structuredContent": windows_payload, "isError": False}
+            if name == "get_window_state":
+                # Should be invoked against the desktop backdrop, not Code.
+                assert args["window_id"] == 99
+                return {"data": "✅ Desktop — 0 elements", "images": [],
+                        "image_mime_types": [], "structuredContent": None,
+                        "isError": False}
+            return {"data": "", "images": [], "image_mime_types": [],
+                    "structuredContent": None, "isError": False}
+
+        backend._session.call_tool.side_effect = fake_call_tool
+        cap = backend.capture(mode="ax", app="screen")
+
+        assert backend._active_window_id == 99
+        assert cap.app == "Progman"
+
+    def test_capture_app_screen_no_desktop_window_surfaces_limitation(self):
+        """When no desktop/shell window is present, capture(app='screen')
+        returns a clear message about cua-driver's per-window capture limit
+        instead of silently grabbing the frontmost app."""
+        from tools.computer_use.cua_backend import CuaDriverBackend
+
+        backend = CuaDriverBackend()
+        backend._session = MagicMock()
+
+        windows_payload = {
+            "windows": [
+                {"app_name": "Code", "pid": 11, "window_id": 1,
+                 "is_on_screen": True, "title": "editor", "z_index": 0},
+            ],
+        }
+
+        def fake_call_tool(name, args):
+            if name == "list_windows":
+                return {"data": "", "images": [], "image_mime_types": [],
+                        "structuredContent": windows_payload, "isError": False}
+            raise AssertionError(f"unexpected tool {name} — should short-circuit")
+
+        backend._session.call_tool.side_effect = fake_call_tool
+        cap = backend.capture(mode="vision", app="desktop")
+
+        assert cap.width == 0 and cap.height == 0
+        assert cap.png_b64 is None
+        assert "captures one window at a time" in cap.window_title
+
 
 class TestCapabilityDiscovery:
     """Surface 4 (NousResearch/hermes-agent#47072): the wrapper learns
diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py
index af0bb9fc392..fbf9ff07b2c 100644
--- a/tools/computer_use/cua_backend.py
+++ b/tools/computer_use/cua_backend.py
@@ -78,6 +78,29 @@ _CUA_DRIVER_ARGS = ["mcp"]  # stdio MCP transport (fallback when the
                             # driver doesn't expose `manifest` — see
                             # `_resolve_mcp_invocation` below)
 
+# Whole-screen / desktop capture. cua-driver is a window-oriented driver —
+# its `get_window_state` tool (like the standalone `screenshot` tool that
+# cua-driver 0.6.x removed) captures a single window by pid + window_id, and
+# no MCP tool captures the entire virtual desktop or an arbitrary monitor as
+# one image. But the OS shell surfaces themselves (the desktop backdrop and
+# the taskbar/menu-bar) are real windows that show up in `list_windows`, so
+# "show me my screen" / "click the taskbar" is reachable by targeting those
+# windows. When `app` is one of these sentinels, capture() resolves to the
+# desktop/shell window instead of an application window.
+_SCREEN_CAPTURE_SENTINELS = {"screen", "desktop", "fullscreen", "full screen", "all"}
+
+# Known shell/desktop window identifiers across platforms. Matched
+# case-insensitively as a substring against both the window's app_name and
+# its title (cua-driver surfaces the Win32 class name / app name here).
+#   Windows: Progman / WorkerW back the desktop; Shell_TrayWnd is the taskbar.
+#   macOS:   Finder owns the desktop; the menu bar / Dock are the shell.
+_DESKTOP_WINDOW_NAMES = (
+    "progman", "workerw", "program manager",  # Windows desktop
+    "shell_traywnd", "taskbar",               # Windows taskbar
+    "finder", "desktop", "dock",              # macOS desktop / shell
+)
+
+
 # Env var cua-driver reads to gate its anonymous usage telemetry (PostHog).
 # Setting it to "0" disables telemetry; absence => the binary's own default
 # (telemetry ON upstream).
@@ -968,7 +991,43 @@ class CuaDriverBackend(ComputerUseBackend):
         # returned by list_windows is the localized name (e.g. "計算機"), so
         # `app="Calculator"` legitimately matches no windows on a non-English
         # system and the caller needs to retry with the localized name.
-        if app:
+        if app and app.strip().lower() in _SCREEN_CAPTURE_SENTINELS:
+            # Whole-screen / desktop request. cua-driver has no virtual-desktop
+            # capture tool, so resolve to the OS shell/desktop window (the
+            # desktop backdrop or the taskbar/menu-bar), which list_windows
+            # does surface. This makes "show me my screen" and "click the
+            # taskbar" work; a single image still can't span multiple monitors
+            # — that's a driver limitation, not a wrapper one.
+            def _is_desktop_window(w: Dict[str, Any]) -> bool:
+                haystack = f"{w.get('app_name', '')} {w.get('title', '')}".lower()
+                return any(name in haystack for name in _DESKTOP_WINDOW_NAMES)
+
+            desktop = [w for w in windows if _is_desktop_window(w)]
+            if not desktop:
+                return CaptureResult(
+                    mode=mode, width=0, height=0, png_b64=None,
+                    elements=[], app="",
+                    window_title=(
+                        f"<no desktop/shell window found for app={app!r}; "
+                        f"cua-driver captures one window at a time and exposes "
+                        f"no whole-virtual-desktop or per-monitor capture. "
+                        f"Call list_apps / capture(app='<AppName>') to target a "
+                        f"specific window instead. On Windows the taskbar is "
+                        f"'Shell_TrayWnd' and the desktop is 'Progman'.>"
+                    ),
+                    png_bytes_len=0,
+                )
+            # Prefer the desktop backdrop (Progman/WorkerW/Finder) over the
+            # taskbar when both are present, so a bare "screen" capture shows
+            # the full desktop rather than just the task strip.
+            windows = sorted(
+                desktop,
+                key=lambda w: 0 if any(
+                    n in f"{w.get('app_name', '')} {w.get('title', '')}".lower()
+                    for n in ("progman", "workerw", "program manager", "finder", "desktop")
+                ) else 1,
+            )
+        elif app:
             app_lower = app.lower()
             filtered = [w for w in windows if app_lower in w["app_name"].lower()]
             if not filtered:
diff --git a/tools/computer_use/schema.py b/tools/computer_use/schema.py
index 5bb855ccc0f..a3394d23276 100644
--- a/tools/computer_use/schema.py
+++ b/tools/computer_use/schema.py
@@ -71,9 +71,14 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
                 "type": "string",
                 "description": (
                     "Optional. Limit capture/action to a specific app "
-                    "(by name, e.g. 'Safari' or 'Notepad', or bundle ID "
-                    "where the platform supports it). If omitted, operates "
-                    "on the frontmost app's window or the whole screen."
+                    "(by name, e.g. 'Safari', or bundle ID, "
+                    "'com.apple.Safari'). If omitted, operates on the "
+                    "frontmost app's window. Pass app='screen' (or "
+                    "'desktop') to capture the OS desktop/shell surface — "
+                    "e.g. to see the wallpaper or click the taskbar. Note: "
+                    "capture is per-window; a single image cannot span "
+                    "multiple monitors, so on a multi-screen setup capture "
+                    "one window or display at a time."
                 ),
             },
             "max_elements": {