From 5250335863eea92b589066a4ba1a1a57acc3f7b7 Mon Sep 17 00:00:00 2001 From: jeeves-assistant Date: Mon, 22 Jun 2026 12:19:54 -0700 Subject: [PATCH 1/2] fix(computer-use): route CuaDriver vision capture via get_window_state cua-driver 0.6.x removed the standalone screenshot MCP tool, so capture(mode='vision') hit 'Unknown tool: screenshot' and returned a 0x0 image with no PNG while som/ax (which use get_window_state) still worked. Route vision through get_window_state(capture_mode='vision'). Salvaged from PR #50771; same fix submitted earlier as #39262 by @Tranquil-Flow. --- scripts/release.py | 1 + tests/tools/test_computer_use.py | 44 +++++++++++++++++++++++++++++++ tools/computer_use/cua_backend.py | 11 +++++--- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/scripts/release.py b/scripts/release.py index 7cea21ce9b6..d60400e1883 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json" # Auto-extracted from noreply emails + manual overrides AUTHOR_MAP = { + "jeevesassistant00@gmail.com": "jeeves-assistant", # PR #50771 (computer-use CuaDriver vision capture routing) "21178861+ScotterMonk@users.noreply.github.com": "ScotterMonk", # PR #50145 salvage (cron output truncation: adapter-aware chunking, #50126) "rrandqua@gmail.com": "TutkuEroglu", # PR #50481 salvage (AGENTS.md stale token-lock adapter path) "f@trycua.com": "f-trycua", # PR #50507 salvage (cross-platform computer_use; supersedes #44221/#30660) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index c75d87c8513..b22f918154d 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -2139,6 +2139,50 @@ class TestStructuredElementsConsumption: # Markdown surface doesn't carry bounds — lossy by design. 
assert cap.elements[0].bounds == (0, 0, 0, 0) + def test_vision_capture_uses_get_window_state_not_removed_screenshot_tool(self): + """cua-driver 0.6.x returns vision screenshots from + get_window_state(capture_mode="vision"); the old standalone + screenshot tool is no longer available.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + + windows_payload = { + "windows": [{ + "app_name": "Demo", "pid": 9, "window_id": 1, + "is_on_screen": True, "title": "Demo", "z_index": 0, + }], + } + png_b64 = ( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m" + "NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" + ) + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + if name == "get_window_state": + assert args["capture_mode"] == "vision" + return {"data": "", "images": [png_b64], + "image_mime_types": ["image/png"], + "structuredContent": None, "isError": False} + if name == "screenshot": + raise AssertionError("vision capture must not call removed screenshot tool") + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": None, "isError": False} + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="vision") + + tool_names = [call.args[0] for call in backend._session.call_tool.call_args_list] + assert tool_names == ["list_windows", "get_window_state"] + assert cap.png_b64 == png_b64 + assert cap.image_mime_type == "image/png" + assert cap.width == 1 + assert cap.height == 1 + class TestCapabilityDiscovery: """Surface 4 (NousResearch/hermes-agent#47072): the wrapper learns diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index b46785d2e95..af0bb9fc392 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -1003,13 +1003,16 @@ class 
CuaDriverBackend(ComputerUseBackend): window_title = "" if mode == "vision": - # screenshot tool: just the PNG, no AX walk. + # Newer cua-driver releases no longer expose a standalone + # `screenshot` MCP tool. Request a screenshot-only capture via + # get_window_state instead; this keeps vision mode working while + # avoiding the AX walk used by som/ax captures. sc_out = self._session.call_tool( - "screenshot", + "get_window_state", { + "pid": self._active_pid, "window_id": self._active_window_id, - "format": "jpeg", - "quality": 85, + "capture_mode": "vision", "session": self._session_id, }, ) From 30e5d0092dacc35fb0a09d537077e93f495bb90a Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:21:48 -0700 Subject: [PATCH 2/2] feat(computer-use): add whole-screen/desktop capture target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit capture(app='screen'|'desktop') now resolves to the OS shell/desktop window (Windows Progman/WorkerW desktop or Shell_TrayWnd taskbar, macOS Finder/Dock) so 'show me my screen' and 'click the taskbar' work. Previously capture() only matched application windows, and the schema advertised 'or the whole screen' without any code path delivering it. cua-driver is window-oriented (no virtual-desktop or per-monitor MCP tool), so a single image still cannot span multiple monitors — the schema now states this and the no-desktop-window path returns a clear message instead of silently grabbing the frontmost app. 
--- tests/tools/test_computer_use.py | 68 +++++++++++++++++++++++++++++++ tools/computer_use/cua_backend.py | 61 ++++++++++++++++++++++++++- tools/computer_use/schema.py | 11 +++-- 3 files changed, 136 insertions(+), 4 deletions(-) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index b22f918154d..673ad8a29c1 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -2183,6 +2183,74 @@ class TestStructuredElementsConsumption: assert cap.width == 1 assert cap.height == 1 + def test_capture_app_screen_targets_desktop_window(self): + """capture(app='screen') resolves to the OS shell/desktop window + (Windows Progman) rather than an application window, so 'show me my + screen' works on cua-driver's window-oriented capture surface.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + + windows_payload = { + "windows": [ + {"app_name": "Code", "pid": 11, "window_id": 1, + "is_on_screen": True, "title": "editor", "z_index": 0}, + {"app_name": "Progman", "pid": 4, "window_id": 99, + "is_on_screen": True, "title": "Program Manager", "z_index": 5}, + {"app_name": "Shell_TrayWnd", "pid": 4, "window_id": 50, + "is_on_screen": True, "title": "Taskbar", "z_index": 4}, + ], + } + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + if name == "get_window_state": + # Should be invoked against the desktop backdrop, not Code. 
+ assert args["window_id"] == 99 + return {"data": "✅ Desktop — 0 elements", "images": [], + "image_mime_types": [], "structuredContent": None, + "isError": False} + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": None, "isError": False} + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="ax", app="screen") + + assert backend._active_window_id == 99 + assert cap.app == "Progman" + + def test_capture_app_screen_no_desktop_window_surfaces_limitation(self): + """When no desktop/shell window is present, capture(app='screen') + returns a clear message about cua-driver's per-window capture limit + instead of silently grabbing the frontmost app.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + + windows_payload = { + "windows": [ + {"app_name": "Code", "pid": 11, "window_id": 1, + "is_on_screen": True, "title": "editor", "z_index": 0}, + ], + } + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + raise AssertionError(f"unexpected tool {name} — should short-circuit") + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="vision", app="desktop") + + assert cap.width == 0 and cap.height == 0 + assert cap.png_b64 is None + assert "captures one window at a time" in cap.window_title + class TestCapabilityDiscovery: """Surface 4 (NousResearch/hermes-agent#47072): the wrapper learns diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index af0bb9fc392..fbf9ff07b2c 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -78,6 +78,29 @@ _CUA_DRIVER_ARGS = ["mcp"] # stdio MCP transport (fallback when the # driver doesn't expose `manifest` — see # `_resolve_mcp_invocation` below) +# Whole-screen / 
desktop capture. cua-driver is a window-oriented driver — +# its `get_window_state` / `screenshot` tools capture a single window (by +# pid + window_id), and there is no MCP tool that captures the entire virtual +# desktop or an arbitrary monitor as one image. But the OS shell surfaces +# themselves (the desktop backdrop and the taskbar/menu-bar) are real windows +# that show up in `list_windows`, so "show me my screen" / "click the taskbar" +# is reachable by targeting those windows. When `app` is one of these +# sentinels, capture() resolves to the desktop/shell window instead of an +# application window. +_SCREEN_CAPTURE_SENTINELS = {"screen", "desktop", "fullscreen", "full screen", "all"} + +# Known shell/desktop window identifiers across platforms. Matched +# case-insensitively as a substring against both the window's app_name and +# its title (cua-driver surfaces the Win32 class name / app name here). +# Windows: Progman / WorkerW back the desktop; Shell_TrayWnd is the taskbar. +# macOS: Finder owns the desktop; the menu bar / Dock are the shell. +_DESKTOP_WINDOW_NAMES = ( + "progman", "workerw", "program manager", # Windows desktop + "shell_traywnd", "taskbar", # Windows taskbar + "finder", "desktop", "dock", # macOS desktop / shell +) + + # Env var cua-driver reads to gate its anonymous usage telemetry (PostHog). # Setting it to "0" disables telemetry; absence => the binary's own default # (telemetry ON upstream). @@ -968,7 +991,43 @@ class CuaDriverBackend(ComputerUseBackend): # returned by list_windows is the localized name (e.g. "計算機"), so # `app="Calculator"` legitimately matches no windows on a non-English # system and the caller needs to retry with the localized name. - if app: + if app and app.strip().lower() in _SCREEN_CAPTURE_SENTINELS: + # Whole-screen / desktop request. 
cua-driver has no virtual-desktop + # capture tool, so resolve to the OS shell/desktop window (the + # desktop backdrop or the taskbar/menu-bar), which list_windows + # does surface. This makes "show me my screen" and "click the + # taskbar" work; a single image still can't span multiple monitors + # — that's a driver limitation, not a wrapper one. + def _is_desktop_window(w: Dict[str, Any]) -> bool: + haystack = f"{w.get('app_name', '')} {w.get('title', '')}".lower() + return any(name in haystack for name in _DESKTOP_WINDOW_NAMES) + + desktop = [w for w in windows if _is_desktop_window(w)] + if not desktop: + return CaptureResult( + mode=mode, width=0, height=0, png_b64=None, + elements=[], app="", + window_title=( + f"<no desktop/shell window found for app={app!r}: " + "cua-driver captures one window at a time and has no " + "whole-screen or per-monitor capture tool, so a single " + "image cannot span the full desktop. Call capture() " + "without app, or pass a specific app name, to capture " + "an application window instead>" + ), + png_bytes_len=0, + ) + # Prefer the desktop backdrop (Progman/WorkerW/Finder) over the + # taskbar when both are present, so a bare "screen" capture shows + # the full desktop rather than just the task strip. + windows = sorted( + desktop, + key=lambda w: 0 if any( + n in f"{w.get('app_name', '')} {w.get('title', '')}".lower() + for n in ("progman", "workerw", "program manager", "finder", "desktop") + ) else 1, + ) + elif app: app_lower = app.lower() filtered = [w for w in windows if app_lower in w["app_name"].lower()] if not filtered: diff --git a/tools/computer_use/schema.py b/tools/computer_use/schema.py index 5bb855ccc0f..a3394d23276 100644 --- a/tools/computer_use/schema.py +++ b/tools/computer_use/schema.py @@ -71,9 +71,14 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = { "type": "string", "description": ( "Optional. Limit capture/action to a specific app " - "(by name, e.g. 'Safari' or 'Notepad', or bundle ID " - "where the platform supports it). If omitted, operates " - "on the frontmost app's window or the whole screen." + "(by name, e.g. 'Safari', or bundle ID, " + "'com.apple.Safari'). If omitted, operates on the " + "frontmost app's window. Pass app='screen' (or " + "'desktop') to capture the OS desktop/shell surface — " + "e.g.
to see the wallpaper or click the taskbar. Note: " + "capture is per-window; a single image cannot span " + "multiple monitors, so on a multi-screen setup capture " + "one window or display at a time." ), }, "max_elements": {