From 30e5d0092dacc35fb0a09d537077e93f495bb90a Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:21:48 -0700 Subject: [PATCH] feat(computer-use): add whole-screen/desktop capture target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit capture(app='screen'|'desktop') now resolves to the OS shell/desktop window (Windows Progman/WorkerW desktop or Shell_TrayWnd taskbar, macOS Finder/Dock) so 'show me my screen' and 'click the taskbar' work. Previously capture() only matched application windows, and the schema advertised 'or the whole screen' without any code path delivering it. cua-driver is window-oriented (no virtual-desktop or per-monitor MCP tool), so a single image still cannot span multiple monitors — the schema now states this and the no-desktop-window path returns a clear message instead of silently grabbing the frontmost app. --- tests/tools/test_computer_use.py | 68 +++++++++++++++++++++++++++++++ tools/computer_use/cua_backend.py | 61 ++++++++++++++++++++++++++- tools/computer_use/schema.py | 11 +++-- 3 files changed, 136 insertions(+), 4 deletions(-) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index b22f918154d..673ad8a29c1 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -2183,6 +2183,74 @@ class TestStructuredElementsConsumption: assert cap.width == 1 assert cap.height == 1 + def test_capture_app_screen_targets_desktop_window(self): + """capture(app='screen') resolves to the OS shell/desktop window + (Windows Progman) rather than an application window, so 'show me my + screen' works on cua-driver's window-oriented capture surface.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + + windows_payload = { + "windows": [ + {"app_name": "Code", "pid": 11, "window_id": 1, + "is_on_screen": True, "title": 
"editor", "z_index": 0}, + {"app_name": "Progman", "pid": 4, "window_id": 99, + "is_on_screen": True, "title": "Program Manager", "z_index": 5}, + {"app_name": "Shell_TrayWnd", "pid": 4, "window_id": 50, + "is_on_screen": True, "title": "Taskbar", "z_index": 4}, + ], + } + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + if name == "get_window_state": + # Should be invoked against the desktop backdrop, not Code. + assert args["window_id"] == 99 + return {"data": "✅ Desktop — 0 elements", "images": [], + "image_mime_types": [], "structuredContent": None, + "isError": False} + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": None, "isError": False} + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="ax", app="screen") + + assert backend._active_window_id == 99 + assert cap.app == "Progman" + + def test_capture_app_screen_no_desktop_window_surfaces_limitation(self): + """When no desktop/shell window is present, capture(app='screen') + returns a clear message about cua-driver's per-window capture limit + instead of silently grabbing the frontmost app.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + + windows_payload = { + "windows": [ + {"app_name": "Code", "pid": 11, "window_id": 1, + "is_on_screen": True, "title": "editor", "z_index": 0}, + ], + } + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + raise AssertionError(f"unexpected tool {name} — should short-circuit") + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="vision", app="desktop") + + assert cap.width == 0 and cap.height == 0 + assert cap.png_b64 is None 
+ assert "captures one window at a time" in cap.window_title + class TestCapabilityDiscovery: """Surface 4 (NousResearch/hermes-agent#47072): the wrapper learns diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index af0bb9fc392..fbf9ff07b2c 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -78,6 +78,29 @@ _CUA_DRIVER_ARGS = ["mcp"] # stdio MCP transport (fallback when the # driver doesn't expose `manifest` — see # `_resolve_mcp_invocation` below) +# Whole-screen / desktop capture. cua-driver is a window-oriented driver — +# its `get_window_state` / `screenshot` tools capture a single window (by +# pid + window_id), and there is no MCP tool that captures the entire virtual +# desktop or an arbitrary monitor as one image. But the OS shell surfaces +# themselves (the desktop backdrop and the taskbar/menu-bar) are real windows +# that show up in `list_windows`, so "show me my screen" / "click the taskbar" +# is reachable by targeting those windows. When `app` is one of these +# sentinels, capture() resolves to the desktop/shell window instead of an +# application window. +_SCREEN_CAPTURE_SENTINELS = {"screen", "desktop", "fullscreen", "full screen", "all"} + +# Known shell/desktop window identifiers across platforms. Matched +# case-insensitively as a substring against both the window's app_name and +# its title (cua-driver surfaces the Win32 class name / app name here). +# Windows: Progman / WorkerW back the desktop; Shell_TrayWnd is the taskbar. +# macOS: Finder owns the desktop; the menu bar / Dock are the shell. +_DESKTOP_WINDOW_NAMES = ( + "progman", "workerw", "program manager", # Windows desktop + "shell_traywnd", "taskbar", # Windows taskbar + "finder", "desktop", "dock", # macOS desktop / shell +) + + # Env var cua-driver reads to gate its anonymous usage telemetry (PostHog). # Setting it to "0" disables telemetry; absence => the binary's own default # (telemetry ON upstream). 
@@ -968,7 +991,43 @@ class CuaDriverBackend(ComputerUseBackend): # returned by list_windows is the localized name (e.g. "計算機"), so # `app="Calculator"` legitimately matches no windows on a non-English # system and the caller needs to retry with the localized name. - if app: + if app and app.strip().lower() in _SCREEN_CAPTURE_SENTINELS: + # Whole-screen / desktop request. cua-driver has no virtual-desktop + # capture tool, so resolve to the OS shell/desktop window (the + # desktop backdrop or the taskbar/menu-bar), which list_windows + # does surface. This makes "show me my screen" and "click the + # taskbar" work; a single image still can't span multiple monitors + # — that's a driver limitation, not a wrapper one. + def _is_desktop_window(w: Dict[str, Any]) -> bool: + haystack = f"{w.get('app_name', '')} {w.get('title', '')}".lower() + return any(name in haystack for name in _DESKTOP_WINDOW_NAMES) + + desktop = [w for w in windows if _is_desktop_window(w)] + if not desktop: + return CaptureResult( + mode=mode, width=0, height=0, png_b64=None, + elements=[], app="", + window_title=( + f"<no desktop/shell window found for app={app!r}. " + "cua-driver captures one window at a time and has " + "no whole-screen or per-monitor capture tool, so a " + "single image cannot span the entire desktop. Call " + "capture() without app for the frontmost window, or " + "pass a specific application name instead.>" + ), + png_bytes_len=0, + ) + # Prefer the desktop backdrop (Progman/WorkerW/Finder) over the + # taskbar when both are present, so a bare "screen" capture shows + # the full desktop rather than just the task strip. + windows = sorted( + desktop, + key=lambda w: 0 if any( + n in f"{w.get('app_name', '')} {w.get('title', '')}".lower() + for n in ("progman", "workerw", "program manager", "finder", "desktop") + ) else 1, + ) + elif app: app_lower = app.lower() filtered = [w for w in windows if app_lower in w["app_name"].lower()] if not filtered: diff --git a/tools/computer_use/schema.py b/tools/computer_use/schema.py index 5bb855ccc0f..a3394d23276 100644 --- a/tools/computer_use/schema.py +++ b/tools/computer_use/schema.py @@ -71,9 +71,14 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = { "type": "string", "description": ( "Optional. Limit capture/action to a specific app " - "(by name, e.g. 
'Safari' or 'Notepad', or bundle ID " - "where the platform supports it). If omitted, operates " - "on the frontmost app's window or the whole screen." + "(by name, e.g. 'Safari', or bundle ID, " + "'com.apple.Safari'). If omitted, operates on the " + "frontmost app's window. Pass app='screen' (or " + "'desktop') to capture the OS desktop/shell surface — " + "e.g. to see the wallpaper or click the taskbar. Note: " + "capture is per-window; a single image cannot span " + "multiple monitors, so on a multi-screen setup capture " + "one window at a time." ), }, "max_elements": {