diff --git a/scripts/release.py b/scripts/release.py index 2c781838fc8..ebb80074b7e 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json" # Auto-extracted from noreply emails + manual overrides AUTHOR_MAP = { + "jeevesassistant00@gmail.com": "jeeves-assistant", # PR #50771 (computer-use CuaDriver vision capture routing) "21178861+ScotterMonk@users.noreply.github.com": "ScotterMonk", # PR #50145 salvage (cron output truncation: adapter-aware chunking, #50126) "rrandqua@gmail.com": "TutkuEroglu", # PR #50481 salvage (AGENTS.md stale token-lock adapter path) "f@trycua.com": "f-trycua", # PR #50507 salvage (cross-platform computer_use; supersedes #44221/#30660) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index c75d87c8513..85f62e4e3c7 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -2139,6 +2139,123 @@ class TestStructuredElementsConsumption: # Markdown surface doesn't carry bounds — lossy by design. assert cap.elements[0].bounds == (0, 0, 0, 0) + def test_vision_capture_falls_back_to_get_window_state_when_screenshot_dropped(self): + """cua-driver >=0.5.x dropped the standalone `screenshot` MCP tool and + folded full-window PNG capture into `get_window_state`. When the driver + no longer advertises `screenshot`, vision capture must route through + `get_window_state` (discarding the AX tree) and still return a PNG.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + # Modern driver: capabilities discovered, `screenshot` not advertised. + backend._session._has_tool.return_value = False + backend._session.capabilities_discovered = True + + windows_payload = { + "windows": [{ + "app_name": "Demo", "pid": 9, "window_id": 1, + "is_on_screen": True, "title": "Demo", "z_index": 0, + }], + } + png_b64 = ( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m" + "NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" + ) + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + if name == "get_window_state": + return {"data": "", "images": [png_b64], + "image_mime_types": ["image/png"], + "structuredContent": None, "isError": False} + if name == "screenshot": + raise AssertionError("driver dropped screenshot; must not be called") + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": None, "isError": False} + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="vision") + + tool_names = [call.args[0] for call in backend._session.call_tool.call_args_list] + assert tool_names == ["list_windows", "get_window_state"] + assert cap.png_b64 == png_b64 + assert cap.image_mime_type == "image/png" + assert cap.width == 1 + assert cap.height == 1 + # Vision mode stays free of AX element noise. + assert cap.elements == [] + + def test_capture_app_screen_targets_desktop_window(self): + """capture(app='screen') resolves to the OS shell/desktop window + (Windows Progman) rather than an application window, so 'show me my + screen' works on cua-driver's window-oriented capture surface.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + + windows_payload = { + "windows": [ + {"app_name": "Code", "pid": 11, "window_id": 1, + "is_on_screen": True, "title": "editor", "z_index": 0}, + {"app_name": "Progman", "pid": 4, "window_id": 99, + "is_on_screen": True, "title": "Program Manager", "z_index": 5}, + {"app_name": "Shell_TrayWnd", "pid": 4, "window_id": 50, + "is_on_screen": True, "title": "Taskbar", "z_index": 4}, + ], + } + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + if name == "get_window_state": + # Should be invoked against the desktop backdrop, not Code. + assert args["window_id"] == 99 + return {"data": "✅ Desktop — 0 elements", "images": [], + "image_mime_types": [], "structuredContent": None, + "isError": False} + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": None, "isError": False} + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="ax", app="screen") + + assert backend._active_window_id == 99 + assert cap.app == "Progman" + + def test_capture_app_screen_no_desktop_window_surfaces_limitation(self): + """When no desktop/shell window is present, capture(app='screen') + returns a clear message about cua-driver's per-window capture limit + instead of silently grabbing the frontmost app.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + + windows_payload = { + "windows": [ + {"app_name": "Code", "pid": 11, "window_id": 1, + "is_on_screen": True, "title": "editor", "z_index": 0}, + ], + } + + def fake_call_tool(name, args): + if name == "list_windows": + return {"data": "", "images": [], "image_mime_types": [], + "structuredContent": windows_payload, "isError": False} + raise AssertionError(f"unexpected tool {name} — should short-circuit") + + backend._session.call_tool.side_effect = fake_call_tool + cap = backend.capture(mode="vision", app="desktop") + + assert cap.width == 0 and cap.height == 0 + assert cap.png_b64 is None + assert "captures one window at a time" in cap.window_title + class TestCapabilityDiscovery: """Surface 4 (NousResearch/hermes-agent#47072): the wrapper learns diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index 5acf28faf98..a8077204f97 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -78,6 +78,29 @@ _CUA_DRIVER_ARGS = ["mcp"] # stdio MCP transport (fallback when the # driver doesn't expose `manifest` — see # `_resolve_mcp_invocation` below) +# Whole-screen / desktop capture. cua-driver is a window-oriented driver — +# its `get_window_state` / `screenshot` tools capture a single window (by +# pid + window_id), and there is no MCP tool that captures the entire virtual +# desktop or an arbitrary monitor as one image. But the OS shell surfaces +# themselves (the desktop backdrop and the taskbar/menu-bar) are real windows +# that show up in `list_windows`, so "show me my screen" / "click the taskbar" +# is reachable by targeting those windows. When `app` is one of these +# sentinels, capture() resolves to the desktop/shell window instead of an +# application window. +_SCREEN_CAPTURE_SENTINELS = {"screen", "desktop", "fullscreen", "full screen", "all"} + +# Known shell/desktop window identifiers across platforms. Matched +# case-insensitively as a substring against both the window's app_name and +# its title (cua-driver surfaces the Win32 class name / app name here). +# Windows: Progman / WorkerW back the desktop; Shell_TrayWnd is the taskbar. +# macOS: Finder owns the desktop; the menu bar / Dock are the shell. +_DESKTOP_WINDOW_NAMES = ( + "progman", "workerw", "program manager", # Windows desktop + "shell_traywnd", "taskbar", # Windows taskbar + "finder", "desktop", "dock", # macOS desktop / shell +) + + # Env var cua-driver reads to gate its anonymous usage telemetry (PostHog). # Setting it to "0" disables telemetry; absence => the binary's own default # (telemetry ON upstream). @@ -1029,7 +1052,43 @@ class CuaDriverBackend(ComputerUseBackend): # returned by list_windows is the localized name (e.g. "計算機"), so # `app="Calculator"` legitimately matches no windows on a non-English # system and the caller needs to retry with the localized name. - if app: + if app and app.strip().lower() in _SCREEN_CAPTURE_SENTINELS: + # Whole-screen / desktop request. cua-driver has no virtual-desktop + # capture tool, so resolve to the OS shell/desktop window (the + # desktop backdrop or the taskbar/menu-bar), which list_windows + # does surface. This makes "show me my screen" and "click the + # taskbar" work; a single image still can't span multiple monitors + # — that's a driver limitation, not a wrapper one. + def _is_desktop_window(w: Dict[str, Any]) -> bool: + haystack = f"{w.get('app_name', '')} {w.get('title', '')}".lower() + return any(name in haystack for name in _DESKTOP_WINDOW_NAMES) + + desktop = [w for w in windows if _is_desktop_window(w)] + if not desktop: + return CaptureResult( + mode=mode, width=0, height=0, png_b64=None, + elements=[], app="", + window_title=( + f"" + ), + png_bytes_len=0, + ) + # Prefer the desktop backdrop (Progman/WorkerW/Finder) over the + # taskbar when both are present, so a bare "screen" capture shows + # the full desktop rather than just the task strip. + windows = sorted( + desktop, + key=lambda w: 0 if any( + n in f"{w.get('app_name', '')} {w.get('title', '')}".lower() + for n in ("progman", "workerw", "program manager", "finder", "desktop") + ) else 1, + ) + elif app: app_lower = app.lower() filtered = [w for w in windows if app_lower in w["app_name"].lower()] if not filtered: diff --git a/tools/computer_use/schema.py b/tools/computer_use/schema.py index 5bb855ccc0f..a3394d23276 100644 --- a/tools/computer_use/schema.py +++ b/tools/computer_use/schema.py @@ -71,9 +71,14 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = { "type": "string", "description": ( "Optional. Limit capture/action to a specific app " - "(by name, e.g. 'Safari' or 'Notepad', or bundle ID " - "where the platform supports it). If omitted, operates " - "on the frontmost app's window or the whole screen." + "(by name, e.g. 'Safari', or bundle ID, " + "'com.apple.Safari'). If omitted, operates on the " + "frontmost app's window. Pass app='screen' (or " + "'desktop') to capture the OS desktop/shell surface — " + "e.g. to see the wallpaper or click the taskbar. Note: " + "capture is per-window; a single image cannot span " + "multiple monitors, so on a multi-screen setup capture " + "one window or display at a time." ), }, "max_elements": {