diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index 3c8bdd9713d..7afaa7b57de 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -925,3 +925,136 @@ class TestCaptureAfterAppContext: # No app context — should pass None so cua-driver picks the frontmost window assert len(captured_app_args) == 1 assert captured_app_args[0] is None + +# --------------------------------------------------------------------------- +# Regression tests for bug 1 from issue #24170: +# capture(app=...) and focus_app(app=...) must surface when the filter +# matches nothing instead of silently picking the frontmost window. +# --------------------------------------------------------------------------- + +def _make_cua_backend_with_windows(windows: List[Dict[str, Any]]): + """Construct a CuaDriverBackend with a mocked MCP session that returns + the supplied list_windows payload.""" + from tools.computer_use.cua_backend import CuaDriverBackend + + backend = CuaDriverBackend() + backend._session = MagicMock() + backend._session.call_tool.return_value = { + "data": "", + "images": [], + "structuredContent": {"windows": windows}, + "isError": False, + } + return backend + + +class TestCaptureAppFilterNoMatch: + """capture(app=X) must not silently fall back to the frontmost window + when X matches nothing — on a non-English macOS, list_windows returns + localized app names (e.g. "計算機"), so an English `app="Calculator"` + legitimately matches nothing and the caller needs to retry with the + localized name. The old code silently captured the frontmost window + (e.g. a menu-bar utility), giving the agent wrong UI elements. + """ + + def test_app_filter_no_match_returns_empty_capture_with_diagnostic(self): + # Simulates a localized macOS where Calculator's app_name is "計算機". + windows = [ + {"app_name": "Fuwari", "pid": 100, "window_id": 1, + "is_on_screen": True, "title": "menu bar", "z_index": 0}, + {"app_name": "計算機", "pid": 200, "window_id": 2, + "is_on_screen": True, "title": "Calculator", "z_index": 1}, + ] + backend = _make_cua_backend_with_windows(windows) + + cap = backend.capture(mode="som", app="Calculator") + + # No window matched; capture must NOT pick the frontmost (Fuwari). + assert cap.app == "", ( + f"app= filter no-match should not silently target a window; got {cap.app!r}" + ) + assert cap.elements == [] + assert "Calculator" in cap.window_title + assert "list_apps" in cap.window_title + # _active_pid must remain unset so a subsequent click doesn't hit Fuwari. + assert backend._active_pid is None + assert backend._active_window_id is None + + def test_app_filter_match_still_works(self): + windows = [ + {"app_name": "Fuwari", "pid": 100, "window_id": 1, + "is_on_screen": True, "title": "menu bar", "z_index": 0}, + {"app_name": "計算機", "pid": 200, "window_id": 2, + "is_on_screen": True, "title": "Calculator", "z_index": 1}, + ] + backend = _make_cua_backend_with_windows(windows) + # get_window_state for the matched window + backend._session.call_tool.side_effect = [ + {"data": "", "images": [], "isError": False, + "structuredContent": {"windows": windows}}, + {"data": '✅ 計算機 — 0 elements\n', "images": [], "isError": False, + "structuredContent": None}, + ] + + cap = backend.capture(mode="ax", app="計算機") + + assert backend._active_pid == 200 + assert backend._active_window_id == 2 + + def test_no_app_filter_still_picks_frontmost(self): + """When no app= is given, capture continues to pick the frontmost + window — the no-match early-return must not fire on the empty case.""" + windows = [ + {"app_name": "Fuwari", "pid": 100, "window_id": 1, + "is_on_screen": True, "title": "menu bar", "z_index": 0}, + ] + backend = _make_cua_backend_with_windows(windows) + backend._session.call_tool.side_effect = [ + {"data": "", "images": [], "isError": False, + "structuredContent": {"windows": windows}}, + {"data": '✅ Fuwari — 0 elements\n', "images": [], "isError": False, + "structuredContent": None}, + ] + + cap = backend.capture(mode="ax", app=None) + + assert backend._active_pid == 100 + + +class TestFocusAppFilterNoMatch: + """focus_app(app=X) must return ok=False when X matches nothing — + not silently target the frontmost window and report ok=True with a + misleading 'Targeted Fuwari' message. + """ + + def test_focus_app_no_match_returns_not_ok(self): + windows = [ + {"app_name": "Fuwari", "pid": 100, "window_id": 1, + "is_on_screen": True, "title": "menu bar", "z_index": 0}, + {"app_name": "計算機", "pid": 200, "window_id": 2, + "is_on_screen": True, "title": "Calculator", "z_index": 1}, + ] + backend = _make_cua_backend_with_windows(windows) + + res = backend.focus_app("Calculator") + + assert res.ok is False + assert res.action == "focus_app" + assert "Calculator" in res.message + # _active_pid must remain unset so a subsequent click doesn't hit Fuwari. + assert backend._active_pid is None + + def test_focus_app_match_still_works(self): + windows = [ + {"app_name": "Fuwari", "pid": 100, "window_id": 1, + "is_on_screen": True, "title": "menu bar", "z_index": 0}, + {"app_name": "計算機", "pid": 200, "window_id": 2, + "is_on_screen": True, "title": "Calculator", "z_index": 1}, + ] + backend = _make_cua_backend_with_windows(windows) + + res = backend.focus_app("計算機") + + assert res.ok is True + assert backend._active_pid == 200 + assert backend._active_window_id == 2 diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index afeb04e1342..ffdeeb2a388 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -393,11 +393,27 @@ class CuaDriverBackend(ComputerUseBackend): elements=[], app="", window_title="", png_bytes_len=0) # Filter by app name (case-insensitive substring) if requested. + # When the filter matches nothing, surface that explicitly instead of + # silently capturing the frontmost window — on macOS the `app_name` + # returned by list_windows is the localized name (e.g. "計算機"), so + # `app="Calculator"` legitimately matches no windows on a non-English + # system and the caller needs to retry with the localized name. if app: app_lower = app.lower() filtered = [w for w in windows if app_lower in w["app_name"].lower()] - if filtered: - windows = filtered + if not filtered: + return CaptureResult( + mode=mode, width=0, height=0, png_b64=None, + elements=[], app="", + window_title=( + f"" + ), + png_bytes_len=0, + ) + windows = filtered # Pick first on-screen window (sorted by z_index / z-order above). target = next((w for w in windows if not w["off_screen"]), windows[0]) @@ -658,7 +674,11 @@ class CuaDriverBackend(ComputerUseBackend): app_lower = app.lower() matched = [w for w in windows if app_lower in w["app_name"].lower()] - target = matched[0] if matched else (windows[0] if windows else None) + # Don't silently fall back to the frontmost window when the filter + # matches nothing — that hides the real failure (often a localized + # macOS app name mismatch, e.g. caller passed "Calculator" but + # list_windows returns "計算機"). + target = matched[0] if matched else None if target: self._active_pid = target["pid"] self._active_window_id = target["window_id"]