fix(computer-use): surface app=… filter no-match instead of silently using frontmost (#24170 bug 1)

`CuaDriverBackend.capture(app=X)` and `focus_app(app=X)` silently fell back
to the frontmost on-screen window when X matched no app — typically a
menu-bar utility (e.g. "Fuwari" in the bug reporter's case) rather than
the requested app. The agent then received UI elements for the wrong app
and clicked / typed into it.

The root cause is a localized macOS app name mismatch: `list_windows`
returns the localized `app_name` (e.g. "計算機" on a Japanese/Chinese
system) but callers naturally pass the English name ("Calculator"). The
substring filter doesn't match, and the code falls through to picking the
frontmost window with no signal that the filter was effectively dropped.

Fix:

- `capture(app=…)`: when the filter matches nothing, return a
  `CaptureResult` with empty `app`/`elements` and a diagnostic
  `window_title` pointing the caller at `list_apps` and noting the
  localized-name convention. `_active_pid` / `_active_window_id` are left
  untouched so a subsequent action doesn't inadvertently hit the wrong
  process.
- `focus_app(app=…)`: when the filter matches nothing, set `target = None`
  and let the existing `return ActionResult(ok=False, …, "No on-screen
  window found for app …")` path fire instead of falsely reporting success
  on the frontmost window.

This addresses bug 1 only from #24170. Bugs 2 & 5 are addressed in #30046;
bugs 3 & 4 in #30032.
This commit is contained in:
briandevans 2026-05-12 04:19:01 -07:00 committed by Teknium
parent 4cc18877c6
commit 5aa4727f34
2 changed files with 156 additions and 3 deletions

View file

@ -925,3 +925,136 @@ class TestCaptureAfterAppContext:
# No app context — should pass None so cua-driver picks the frontmost window
assert len(captured_app_args) == 1
assert captured_app_args[0] is None
# ---------------------------------------------------------------------------
# Regression tests for bug 1 from issue #24170:
# capture(app=...) and focus_app(app=...) must surface when the filter
# matches nothing instead of silently picking the frontmost window.
# ---------------------------------------------------------------------------
def _make_cua_backend_with_windows(windows: List[Dict[str, Any]]):
"""Construct a CuaDriverBackend with a mocked MCP session that returns
the supplied list_windows payload."""
from tools.computer_use.cua_backend import CuaDriverBackend
backend = CuaDriverBackend()
backend._session = MagicMock()
backend._session.call_tool.return_value = {
"data": "",
"images": [],
"structuredContent": {"windows": windows},
"isError": False,
}
return backend
class TestCaptureAppFilterNoMatch:
"""capture(app=X) must not silently fall back to the frontmost window
when X matches nothing on a non-English macOS, list_windows returns
localized app names (e.g. "計算機"), so an English `app="Calculator"`
legitimately matches nothing and the caller needs to retry with the
localized name. The old code silently captured the frontmost window
(e.g. a menu-bar utility), giving the agent wrong UI elements.
"""
def test_app_filter_no_match_returns_empty_capture_with_diagnostic(self):
# Simulates a localized macOS where Calculator's app_name is "計算機".
windows = [
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
"is_on_screen": True, "title": "menu bar", "z_index": 0},
{"app_name": "計算機", "pid": 200, "window_id": 2,
"is_on_screen": True, "title": "Calculator", "z_index": 1},
]
backend = _make_cua_backend_with_windows(windows)
cap = backend.capture(mode="som", app="Calculator")
# No window matched; capture must NOT pick the frontmost (Fuwari).
assert cap.app == "", (
f"app= filter no-match should not silently target a window; got {cap.app!r}"
)
assert cap.elements == []
assert "Calculator" in cap.window_title
assert "list_apps" in cap.window_title
# _active_pid must remain unset so a subsequent click doesn't hit Fuwari.
assert backend._active_pid is None
assert backend._active_window_id is None
def test_app_filter_match_still_works(self):
windows = [
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
"is_on_screen": True, "title": "menu bar", "z_index": 0},
{"app_name": "計算機", "pid": 200, "window_id": 2,
"is_on_screen": True, "title": "Calculator", "z_index": 1},
]
backend = _make_cua_backend_with_windows(windows)
# get_window_state for the matched window
backend._session.call_tool.side_effect = [
{"data": "", "images": [], "isError": False,
"structuredContent": {"windows": windows}},
{"data": '✅ 計算機 — 0 elements\n', "images": [], "isError": False,
"structuredContent": None},
]
cap = backend.capture(mode="ax", app="計算機")
assert backend._active_pid == 200
assert backend._active_window_id == 2
def test_no_app_filter_still_picks_frontmost(self):
"""When no app= is given, capture continues to pick the frontmost
window the no-match early-return must not fire on the empty case."""
windows = [
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
"is_on_screen": True, "title": "menu bar", "z_index": 0},
]
backend = _make_cua_backend_with_windows(windows)
backend._session.call_tool.side_effect = [
{"data": "", "images": [], "isError": False,
"structuredContent": {"windows": windows}},
{"data": '✅ Fuwari — 0 elements\n', "images": [], "isError": False,
"structuredContent": None},
]
cap = backend.capture(mode="ax", app=None)
assert backend._active_pid == 100
class TestFocusAppFilterNoMatch:
"""focus_app(app=X) must return ok=False when X matches nothing —
not silently target the frontmost window and report ok=True with a
misleading 'Targeted Fuwari' message.
"""
def test_focus_app_no_match_returns_not_ok(self):
windows = [
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
"is_on_screen": True, "title": "menu bar", "z_index": 0},
{"app_name": "計算機", "pid": 200, "window_id": 2,
"is_on_screen": True, "title": "Calculator", "z_index": 1},
]
backend = _make_cua_backend_with_windows(windows)
res = backend.focus_app("Calculator")
assert res.ok is False
assert res.action == "focus_app"
assert "Calculator" in res.message
# _active_pid must remain unset so a subsequent click doesn't hit Fuwari.
assert backend._active_pid is None
def test_focus_app_match_still_works(self):
windows = [
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
"is_on_screen": True, "title": "menu bar", "z_index": 0},
{"app_name": "計算機", "pid": 200, "window_id": 2,
"is_on_screen": True, "title": "Calculator", "z_index": 1},
]
backend = _make_cua_backend_with_windows(windows)
res = backend.focus_app("計算機")
assert res.ok is True
assert backend._active_pid == 200
assert backend._active_window_id == 2

View file

@ -393,11 +393,27 @@ class CuaDriverBackend(ComputerUseBackend):
elements=[], app="", window_title="", png_bytes_len=0)
# Filter by app name (case-insensitive substring) if requested.
# When the filter matches nothing, surface that explicitly instead of
# silently capturing the frontmost window — on macOS the `app_name`
# returned by list_windows is the localized name (e.g. "計算機"), so
# `app="Calculator"` legitimately matches no windows on a non-English
# system and the caller needs to retry with the localized name.
if app:
app_lower = app.lower()
filtered = [w for w in windows if app_lower in w["app_name"].lower()]
if filtered:
windows = filtered
if not filtered:
return CaptureResult(
mode=mode, width=0, height=0, png_b64=None,
elements=[], app="",
window_title=(
f"<no on-screen window matched app={app!r}; "
f"call list_apps to see available app names "
f"(macOS reports localized names, e.g. '計算機' "
f"instead of 'Calculator')>"
),
png_bytes_len=0,
)
windows = filtered
# Pick first on-screen window (sorted by z_index / z-order above).
target = next((w for w in windows if not w["off_screen"]), windows[0])
@ -658,7 +674,11 @@ class CuaDriverBackend(ComputerUseBackend):
app_lower = app.lower()
matched = [w for w in windows if app_lower in w["app_name"].lower()]
target = matched[0] if matched else (windows[0] if windows else None)
# Don't silently fall back to the frontmost window when the filter
# matches nothing — that hides the real failure (often a localized
# macOS app name mismatch, e.g. caller passed "Calculator" but
# list_windows returns "計算機").
target = matched[0] if matched else None
if target:
self._active_pid = target["pid"]
self._active_window_id = target["window_id"]