mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(computer-use): surface app=… filter no-match instead of silently using frontmost (#24170 bug 1)
`CuaDriverBackend.capture(app=X)` and `focus_app(app=X)` silently fell back
to the frontmost on-screen window when X matched no app — typically a
menu-bar utility (e.g. "Fuwari" in the bug reporter's case) rather than
the requested app. The agent then received UI elements for the wrong app
and clicked / typed into it.
The root cause is a localized macOS app name mismatch: `list_windows`
returns the localized `app_name` (e.g. "計算機" on a Japanese/Chinese
system) but callers naturally pass the English name ("Calculator"). The
substring filter doesn't match, and the code falls through to picking the
frontmost window with no signal that the filter was effectively dropped.
Fix:
- `capture(app=…)`: when the filter matches nothing, return a
`CaptureResult` with empty `app`/`elements` and a diagnostic
`window_title` pointing the caller at `list_apps` and noting the
localized-name convention. `_active_pid` / `_active_window_id` are left
untouched so a subsequent action doesn't inadvertently hit the wrong
process.
- `focus_app(app=…)`: when the filter matches nothing, set `target = None`
and let the existing `return ActionResult(ok=False, …, "No on-screen
window found for app …")` path fire instead of falsely reporting success
on the frontmost window.
This addresses bug 1 only from #24170. Bugs 2 & 5 are addressed in #30046;
bugs 3 & 4 in #30032.
This commit is contained in:
parent
4cc18877c6
commit
5aa4727f34
2 changed files with 156 additions and 3 deletions
|
|
@ -925,3 +925,136 @@ class TestCaptureAfterAppContext:
|
|||
# No app context — should pass None so cua-driver picks the frontmost window
|
||||
assert len(captured_app_args) == 1
|
||||
assert captured_app_args[0] is None
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regression tests for bug 1 from issue #24170:
|
||||
# capture(app=...) and focus_app(app=...) must surface when the filter
|
||||
# matches nothing instead of silently picking the frontmost window.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_cua_backend_with_windows(windows: List[Dict[str, Any]]):
|
||||
"""Construct a CuaDriverBackend with a mocked MCP session that returns
|
||||
the supplied list_windows payload."""
|
||||
from tools.computer_use.cua_backend import CuaDriverBackend
|
||||
|
||||
backend = CuaDriverBackend()
|
||||
backend._session = MagicMock()
|
||||
backend._session.call_tool.return_value = {
|
||||
"data": "",
|
||||
"images": [],
|
||||
"structuredContent": {"windows": windows},
|
||||
"isError": False,
|
||||
}
|
||||
return backend
|
||||
|
||||
|
||||
class TestCaptureAppFilterNoMatch:
|
||||
"""capture(app=X) must not silently fall back to the frontmost window
|
||||
when X matches nothing — on a non-English macOS, list_windows returns
|
||||
localized app names (e.g. "計算機"), so an English `app="Calculator"`
|
||||
legitimately matches nothing and the caller needs to retry with the
|
||||
localized name. The old code silently captured the frontmost window
|
||||
(e.g. a menu-bar utility), giving the agent wrong UI elements.
|
||||
"""
|
||||
|
||||
def test_app_filter_no_match_returns_empty_capture_with_diagnostic(self):
|
||||
# Simulates a localized macOS where Calculator's app_name is "計算機".
|
||||
windows = [
|
||||
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
|
||||
"is_on_screen": True, "title": "menu bar", "z_index": 0},
|
||||
{"app_name": "計算機", "pid": 200, "window_id": 2,
|
||||
"is_on_screen": True, "title": "Calculator", "z_index": 1},
|
||||
]
|
||||
backend = _make_cua_backend_with_windows(windows)
|
||||
|
||||
cap = backend.capture(mode="som", app="Calculator")
|
||||
|
||||
# No window matched; capture must NOT pick the frontmost (Fuwari).
|
||||
assert cap.app == "", (
|
||||
f"app= filter no-match should not silently target a window; got {cap.app!r}"
|
||||
)
|
||||
assert cap.elements == []
|
||||
assert "Calculator" in cap.window_title
|
||||
assert "list_apps" in cap.window_title
|
||||
# _active_pid must remain unset so a subsequent click doesn't hit Fuwari.
|
||||
assert backend._active_pid is None
|
||||
assert backend._active_window_id is None
|
||||
|
||||
def test_app_filter_match_still_works(self):
|
||||
windows = [
|
||||
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
|
||||
"is_on_screen": True, "title": "menu bar", "z_index": 0},
|
||||
{"app_name": "計算機", "pid": 200, "window_id": 2,
|
||||
"is_on_screen": True, "title": "Calculator", "z_index": 1},
|
||||
]
|
||||
backend = _make_cua_backend_with_windows(windows)
|
||||
# get_window_state for the matched window
|
||||
backend._session.call_tool.side_effect = [
|
||||
{"data": "", "images": [], "isError": False,
|
||||
"structuredContent": {"windows": windows}},
|
||||
{"data": '✅ 計算機 — 0 elements\n', "images": [], "isError": False,
|
||||
"structuredContent": None},
|
||||
]
|
||||
|
||||
cap = backend.capture(mode="ax", app="計算機")
|
||||
|
||||
assert backend._active_pid == 200
|
||||
assert backend._active_window_id == 2
|
||||
|
||||
def test_no_app_filter_still_picks_frontmost(self):
|
||||
"""When no app= is given, capture continues to pick the frontmost
|
||||
window — the no-match early-return must not fire on the empty case."""
|
||||
windows = [
|
||||
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
|
||||
"is_on_screen": True, "title": "menu bar", "z_index": 0},
|
||||
]
|
||||
backend = _make_cua_backend_with_windows(windows)
|
||||
backend._session.call_tool.side_effect = [
|
||||
{"data": "", "images": [], "isError": False,
|
||||
"structuredContent": {"windows": windows}},
|
||||
{"data": '✅ Fuwari — 0 elements\n', "images": [], "isError": False,
|
||||
"structuredContent": None},
|
||||
]
|
||||
|
||||
cap = backend.capture(mode="ax", app=None)
|
||||
|
||||
assert backend._active_pid == 100
|
||||
|
||||
|
||||
class TestFocusAppFilterNoMatch:
|
||||
"""focus_app(app=X) must return ok=False when X matches nothing —
|
||||
not silently target the frontmost window and report ok=True with a
|
||||
misleading 'Targeted Fuwari' message.
|
||||
"""
|
||||
|
||||
def test_focus_app_no_match_returns_not_ok(self):
|
||||
windows = [
|
||||
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
|
||||
"is_on_screen": True, "title": "menu bar", "z_index": 0},
|
||||
{"app_name": "計算機", "pid": 200, "window_id": 2,
|
||||
"is_on_screen": True, "title": "Calculator", "z_index": 1},
|
||||
]
|
||||
backend = _make_cua_backend_with_windows(windows)
|
||||
|
||||
res = backend.focus_app("Calculator")
|
||||
|
||||
assert res.ok is False
|
||||
assert res.action == "focus_app"
|
||||
assert "Calculator" in res.message
|
||||
# _active_pid must remain unset so a subsequent click doesn't hit Fuwari.
|
||||
assert backend._active_pid is None
|
||||
|
||||
def test_focus_app_match_still_works(self):
|
||||
windows = [
|
||||
{"app_name": "Fuwari", "pid": 100, "window_id": 1,
|
||||
"is_on_screen": True, "title": "menu bar", "z_index": 0},
|
||||
{"app_name": "計算機", "pid": 200, "window_id": 2,
|
||||
"is_on_screen": True, "title": "Calculator", "z_index": 1},
|
||||
]
|
||||
backend = _make_cua_backend_with_windows(windows)
|
||||
|
||||
res = backend.focus_app("計算機")
|
||||
|
||||
assert res.ok is True
|
||||
assert backend._active_pid == 200
|
||||
assert backend._active_window_id == 2
|
||||
|
|
|
|||
|
|
@ -393,11 +393,27 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
elements=[], app="", window_title="", png_bytes_len=0)
|
||||
|
||||
# Filter by app name (case-insensitive substring) if requested.
|
||||
# When the filter matches nothing, surface that explicitly instead of
|
||||
# silently capturing the frontmost window — on macOS the `app_name`
|
||||
# returned by list_windows is the localized name (e.g. "計算機"), so
|
||||
# `app="Calculator"` legitimately matches no windows on a non-English
|
||||
# system and the caller needs to retry with the localized name.
|
||||
if app:
|
||||
app_lower = app.lower()
|
||||
filtered = [w for w in windows if app_lower in w["app_name"].lower()]
|
||||
if filtered:
|
||||
windows = filtered
|
||||
if not filtered:
|
||||
return CaptureResult(
|
||||
mode=mode, width=0, height=0, png_b64=None,
|
||||
elements=[], app="",
|
||||
window_title=(
|
||||
f"<no on-screen window matched app={app!r}; "
|
||||
f"call list_apps to see available app names "
|
||||
f"(macOS reports localized names, e.g. '計算機' "
|
||||
f"instead of 'Calculator')>"
|
||||
),
|
||||
png_bytes_len=0,
|
||||
)
|
||||
windows = filtered
|
||||
|
||||
# Pick first on-screen window (sorted by z_index / z-order above).
|
||||
target = next((w for w in windows if not w["off_screen"]), windows[0])
|
||||
|
|
@ -658,7 +674,11 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
|
||||
app_lower = app.lower()
|
||||
matched = [w for w in windows if app_lower in w["app_name"].lower()]
|
||||
target = matched[0] if matched else (windows[0] if windows else None)
|
||||
# Don't silently fall back to the frontmost window when the filter
|
||||
# matches nothing — that hides the real failure (often a localized
|
||||
# macOS app name mismatch, e.g. caller passed "Calculator" but
|
||||
# list_windows returns "計算機").
|
||||
target = matched[0] if matched else None
|
||||
if target:
|
||||
self._active_pid = target["pid"]
|
||||
self._active_window_id = target["window_id"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue