From 4cc18877c69bbc2663fab79eebacdb7018f85c14 Mon Sep 17 00:00:00 2001 From: Bartok9 Date: Tue, 12 May 2026 03:36:38 -0400 Subject: [PATCH] fix(computer_use): preserve app context for capture_after; fix element label parsing (#24170 bugs 2 & 5) Bug 2 (capture_after=True loses app context): _maybe_follow_capture called backend.capture(mode='som') with no app=, causing cua-driver to capture the frontmost window instead of the app targeted by the preceding capture/focus_app. Fix: track _last_app on CuaDriverBackend and thread it through the follow-up capture call so the same app is re-captured regardless of which window has OS focus. Bug 5 (element labels stripped in capture results): _ELEMENT_LINE_RE matched the classic ' - [N] AXRole "label"' format but not the '[N] AXRole (order) id=Label' format introduced in cua-driver v0.1.6. All element labels were silently dropped as empty strings, making element identification impossible. Fix: extend regex to capture both group(3) (quoted label) and group(4) (id= label), and update _parse_elements_from_tree to use group(4) as fallback. Both old and new cua-driver output now produce populated UIElement.label values. focus_app() now also sets _last_app so that capture_after= on any subsequent action re-targets the focused app. 5 new regression tests added. Part of #24170 (bugs 1 and 3/4 addressed separately). --- tests/tools/test_computer_use.py | 196 ++++++++++++++++++++++++++++++ tools/computer_use/cua_backend.py | 30 ++++- tools/computer_use/tool.py | 6 +- 3 files changed, 226 insertions(+), 6 deletions(-) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index 4a108d1ce51..3c8bdd9713d 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -729,3 +729,199 @@ class TestUniversality: source = inspect.getsource(entry.check_fn) assert "anthropic" not in source.lower() assert "openai" not in source.lower() + + +# --------------------------------------------------------------------------- +# Regression tests for bugs 2 & 5 from issue #24170 (cua-driver v0.1.6) +# --------------------------------------------------------------------------- + +class TestElementLabelParsing: + """Bug 5: element labels stripped in capture results (cua-driver v0.1.6 format). + + cua-driver ≥0.1.6 emits ``[N] AXRole (order) id=Label`` instead of + `` - [N] AXRole "label"``. _parse_elements_from_tree must handle both. + """ + + def test_classic_quoted_label_format(self): + from tools.computer_use.cua_backend import _parse_elements_from_tree + tree = ( + ' - [14] AXButton "One"\n' + ' - [15] AXButton "Two"\n' + ' - [16] AXTextField ""\n' + ) + els = _parse_elements_from_tree(tree) + assert len(els) == 3 + assert els[0].index == 14 + assert els[0].role == "AXButton" + assert els[0].label == "One" + assert els[1].label == "Two" + assert els[2].label == "" # empty quoted label + + def test_new_id_eq_format(self): + """cua-driver v0.1.6 format: [N] AXRole (order) id=Label""" + from tools.computer_use.cua_backend import _parse_elements_from_tree + tree = ( + "[14] AXButton (1) id=One\n" + "[15] AXButton (2) id=Two\n" + "[16] AXTextField (3) id=\n" + ) + els = _parse_elements_from_tree(tree) + assert len(els) == 3 + assert els[0].index == 14 + assert els[0].role == "AXButton" + assert els[0].label == "One" + assert els[1].label == "Two" + assert els[2].label == "" # empty id= value + + def test_mixed_formats_in_single_tree(self): + """Gracefully handles trees that mix old and new line formats.""" + from tools.computer_use.cua_backend import _parse_elements_from_tree + tree = ( + ' - [1] AXWindow "Main Window"\n' + "[14] AXButton (1) id=One\n" + ' - [15] AXTextField "Search"\n' + ) + els = _parse_elements_from_tree(tree) + assert len(els) == 3 + labels = {e.index: e.label for e in els} + assert labels[1] == "Main Window" + assert labels[14] == "One" + assert labels[15] == "Search" + + +class TestCaptureAfterAppContext: + """Bug 2: capture_after=True loses app context after actions. + + _maybe_follow_capture must re-target the same app that was set by + the preceding capture/focus_app call, rather than the frontmost window. + """ + + def test_capture_after_uses_last_app(self): + """capture_after=True should pass _last_app to the follow-up capture.""" + from tools.computer_use.backend import ActionResult, CaptureResult + from tools.computer_use import tool as cu_tool + + captured_app_args = [] + + class TrackingBackend: + _last_app = "Calculator" # simulates a previous focus_app call + + def start(self): + pass + + def stop(self): + pass + + def is_available(self): + return True + + def capture(self, mode="som", app=None): + captured_app_args.append(app) + return CaptureResult( + mode=mode, width=100, height=100, + png_b64=None, elements=[], + app=app or "Calculator", window_title="", + ) + + def click(self, **kw): + return ActionResult(ok=True, action="click") + + def drag(self, **kw): + return ActionResult(ok=True, action="drag") + + def scroll(self, **kw): + return ActionResult(ok=True, action="scroll") + + def type_text(self, text): + return ActionResult(ok=True, action="type") + + def key(self, keys): + return ActionResult(ok=True, action="key") + + def list_apps(self): + return [] + + def focus_app(self, app, raise_window=False): + return ActionResult(ok=True, action="focus_app") + + def set_value(self, value, element=None): + return ActionResult(ok=True, action="set_value") + + def wait(self, seconds=1.0): + return ActionResult(ok=True, action="wait") + + backend = TrackingBackend() + cu_tool.reset_backend_for_tests() + cu_tool._backend = backend + + cu_tool.handle_computer_use({"action": "click", "element": 14, "capture_after": True}) + + # The follow-up capture must have been called with app="Calculator" + assert len(captured_app_args) == 1 + assert captured_app_args[0] == "Calculator", ( + f"Expected follow-up capture with app='Calculator', got {captured_app_args[0]!r}" + ) + + def test_capture_after_without_prior_app_uses_none(self): + """When no app context is set, follow-up capture uses app=None (frontmost).""" + from tools.computer_use.backend import ActionResult, CaptureResult + from tools.computer_use import tool as cu_tool + + captured_app_args = [] + + class NoContextBackend: + _last_app = None # no prior context + + def start(self): + pass + + def stop(self): + pass + + def is_available(self): + return True + + def capture(self, mode="som", app=None): + captured_app_args.append(app) + return CaptureResult( + mode=mode, width=100, height=100, + png_b64=None, elements=[], + app="Finder", window_title="", + ) + + def click(self, **kw): + return ActionResult(ok=True, action="click") + + def drag(self, **kw): + return ActionResult(ok=True, action="drag") + + def scroll(self, **kw): + return ActionResult(ok=True, action="scroll") + + def type_text(self, text): + return ActionResult(ok=True, action="type") + + def key(self, keys): + return ActionResult(ok=True, action="key") + + def list_apps(self): + return [] + + def focus_app(self, app, raise_window=False): + return ActionResult(ok=True, action="focus_app") + + def set_value(self, value, element=None): + return ActionResult(ok=True, action="set_value") + + def wait(self, seconds=1.0): + return ActionResult(ok=True, action="wait") + + backend = NoContextBackend() + cu_tool.reset_backend_for_tests() + cu_tool._backend = backend + + cu_tool.handle_computer_use({"action": "click", "element": 5, "capture_after": True}) + + # No app context — should pass None so cua-driver picks the frontmost window + assert len(captured_app_args) == 1 + assert captured_app_args[0] is None diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index e611350d943..afeb04e1342 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -57,10 +57,18 @@ _WINDOW_LINE_RE = re.compile( re.MULTILINE, ) -# Regex to parse element lines from get_window_state AX tree markdown: -# " - [N] AXRole "label"" +# Regex to parse element lines from get_window_state AX tree markdown. +# +# Handles two output formats from different cua-driver versions: +# Classic: " - [N] AXRole \"label\"" +# New: "[N] AXRole (order) id=Label" +# +# Group 1: element index +# Group 2: AX role +# Group 3: quoted label (classic format) +# Group 4: id= label (new format) _ELEMENT_LINE_RE = re.compile( - r'^\s*-\s+\[(\d+)\]\s+(\w+)(?:\s+"([^"]*)")?', + r'^\s*(?:-\s+)?\[(\d+)\]\s+(\w+)(?:\s+"([^"]*)"|(?:\s+\(\d+\))?\s+id=([^\s\[\]]*))?' , re.MULTILINE, ) @@ -107,13 +115,19 @@ def _parse_windows_from_text(text: str) -> List[Dict[str, Any]]: def _parse_elements_from_tree(markdown: str) -> List[UIElement]: - """Parse UIElement list from get_window_state AX tree markdown.""" + """Parse UIElement list from get_window_state AX tree markdown. + + Handles both the classic ``"label"``-quoted format and the newer + ``id=Label`` format introduced in cua-driver v0.1.6. + """ elements = [] for m in _ELEMENT_LINE_RE.finditer(markdown): + # group(3) = quoted label (classic); group(4) = id= label (new) + label = m.group(3) or m.group(4) or "" elements.append(UIElement( index=int(m.group(1)), role=m.group(2), - label=m.group(3) or "", + label=label, bounds=(0, 0, 0, 0), )) return elements @@ -325,6 +339,7 @@ class CuaDriverBackend(ComputerUseBackend): # Sticky context — updated by capture(), used by action tools. self._active_pid: Optional[int] = None self._active_window_id: Optional[int] = None + self._last_app: Optional[str] = None # last app name targeted via capture/focus_app # ── Lifecycle ────────────────────────────────────────────────── def start(self) -> None: @@ -389,6 +404,10 @@ class CuaDriverBackend(ComputerUseBackend): self._active_pid = target["pid"] self._active_window_id = target["window_id"] app_name = target["app_name"] + # Record the resolved app name so capture_after= follow-ups can re-target + # the same app rather than falling back to the frontmost window. + if app or not self._last_app: + self._last_app = app_name # Step 2: capture. png_b64: Optional[str] = None @@ -643,6 +662,7 @@ class CuaDriverBackend(ComputerUseBackend): if target: self._active_pid = target["pid"] self._active_window_id = target["window_id"] + self._last_app = target["app_name"] # preserve for capture_after= follow-ups return ActionResult( ok=True, action="focus_app", message=f"Targeted {target['app_name']} (pid {self._active_pid}, " diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py index 7d2bc0cfca2..af7193e177b 100644 --- a/tools/computer_use/tool.py +++ b/tools/computer_use/tool.py @@ -463,7 +463,11 @@ def _maybe_follow_capture( if not do_capture: return _text_response(res) try: - cap = backend.capture(mode="som") + # Preserve the app context established by the preceding capture/focus_app so + # that capture_after=True re-captures the same app rather than the frontmost + # window (which may have changed if the action caused a focus shift). + last_app = getattr(backend, "_last_app", None) + cap = backend.capture(mode="som", app=last_app) except Exception as e: logger.warning("follow-up capture failed: %s", e) return _text_response(res)