fix(computer_use): preserve app context for capture_after; fix element label parsing (#24170 bugs 2 & 5)

Bug 2 (capture_after=True loses app context):
_maybe_follow_capture called backend.capture(mode='som') with no app=,
causing cua-driver to capture the frontmost window instead of the app
targeted by the preceding capture/focus_app. Fix: track _last_app on
CuaDriverBackend and thread it through the follow-up capture call so
the same app is re-captured regardless of which window has OS focus.

Bug 5 (element labels stripped in capture results):
_ELEMENT_LINE_RE matched the classic '  - [N] AXRole "label"' format
but not the '[N] AXRole (order) id=Label' format introduced in
cua-driver v0.1.6. All element labels were silently dropped as empty
strings, making element identification impossible.

Fix: extend regex to capture both group(3) (quoted label) and group(4)
(id= label), and update _parse_elements_from_tree to use group(4) as
fallback. Both old and new cua-driver output now produce populated
UIElement.label values.

focus_app() now also sets _last_app so that capture_after= on any
subsequent action re-targets the focused app.

5 new regression tests added.

Part of #24170 (bugs 1 and 3/4 addressed separately).
This commit is contained in:
Bartok9 2026-05-12 03:36:38 -04:00 committed by Teknium
parent 3fde8c153d
commit 4cc18877c6
3 changed files with 226 additions and 6 deletions

View file

@ -57,10 +57,18 @@ _WINDOW_LINE_RE = re.compile(
re.MULTILINE,
)
# Regex to parse element lines from get_window_state AX tree markdown:
# " - [N] AXRole "label""
# Regex to parse element lines from get_window_state AX tree markdown.
#
# Handles two output formats from different cua-driver versions:
# Classic: " - [N] AXRole \"label\""
# New: "[N] AXRole (order) id=Label"
#
# Group 1: element index
# Group 2: AX role
# Group 3: quoted label (classic format)
# Group 4: id= label (new format)
_ELEMENT_LINE_RE = re.compile(
r'^\s*-\s+\[(\d+)\]\s+(\w+)(?:\s+"([^"]*)")?',
r'^\s*(?:-\s+)?\[(\d+)\]\s+(\w+)(?:\s+"([^"]*)"|(?:\s+\(\d+\))?\s+id=([^\s\[\]]*))?' ,
re.MULTILINE,
)
@ -107,13 +115,19 @@ def _parse_windows_from_text(text: str) -> List[Dict[str, Any]]:
def _parse_elements_from_tree(markdown: str) -> List[UIElement]:
"""Parse UIElement list from get_window_state AX tree markdown."""
"""Parse UIElement list from get_window_state AX tree markdown.
Handles both the classic ``"label"``-quoted format and the newer
``id=Label`` format introduced in cua-driver v0.1.6.
"""
elements = []
for m in _ELEMENT_LINE_RE.finditer(markdown):
# group(3) = quoted label (classic); group(4) = id= label (new)
label = m.group(3) or m.group(4) or ""
elements.append(UIElement(
index=int(m.group(1)),
role=m.group(2),
label=m.group(3) or "",
label=label,
bounds=(0, 0, 0, 0),
))
return elements
@ -325,6 +339,7 @@ class CuaDriverBackend(ComputerUseBackend):
# Sticky context — updated by capture(), used by action tools.
self._active_pid: Optional[int] = None
self._active_window_id: Optional[int] = None
self._last_app: Optional[str] = None # last app name targeted via capture/focus_app
# ── Lifecycle ──────────────────────────────────────────────────
def start(self) -> None:
@ -389,6 +404,10 @@ class CuaDriverBackend(ComputerUseBackend):
self._active_pid = target["pid"]
self._active_window_id = target["window_id"]
app_name = target["app_name"]
# Record the resolved app name so capture_after= follow-ups can re-target
# the same app rather than falling back to the frontmost window.
if app or not self._last_app:
self._last_app = app_name
# Step 2: capture.
png_b64: Optional[str] = None
@ -643,6 +662,7 @@ class CuaDriverBackend(ComputerUseBackend):
if target:
self._active_pid = target["pid"]
self._active_window_id = target["window_id"]
self._last_app = target["app_name"] # preserve for capture_after= follow-ups
return ActionResult(
ok=True, action="focus_app",
message=f"Targeted {target['app_name']} (pid {self._active_pid}, "

View file

@ -463,7 +463,11 @@ def _maybe_follow_capture(
if not do_capture:
return _text_response(res)
try:
cap = backend.capture(mode="som")
# Preserve the app context established by the preceding capture/focus_app so
# that capture_after=True re-captures the same app rather than the frontmost
# window (which may have changed if the action caused a focus shift).
last_app = getattr(backend, "_last_app", None)
cap = backend.capture(mode="som", app=last_app)
except Exception as e:
logger.warning("follow-up capture failed: %s", e)
return _text_response(res)