mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(computer_use): preserve app context for capture_after; fix element label parsing (#24170 bugs 2 & 5)
Bug 2 (capture_after=True loses app context): _maybe_follow_capture called backend.capture(mode='som') with no app=, causing cua-driver to capture the frontmost window instead of the app targeted by the preceding capture/focus_app. Fix: track _last_app on CuaDriverBackend and thread it through the follow-up capture call so the same app is re-captured regardless of which window has OS focus. Bug 5 (element labels stripped in capture results): _ELEMENT_LINE_RE matched the classic ' - [N] AXRole "label"' format but not the '[N] AXRole (order) id=Label' format introduced in cua-driver v0.1.6. All element labels were silently dropped as empty strings, making element identification impossible. Fix: extend regex to capture both group(3) (quoted label) and group(4) (id= label), and update _parse_elements_from_tree to use group(4) as fallback. Both old and new cua-driver output now produce populated UIElement.label values. focus_app() now also sets _last_app so that capture_after= on any subsequent action re-targets the focused app. 5 new regression tests added. Part of #24170 (bugs 1 and 3/4 addressed separately).
This commit is contained in:
parent
3fde8c153d
commit
4cc18877c6
3 changed files with 226 additions and 6 deletions
|
|
@ -729,3 +729,199 @@ class TestUniversality:
|
|||
source = inspect.getsource(entry.check_fn)
|
||||
assert "anthropic" not in source.lower()
|
||||
assert "openai" not in source.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regression tests for bugs 2 & 5 from issue #24170 (cua-driver v0.1.6)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestElementLabelParsing:
|
||||
"""Bug 5: element labels stripped in capture results (cua-driver v0.1.6 format).
|
||||
|
||||
cua-driver ≥0.1.6 emits ``[N] AXRole (order) id=Label`` instead of
|
||||
`` - [N] AXRole "label"``. _parse_elements_from_tree must handle both.
|
||||
"""
|
||||
|
||||
def test_classic_quoted_label_format(self):
|
||||
from tools.computer_use.cua_backend import _parse_elements_from_tree
|
||||
tree = (
|
||||
' - [14] AXButton "One"\n'
|
||||
' - [15] AXButton "Two"\n'
|
||||
' - [16] AXTextField ""\n'
|
||||
)
|
||||
els = _parse_elements_from_tree(tree)
|
||||
assert len(els) == 3
|
||||
assert els[0].index == 14
|
||||
assert els[0].role == "AXButton"
|
||||
assert els[0].label == "One"
|
||||
assert els[1].label == "Two"
|
||||
assert els[2].label == "" # empty quoted label
|
||||
|
||||
def test_new_id_eq_format(self):
|
||||
"""cua-driver v0.1.6 format: [N] AXRole (order) id=Label"""
|
||||
from tools.computer_use.cua_backend import _parse_elements_from_tree
|
||||
tree = (
|
||||
"[14] AXButton (1) id=One\n"
|
||||
"[15] AXButton (2) id=Two\n"
|
||||
"[16] AXTextField (3) id=\n"
|
||||
)
|
||||
els = _parse_elements_from_tree(tree)
|
||||
assert len(els) == 3
|
||||
assert els[0].index == 14
|
||||
assert els[0].role == "AXButton"
|
||||
assert els[0].label == "One"
|
||||
assert els[1].label == "Two"
|
||||
assert els[2].label == "" # empty id= value
|
||||
|
||||
def test_mixed_formats_in_single_tree(self):
|
||||
"""Gracefully handles trees that mix old and new line formats."""
|
||||
from tools.computer_use.cua_backend import _parse_elements_from_tree
|
||||
tree = (
|
||||
' - [1] AXWindow "Main Window"\n'
|
||||
"[14] AXButton (1) id=One\n"
|
||||
' - [15] AXTextField "Search"\n'
|
||||
)
|
||||
els = _parse_elements_from_tree(tree)
|
||||
assert len(els) == 3
|
||||
labels = {e.index: e.label for e in els}
|
||||
assert labels[1] == "Main Window"
|
||||
assert labels[14] == "One"
|
||||
assert labels[15] == "Search"
|
||||
|
||||
|
||||
class TestCaptureAfterAppContext:
|
||||
"""Bug 2: capture_after=True loses app context after actions.
|
||||
|
||||
_maybe_follow_capture must re-target the same app that was set by
|
||||
the preceding capture/focus_app call, rather than the frontmost window.
|
||||
"""
|
||||
|
||||
def test_capture_after_uses_last_app(self):
|
||||
"""capture_after=True should pass _last_app to the follow-up capture."""
|
||||
from tools.computer_use.backend import ActionResult, CaptureResult
|
||||
from tools.computer_use import tool as cu_tool
|
||||
|
||||
captured_app_args = []
|
||||
|
||||
class TrackingBackend:
|
||||
_last_app = "Calculator" # simulates a previous focus_app call
|
||||
|
||||
def start(self):
|
||||
pass
|
||||
|
||||
def stop(self):
|
||||
pass
|
||||
|
||||
def is_available(self):
|
||||
return True
|
||||
|
||||
def capture(self, mode="som", app=None):
|
||||
captured_app_args.append(app)
|
||||
return CaptureResult(
|
||||
mode=mode, width=100, height=100,
|
||||
png_b64=None, elements=[],
|
||||
app=app or "Calculator", window_title="",
|
||||
)
|
||||
|
||||
def click(self, **kw):
|
||||
return ActionResult(ok=True, action="click")
|
||||
|
||||
def drag(self, **kw):
|
||||
return ActionResult(ok=True, action="drag")
|
||||
|
||||
def scroll(self, **kw):
|
||||
return ActionResult(ok=True, action="scroll")
|
||||
|
||||
def type_text(self, text):
|
||||
return ActionResult(ok=True, action="type")
|
||||
|
||||
def key(self, keys):
|
||||
return ActionResult(ok=True, action="key")
|
||||
|
||||
def list_apps(self):
|
||||
return []
|
||||
|
||||
def focus_app(self, app, raise_window=False):
|
||||
return ActionResult(ok=True, action="focus_app")
|
||||
|
||||
def set_value(self, value, element=None):
|
||||
return ActionResult(ok=True, action="set_value")
|
||||
|
||||
def wait(self, seconds=1.0):
|
||||
return ActionResult(ok=True, action="wait")
|
||||
|
||||
backend = TrackingBackend()
|
||||
cu_tool.reset_backend_for_tests()
|
||||
cu_tool._backend = backend
|
||||
|
||||
cu_tool.handle_computer_use({"action": "click", "element": 14, "capture_after": True})
|
||||
|
||||
# The follow-up capture must have been called with app="Calculator"
|
||||
assert len(captured_app_args) == 1
|
||||
assert captured_app_args[0] == "Calculator", (
|
||||
f"Expected follow-up capture with app='Calculator', got {captured_app_args[0]!r}"
|
||||
)
|
||||
|
||||
def test_capture_after_without_prior_app_uses_none(self):
|
||||
"""When no app context is set, follow-up capture uses app=None (frontmost)."""
|
||||
from tools.computer_use.backend import ActionResult, CaptureResult
|
||||
from tools.computer_use import tool as cu_tool
|
||||
|
||||
captured_app_args = []
|
||||
|
||||
class NoContextBackend:
|
||||
_last_app = None # no prior context
|
||||
|
||||
def start(self):
|
||||
pass
|
||||
|
||||
def stop(self):
|
||||
pass
|
||||
|
||||
def is_available(self):
|
||||
return True
|
||||
|
||||
def capture(self, mode="som", app=None):
|
||||
captured_app_args.append(app)
|
||||
return CaptureResult(
|
||||
mode=mode, width=100, height=100,
|
||||
png_b64=None, elements=[],
|
||||
app="Finder", window_title="",
|
||||
)
|
||||
|
||||
def click(self, **kw):
|
||||
return ActionResult(ok=True, action="click")
|
||||
|
||||
def drag(self, **kw):
|
||||
return ActionResult(ok=True, action="drag")
|
||||
|
||||
def scroll(self, **kw):
|
||||
return ActionResult(ok=True, action="scroll")
|
||||
|
||||
def type_text(self, text):
|
||||
return ActionResult(ok=True, action="type")
|
||||
|
||||
def key(self, keys):
|
||||
return ActionResult(ok=True, action="key")
|
||||
|
||||
def list_apps(self):
|
||||
return []
|
||||
|
||||
def focus_app(self, app, raise_window=False):
|
||||
return ActionResult(ok=True, action="focus_app")
|
||||
|
||||
def set_value(self, value, element=None):
|
||||
return ActionResult(ok=True, action="set_value")
|
||||
|
||||
def wait(self, seconds=1.0):
|
||||
return ActionResult(ok=True, action="wait")
|
||||
|
||||
backend = NoContextBackend()
|
||||
cu_tool.reset_backend_for_tests()
|
||||
cu_tool._backend = backend
|
||||
|
||||
cu_tool.handle_computer_use({"action": "click", "element": 5, "capture_after": True})
|
||||
|
||||
# No app context — should pass None so cua-driver picks the frontmost window
|
||||
assert len(captured_app_args) == 1
|
||||
assert captured_app_args[0] is None
|
||||
|
|
|
|||
|
|
@ -57,10 +57,18 @@ _WINDOW_LINE_RE = re.compile(
|
|||
re.MULTILINE,
|
||||
)
|
||||
|
||||
# Regex to parse element lines from get_window_state AX tree markdown:
|
||||
# " - [N] AXRole "label""
|
||||
# Regex to parse element lines from get_window_state AX tree markdown.
|
||||
#
|
||||
# Handles two output formats from different cua-driver versions:
|
||||
# Classic: " - [N] AXRole \"label\""
|
||||
# New: "[N] AXRole (order) id=Label"
|
||||
#
|
||||
# Group 1: element index
|
||||
# Group 2: AX role
|
||||
# Group 3: quoted label (classic format)
|
||||
# Group 4: id= label (new format)
|
||||
_ELEMENT_LINE_RE = re.compile(
|
||||
r'^\s*-\s+\[(\d+)\]\s+(\w+)(?:\s+"([^"]*)")?',
|
||||
r'^\s*(?:-\s+)?\[(\d+)\]\s+(\w+)(?:\s+"([^"]*)"|(?:\s+\(\d+\))?\s+id=([^\s\[\]]*))?' ,
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
|
|
@ -107,13 +115,19 @@ def _parse_windows_from_text(text: str) -> List[Dict[str, Any]]:
|
|||
|
||||
|
||||
def _parse_elements_from_tree(markdown: str) -> List[UIElement]:
|
||||
"""Parse UIElement list from get_window_state AX tree markdown."""
|
||||
"""Parse UIElement list from get_window_state AX tree markdown.
|
||||
|
||||
Handles both the classic ``"label"``-quoted format and the newer
|
||||
``id=Label`` format introduced in cua-driver v0.1.6.
|
||||
"""
|
||||
elements = []
|
||||
for m in _ELEMENT_LINE_RE.finditer(markdown):
|
||||
# group(3) = quoted label (classic); group(4) = id= label (new)
|
||||
label = m.group(3) or m.group(4) or ""
|
||||
elements.append(UIElement(
|
||||
index=int(m.group(1)),
|
||||
role=m.group(2),
|
||||
label=m.group(3) or "",
|
||||
label=label,
|
||||
bounds=(0, 0, 0, 0),
|
||||
))
|
||||
return elements
|
||||
|
|
@ -325,6 +339,7 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
# Sticky context — updated by capture(), used by action tools.
|
||||
self._active_pid: Optional[int] = None
|
||||
self._active_window_id: Optional[int] = None
|
||||
self._last_app: Optional[str] = None # last app name targeted via capture/focus_app
|
||||
|
||||
# ── Lifecycle ──────────────────────────────────────────────────
|
||||
def start(self) -> None:
|
||||
|
|
@ -389,6 +404,10 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
self._active_pid = target["pid"]
|
||||
self._active_window_id = target["window_id"]
|
||||
app_name = target["app_name"]
|
||||
# Record the resolved app name so capture_after= follow-ups can re-target
|
||||
# the same app rather than falling back to the frontmost window.
|
||||
if app or not self._last_app:
|
||||
self._last_app = app_name
|
||||
|
||||
# Step 2: capture.
|
||||
png_b64: Optional[str] = None
|
||||
|
|
@ -643,6 +662,7 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
if target:
|
||||
self._active_pid = target["pid"]
|
||||
self._active_window_id = target["window_id"]
|
||||
self._last_app = target["app_name"] # preserve for capture_after= follow-ups
|
||||
return ActionResult(
|
||||
ok=True, action="focus_app",
|
||||
message=f"Targeted {target['app_name']} (pid {self._active_pid}, "
|
||||
|
|
|
|||
|
|
@ -463,7 +463,11 @@ def _maybe_follow_capture(
|
|||
if not do_capture:
|
||||
return _text_response(res)
|
||||
try:
|
||||
cap = backend.capture(mode="som")
|
||||
# Preserve the app context established by the preceding capture/focus_app so
|
||||
# that capture_after=True re-captures the same app rather than the frontmost
|
||||
# window (which may have changed if the action caused a focus shift).
|
||||
last_app = getattr(backend, "_last_app", None)
|
||||
cap = backend.capture(mode="som", app=last_app)
|
||||
except Exception as e:
|
||||
logger.warning("follow-up capture failed: %s", e)
|
||||
return _text_response(res)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue