From 18cd1e5c728ddf93a854ac9818f527013a9f6daf Mon Sep 17 00:00:00 2001 From: liuhao1024 Date: Tue, 12 May 2026 13:02:28 +0800 Subject: [PATCH] fix(computer_use): correct type_text MCP tool name and implement drag action Bug 3: The cua_backend type_text() method called MCP tool 'type_text_chars' which does not exist in current cua-driver. Changed to 'type_text' which is the correct MCP tool name. Bug 4: The drag() method returned a hardcoded 'not supported' error even though cua-driver exposes a 'drag' MCP tool. Implemented proper drag dispatching with coordinate-based and element-based targeting. Added dispatch-level validation for drag to ensure from/to coordinates or elements are provided before calling any backend. Fixes #24170 (bugs 3 and 4) --- tests/tools/test_computer_use.py | 50 +++++++++++++++++++++++++++++++ tools/computer_use/cua_backend.py | 27 ++++++++++++----- tools/computer_use/tool.py | 6 ++++ 3 files changed, 76 insertions(+), 7 deletions(-) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index 5b035950348..4a108d1ce51 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -155,6 +155,56 @@ class TestDispatch: click_kw = next(c[1] for c in noop_backend.calls if c[0] == "click") assert click_kw["button"] == "right" + def test_type_action_routes_to_type_text_backend(self, noop_backend): + """type action must call backend.type_text, not type_text_chars (issue #24170, bug 3).""" + from tools.computer_use.tool import handle_computer_use + out = handle_computer_use({"action": "type", "text": "hello"}) + parsed = json.loads(out) + assert "error" not in parsed + call_names = [c[0] for c in noop_backend.calls] + assert "type" in call_names + type_kw = next(c[1] for c in noop_backend.calls if c[0] == "type") + assert type_kw["text"] == "hello" + + def test_drag_action_routes_to_backend_by_coordinate(self, noop_backend): + """drag action must dispatch to backend.drag with coordinates (issue #24170, bug 4).""" + from tools.computer_use.tool import handle_computer_use + out = handle_computer_use({ + "action": "drag", + "from_coordinate": [100, 200], + "to_coordinate": [400, 500], + }) + parsed = json.loads(out) + assert "error" not in parsed + call_names = [c[0] for c in noop_backend.calls] + assert "drag" in call_names + drag_kw = next(c[1] for c in noop_backend.calls if c[0] == "drag") + assert drag_kw["from_xy"] == (100, 200) + assert drag_kw["to_xy"] == (400, 500) + + def test_drag_action_routes_to_backend_by_element(self, noop_backend): + """drag action must dispatch to backend.drag with element indices (issue #24170, bug 4).""" + from tools.computer_use.tool import handle_computer_use + out = handle_computer_use({ + "action": "drag", + "from_element": 1, + "to_element": 5, + }) + parsed = json.loads(out) + assert "error" not in parsed + call_names = [c[0] for c in noop_backend.calls] + assert "drag" in call_names + drag_kw = next(c[1] for c in noop_backend.calls if c[0] == "drag") + assert drag_kw["from_element"] == 1 + assert drag_kw["to_element"] == 5 + + def test_drag_action_requires_coordinates_or_elements(self, noop_backend): + """drag without from/to must return an error.""" + from tools.computer_use.tool import handle_computer_use + out = handle_computer_use({"action": "drag"}) + parsed = json.loads(out) + assert "error" in parsed + # --------------------------------------------------------------------------- # Safety guards (type / key block lists) diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index 96aab60f8c7..e611350d943 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -497,9 +497,25 @@ class CuaDriverBackend(ComputerUseBackend): button: str = "left", modifiers: Optional[List[str]] = None, ) -> ActionResult: - # cua-driver does not expose a drag tool. - return ActionResult(ok=False, action="drag", - message="drag is not supported by the cua-driver backend.") + pid = self._active_pid + if pid is None: + return ActionResult(ok=False, action="drag", + message="No active window — call capture() first.") + args: Dict[str, Any] = {"pid": pid} + if from_element is not None and to_element is not None: + if self._active_window_id is None: + return ActionResult(ok=False, action="drag", + message="No active window_id for element-based drag.") + args["from_element"] = from_element + args["to_element"] = to_element + args["window_id"] = self._active_window_id + elif from_xy is not None and to_xy is not None: + args["from_x"], args["from_y"] = int(from_xy[0]), int(from_xy[1]) + args["to_x"], args["to_y"] = int(to_xy[0]), int(to_xy[1]) + else: + return ActionResult(ok=False, action="drag", + message="drag requires from_element/to_element or from_coordinate/to_coordinate.") + return self._action("drag", args) def scroll( self, @@ -534,10 +550,7 @@ class CuaDriverBackend(ComputerUseBackend): if pid is None: return ActionResult(ok=False, action="type_text", message="No active window — call capture() first.") - # Safari WebKit AXTextField does not accept AX attribute writes (type_text), - # so use type_text_chars which synthesises individual key events instead. - # This works universally across all macOS apps in background mode. - return self._action("type_text_chars", {"pid": pid, "text": text}) + return self._action("type_text", {"pid": pid, "text": text}) def key(self, keys: str) -> ActionResult: pid = self._active_pid diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py index 63a5076c171..7d2bc0cfca2 100644 --- a/tools/computer_use/tool.py +++ b/tools/computer_use/tool.py @@ -357,6 +357,12 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) -> return _maybe_follow_capture(backend, res, capture_after) if action == "drag": + has_elements = args.get("from_element") is not None and args.get("to_element") is not None + has_coords = args.get("from_coordinate") and args.get("to_coordinate") + if not has_elements and not has_coords: + return json.dumps({ + "error": "drag requires from_coordinate/to_coordinate or from_element/to_element", + }) res = backend.drag( from_element=args.get("from_element"), to_element=args.get("to_element"),