fix(computer_use): correct type_text MCP tool name and implement drag action

Bug 3: The cua_backend type_text() method called the MCP tool
'type_text_chars', which does not exist in the current cua-driver. Changed it
to 'type_text', which is the correct MCP tool name.

Bug 4: The drag() method returned a hardcoded 'not supported' error even
though cua-driver exposes a 'drag' MCP tool. Implemented proper drag
dispatching with coordinate-based and element-based targeting.

Added dispatch-level validation for drag to ensure from/to coordinates
or elements are provided before calling any backend.

Fixes #24170 (bugs 3 and 4)
This commit is contained in:
liuhao1024 2026-05-12 13:02:28 +08:00 committed by Teknium
parent 0ce12a9241
commit 18cd1e5c72
3 changed files with 76 additions and 7 deletions

View file

@ -155,6 +155,56 @@ class TestDispatch:
click_kw = next(c[1] for c in noop_backend.calls if c[0] == "click")
assert click_kw["button"] == "right"
def test_type_action_routes_to_type_text_backend(self, noop_backend):
    """A ``type`` action is dispatched to the backend with its text intact.

    Regression guard for issue #24170 (bug 3). The check is made against the
    calls recorded by the ``noop_backend`` fixture under the name ``"type"``.
    NOTE(review): this does not by itself inspect which MCP tool name a real
    backend sends -- confirm that is covered elsewhere.
    """
    from tools.computer_use.tool import handle_computer_use

    response = json.loads(handle_computer_use({"action": "type", "text": "hello"}))
    assert "error" not in response

    # Each recorded call is indexed as (name, kwargs).
    recorded_names = [entry[0] for entry in noop_backend.calls]
    assert "type" in recorded_names

    type_kwargs = next(
        entry[1] for entry in noop_backend.calls if entry[0] == "type"
    )
    assert type_kwargs["text"] == "hello"
def test_drag_action_routes_to_backend_by_coordinate(self, noop_backend):
    """A coordinate-based ``drag`` action reaches ``backend.drag``.

    Regression guard for issue #24170 (bug 4): the ``from_coordinate`` /
    ``to_coordinate`` lists in the request must arrive at the backend as the
    ``from_xy`` / ``to_xy`` keyword arguments, compared here as tuples.
    """
    from tools.computer_use.tool import handle_computer_use

    request = {
        "action": "drag",
        "from_coordinate": [100, 200],
        "to_coordinate": [400, 500],
    }
    response = json.loads(handle_computer_use(request))
    assert "error" not in response

    # Each recorded call is indexed as (name, kwargs).
    recorded_names = [entry[0] for entry in noop_backend.calls]
    assert "drag" in recorded_names

    drag_kwargs = next(
        entry[1] for entry in noop_backend.calls if entry[0] == "drag"
    )
    assert drag_kwargs["from_xy"] == (100, 200)
    assert drag_kwargs["to_xy"] == (400, 500)
def test_drag_action_routes_to_backend_by_element(self, noop_backend):
    """An element-based ``drag`` action reaches ``backend.drag``.

    Regression guard for issue #24170 (bug 4): the ``from_element`` /
    ``to_element`` indices in the request must be forwarded unchanged as
    keyword arguments of the same names.
    """
    from tools.computer_use.tool import handle_computer_use

    request = {
        "action": "drag",
        "from_element": 1,
        "to_element": 5,
    }
    response = json.loads(handle_computer_use(request))
    assert "error" not in response

    # Each recorded call is indexed as (name, kwargs).
    recorded_names = [entry[0] for entry in noop_backend.calls]
    assert "drag" in recorded_names

    drag_kwargs = next(
        entry[1] for entry in noop_backend.calls if entry[0] == "drag"
    )
    assert drag_kwargs["from_element"] == 1
    assert drag_kwargs["to_element"] == 5
def test_drag_action_requires_coordinates_or_elements(self, noop_backend):
    """A ``drag`` request with no endpoints is rejected before any backend call.

    The dispatcher must return an ``error`` payload when neither
    ``from_coordinate``/``to_coordinate`` nor ``from_element``/``to_element``
    is supplied, and it must do so without invoking ``backend.drag``.
    """
    from tools.computer_use.tool import handle_computer_use

    response = json.loads(handle_computer_use({"action": "drag"}))
    assert "error" in response

    # Checking only for "error" would also pass if the backend itself produced
    # the failure; make sure the request never got that far.
    recorded_names = [entry[0] for entry in noop_backend.calls]
    assert "drag" not in recorded_names
# ---------------------------------------------------------------------------
# Safety guards (type / key block lists)

View file

@ -497,9 +497,25 @@ class CuaDriverBackend(ComputerUseBackend):
button: str = "left",
modifiers: Optional[List[str]] = None,
) -> ActionResult:
# cua-driver does not expose a drag tool.
return ActionResult(ok=False, action="drag",
message="drag is not supported by the cua-driver backend.")
pid = self._active_pid
if pid is None:
return ActionResult(ok=False, action="drag",
message="No active window — call capture() first.")
args: Dict[str, Any] = {"pid": pid}
if from_element is not None and to_element is not None:
if self._active_window_id is None:
return ActionResult(ok=False, action="drag",
message="No active window_id for element-based drag.")
args["from_element"] = from_element
args["to_element"] = to_element
args["window_id"] = self._active_window_id
elif from_xy is not None and to_xy is not None:
args["from_x"], args["from_y"] = int(from_xy[0]), int(from_xy[1])
args["to_x"], args["to_y"] = int(to_xy[0]), int(to_xy[1])
else:
return ActionResult(ok=False, action="drag",
message="drag requires from_element/to_element or from_coordinate/to_coordinate.")
return self._action("drag", args)
def scroll(
self,
@ -534,10 +550,7 @@ class CuaDriverBackend(ComputerUseBackend):
if pid is None:
return ActionResult(ok=False, action="type_text",
message="No active window — call capture() first.")
# Safari WebKit AXTextField does not accept AX attribute writes (type_text),
# so use type_text_chars which synthesises individual key events instead.
# This works universally across all macOS apps in background mode.
return self._action("type_text_chars", {"pid": pid, "text": text})
return self._action("type_text", {"pid": pid, "text": text})
def key(self, keys: str) -> ActionResult:
pid = self._active_pid

View file

@ -357,6 +357,12 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) ->
return _maybe_follow_capture(backend, res, capture_after)
if action == "drag":
has_elements = args.get("from_element") is not None and args.get("to_element") is not None
has_coords = args.get("from_coordinate") and args.get("to_coordinate")
if not has_elements and not has_coords:
return json.dumps({
"error": "drag requires from_coordinate/to_coordinate or from_element/to_element",
})
res = backend.drag(
from_element=args.get("from_element"),
to_element=args.get("to_element"),