mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(computer_use): correct type_text MCP tool name and implement drag action
Bug 3: The cua_backend type_text() method called the MCP tool 'type_text_chars', which does not exist in the current cua-driver. It now calls 'type_text', which is the correct MCP tool name. Bug 4: The drag() method returned a hardcoded 'not supported' error even though cua-driver exposes a 'drag' MCP tool. Implemented proper drag dispatching with coordinate-based and element-based targeting. Added dispatch-level validation for drag to ensure that from/to coordinates or elements are provided before any backend is called. Fixes #24170 (bugs 3 and 4)
This commit is contained in:
parent
0ce12a9241
commit
18cd1e5c72
3 changed files with 76 additions and 7 deletions
|
|
@ -155,6 +155,56 @@ class TestDispatch:
|
|||
click_kw = next(c[1] for c in noop_backend.calls if c[0] == "click")
|
||||
assert click_kw["button"] == "right"
|
||||
|
||||
def test_type_action_routes_to_type_text_backend(self, noop_backend):
|
||||
"""type action must call backend.type_text, not type_text_chars (issue #24170, bug 3)."""
|
||||
from tools.computer_use.tool import handle_computer_use
|
||||
out = handle_computer_use({"action": "type", "text": "hello"})
|
||||
parsed = json.loads(out)
|
||||
assert "error" not in parsed
|
||||
call_names = [c[0] for c in noop_backend.calls]
|
||||
assert "type" in call_names
|
||||
type_kw = next(c[1] for c in noop_backend.calls if c[0] == "type")
|
||||
assert type_kw["text"] == "hello"
|
||||
|
||||
def test_drag_action_routes_to_backend_by_coordinate(self, noop_backend):
|
||||
"""drag action must dispatch to backend.drag with coordinates (issue #24170, bug 4)."""
|
||||
from tools.computer_use.tool import handle_computer_use
|
||||
out = handle_computer_use({
|
||||
"action": "drag",
|
||||
"from_coordinate": [100, 200],
|
||||
"to_coordinate": [400, 500],
|
||||
})
|
||||
parsed = json.loads(out)
|
||||
assert "error" not in parsed
|
||||
call_names = [c[0] for c in noop_backend.calls]
|
||||
assert "drag" in call_names
|
||||
drag_kw = next(c[1] for c in noop_backend.calls if c[0] == "drag")
|
||||
assert drag_kw["from_xy"] == (100, 200)
|
||||
assert drag_kw["to_xy"] == (400, 500)
|
||||
|
||||
def test_drag_action_routes_to_backend_by_element(self, noop_backend):
|
||||
"""drag action must dispatch to backend.drag with element indices (issue #24170, bug 4)."""
|
||||
from tools.computer_use.tool import handle_computer_use
|
||||
out = handle_computer_use({
|
||||
"action": "drag",
|
||||
"from_element": 1,
|
||||
"to_element": 5,
|
||||
})
|
||||
parsed = json.loads(out)
|
||||
assert "error" not in parsed
|
||||
call_names = [c[0] for c in noop_backend.calls]
|
||||
assert "drag" in call_names
|
||||
drag_kw = next(c[1] for c in noop_backend.calls if c[0] == "drag")
|
||||
assert drag_kw["from_element"] == 1
|
||||
assert drag_kw["to_element"] == 5
|
||||
|
||||
def test_drag_action_requires_coordinates_or_elements(self, noop_backend):
|
||||
"""drag without from/to must return an error."""
|
||||
from tools.computer_use.tool import handle_computer_use
|
||||
out = handle_computer_use({"action": "drag"})
|
||||
parsed = json.loads(out)
|
||||
assert "error" in parsed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Safety guards (type / key block lists)
|
||||
|
|
|
|||
|
|
@ -497,9 +497,25 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
button: str = "left",
|
||||
modifiers: Optional[List[str]] = None,
|
||||
) -> ActionResult:
|
||||
# cua-driver does not expose a drag tool.
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="drag is not supported by the cua-driver backend.")
|
||||
pid = self._active_pid
|
||||
if pid is None:
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="No active window — call capture() first.")
|
||||
args: Dict[str, Any] = {"pid": pid}
|
||||
if from_element is not None and to_element is not None:
|
||||
if self._active_window_id is None:
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="No active window_id for element-based drag.")
|
||||
args["from_element"] = from_element
|
||||
args["to_element"] = to_element
|
||||
args["window_id"] = self._active_window_id
|
||||
elif from_xy is not None and to_xy is not None:
|
||||
args["from_x"], args["from_y"] = int(from_xy[0]), int(from_xy[1])
|
||||
args["to_x"], args["to_y"] = int(to_xy[0]), int(to_xy[1])
|
||||
else:
|
||||
return ActionResult(ok=False, action="drag",
|
||||
message="drag requires from_element/to_element or from_coordinate/to_coordinate.")
|
||||
return self._action("drag", args)
|
||||
|
||||
def scroll(
|
||||
self,
|
||||
|
|
@ -534,10 +550,7 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
if pid is None:
|
||||
return ActionResult(ok=False, action="type_text",
|
||||
message="No active window — call capture() first.")
|
||||
# Safari WebKit AXTextField does not accept AX attribute writes (type_text),
|
||||
# so use type_text_chars which synthesises individual key events instead.
|
||||
# This works universally across all macOS apps in background mode.
|
||||
return self._action("type_text_chars", {"pid": pid, "text": text})
|
||||
return self._action("type_text", {"pid": pid, "text": text})
|
||||
|
||||
def key(self, keys: str) -> ActionResult:
|
||||
pid = self._active_pid
|
||||
|
|
|
|||
|
|
@ -357,6 +357,12 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) ->
|
|||
return _maybe_follow_capture(backend, res, capture_after)
|
||||
|
||||
if action == "drag":
|
||||
has_elements = args.get("from_element") is not None and args.get("to_element") is not None
|
||||
has_coords = args.get("from_coordinate") and args.get("to_coordinate")
|
||||
if not has_elements and not has_coords:
|
||||
return json.dumps({
|
||||
"error": "drag requires from_coordinate/to_coordinate or from_element/to_element",
|
||||
})
|
||||
res = backend.drag(
|
||||
from_element=args.get("from_element"),
|
||||
to_element=args.get("to_element"),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue