mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Background macOS desktop control via cua-driver MCP — does NOT steal the user's cursor or keyboard focus, works with any tool-capable model.

Replaces the Anthropic-native `computer_20251124` approach from the abandoned #4562 with a generic OpenAI function-calling schema plus SOM (set-of-mark) captures so Claude, GPT, Gemini, and open models can all drive the desktop via numbered element indices.

## What this adds

- `tools/computer_use/` package — swappable ComputerUseBackend ABC + CuaDriverBackend (stdio MCP client to trycua/cua's cua-driver binary).
- Universal `computer_use` tool with one schema for all providers. Actions: capture (som/vision/ax), click, double_click, right_click, middle_click, drag, scroll, type, key, wait, list_apps, focus_app.
- Multimodal tool-result envelope (`_multimodal=True`, OpenAI-style `content: [text, image_url]` parts) that flows through handle_function_call into the tool message. Anthropic adapter converts into native `tool_result` image blocks; OpenAI-compatible providers get the parts list directly.
- Image eviction in convert_messages_to_anthropic: only the 3 most recent screenshots carry real image data; older ones become text placeholders to cap per-turn token cost.
- Context compressor image pruning: old multimodal tool results have their image parts stripped instead of being skipped.
- Image-aware token estimation: each image counts as a flat 1500 tokens instead of its base64 char length (~1MB would have registered as ~250K tokens before).
- COMPUTER_USE_GUIDANCE system-prompt block — injected when the toolset is active.
- Session DB persistence strips base64 from multimodal tool messages.
- Trajectory saver normalises multimodal messages to text-only.
- `hermes tools` post-setup installs cua-driver via the upstream script and prints permission-grant instructions.
- CLI approval callback wired so destructive computer_use actions go through the same prompt_toolkit approval dialog as terminal commands.
- Hard safety guards at the tool level: blocked type patterns (curl|bash, sudo rm -rf, fork bomb), blocked key combos (empty trash, force delete, lock screen, log out).
- Skill `apple/macos-computer-use/SKILL.md` — universal (model-agnostic) workflow guide.
- Docs: `user-guide/features/computer-use.md` plus reference catalog entries.

## Tests

44 new tests in tests/tools/test_computer_use.py covering schema shape (universal, not Anthropic-native), dispatch routing, safety guards, multimodal envelope, Anthropic adapter conversion, screenshot eviction, context compressor pruning, image-aware token estimation, run_agent helpers, and universality guarantees. 469/469 pass across tests/tools/test_computer_use.py + the affected agent/ test suites.

## Not in this PR

- `model_tools.py` provider-gating: the tool is available to every provider. Providers without multi-part tool message support will see text-only tool results (graceful degradation via `text_summary`).
- Anthropic server-side `clear_tool_uses_20250919` — deferred; client-side eviction + compressor pruning cover the same cost ceiling without a beta header.

## Caveats

- macOS only. cua-driver uses private SkyLight SPIs (SLEventPostToPid, SLPSPostEventRecordTo, _AXObserverAddNotificationAndCheckRemote) that can break on any macOS update. Pin with HERMES_CUA_DRIVER_VERSION.
- Requires Accessibility + Screen Recording permissions — the post-setup prints the Settings path.

Supersedes PR #4562 (pyautogui/Quartz foreground backend, Anthropic-native schema). Credit @0xbyt4 for the original #3816 groundwork whose context/eviction/token design is preserved here in generic form.
423 lines
16 KiB
Python
423 lines
16 KiB
Python
"""Cua-driver backend (macOS only).
|
|
|
|
Speaks MCP over stdio to `cua-driver`. The Python `mcp` SDK is async, so we
|
|
run a dedicated asyncio event loop on a background thread and marshal sync
|
|
calls through it.
|
|
|
|
Install: `/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh)"`
|
|
|
|
After install, `cua-driver` is on $PATH and supports `cua-driver mcp` (stdio
|
|
transport) which is what we invoke.
|
|
|
|
The private SkyLight SPIs cua-driver uses (SLEventPostToPid,
SLPSPostEventRecordTo, _AXObserverAddNotificationAndCheckRemote) are not
Apple-public and can break on OS updates. Pin the installed version via
`HERMES_CUA_DRIVER_VERSION` if you want reproducibility across an OS bump.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import base64
|
|
import json
|
|
import logging
|
|
import os
|
|
import platform
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import threading
|
|
from concurrent.futures import Future
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from tools.computer_use.backend import (
|
|
ActionResult,
|
|
CaptureResult,
|
|
ComputerUseBackend,
|
|
UIElement,
|
|
)
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Version pinning
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# cua-driver calls private SkyLight SPIs, so a known-good release is pinned
# here; that keeps an OS update from silently changing the surface area the
# agent depends on. Users on newer macOS releases may have to bump the pin
# and re-run `hermes tools` to pick up the updated binary.
PINNED_CUA_DRIVER_VERSION = os.getenv("HERMES_CUA_DRIVER_VERSION", "0.5.0")

# The cua-driver binary path can be overridden from the environment
# (mostly for tests / CI).
_CUA_DRIVER_CMD = os.getenv("HERMES_CUA_DRIVER_CMD", "cua-driver")
# Sub-command selecting the stdio MCP transport.
_CUA_DRIVER_ARGS = ["mcp"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _is_macos() -> bool:
    """Report whether ``sys.platform`` identifies the host as macOS ("darwin")."""
    current_platform = sys.platform
    return current_platform == "darwin"
|
|
|
|
|
|
def _is_arm_mac() -> bool:
    """Report whether the host is macOS and ``platform.machine()`` is "arm64"."""
    if not _is_macos():
        return False
    return platform.machine() == "arm64"
|
|
|
|
|
|
def cua_driver_binary_available() -> bool:
    """Return True when the configured cua-driver command can be resolved.

    Resolution is delegated to ``shutil.which`` on ``_CUA_DRIVER_CMD``, which
    is either the default ``cua-driver`` or the HERMES_CUA_DRIVER_CMD override.
    """
    resolved_path = shutil.which(_CUA_DRIVER_CMD)
    return bool(resolved_path)
|
|
|
|
|
|
def cua_driver_install_hint() -> str:
    """Return the multi-line message telling the user how to install cua-driver."""
    install_script_url = (
        "https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh"
    )
    hint_lines = [
        "cua-driver is not installed. Install with:",
        f' /bin/bash -c "$(curl -fsSL {install_script_url})"',
        "Or run `hermes tools` and enable the Computer Use toolset to install it automatically.",
    ]
    return "\n".join(hint_lines)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Asyncio bridge — one long-lived loop on a background thread
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class _AsyncBridge:
    """Runs one asyncio loop on a daemon thread; marshals coroutines from the caller.

    Synchronous callers hand a coroutine to :meth:`run`, which schedules it on
    the background loop with ``asyncio.run_coroutine_threadsafe`` and blocks
    until the result is available or the timeout elapses.
    """

    def __init__(self) -> None:
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._thread: Optional[threading.Thread] = None
        # Set by the loop thread once ``self._loop`` has been assigned.
        self._ready = threading.Event()

    def start(self) -> None:
        """Start the loop thread if it is not already alive.

        Raises:
            RuntimeError: if the loop thread does not signal readiness
                within 5 seconds.
        """
        if self._thread and self._thread.is_alive():
            return
        self._ready.clear()

        def _run() -> None:
            # Keep a local reference: stop() may reset ``self._loop`` to None
            # while this thread is still unwinding, and the loop must still
            # be closed in that case.
            loop = asyncio.new_event_loop()
            self._loop = loop
            asyncio.set_event_loop(loop)
            self._ready.set()
            try:
                loop.run_forever()
            finally:
                try:
                    loop.close()
                except Exception:
                    # Best-effort cleanup; record the failure instead of
                    # silently discarding it.
                    logging.getLogger(__name__).debug(
                        "cua-driver loop close failed", exc_info=True
                    )

        self._thread = threading.Thread(target=_run, daemon=True, name="cua-driver-loop")
        self._thread.start()
        if not self._ready.wait(timeout=5.0):
            raise RuntimeError("cua-driver asyncio bridge failed to start")

    def run(self, coro, timeout: Optional[float] = 30.0) -> Any:
        """Run *coro* on the background loop and return its result.

        Args:
            coro: coroutine object to schedule on the bridge's loop.
            timeout: seconds to wait for the result; ``None`` waits forever.

        Returns:
            Whatever the coroutine returns.

        Raises:
            RuntimeError: if the bridge has not been started (or its thread
                has died). The coroutine is closed first so it does not emit
                a "coroutine was never awaited" warning.
            concurrent.futures.TimeoutError: if the result is not ready in
                time. The scheduled task is cancelled so it does not keep
                running on the loop after the caller has given up.
            Exception: anything raised by the coroutine itself.
        """
        if not self._loop or not self._thread or not self._thread.is_alive():
            close = getattr(coro, "close", None)
            if callable(close):
                close()
            raise RuntimeError("cua-driver bridge not started")
        fut: Future = asyncio.run_coroutine_threadsafe(coro, self._loop)
        try:
            return fut.result(timeout=timeout)
        finally:
            # Reached with a pending future only on timeout or interruption;
            # cancelling it propagates the cancellation to the loop-side task.
            if not fut.done():
                fut.cancel()

    def stop(self) -> None:
        """Ask the loop to stop, wait up to 2 seconds for the thread, and reset state."""
        if self._loop and self._loop.is_running():
            self._loop.call_soon_threadsafe(self._loop.stop)
        if self._thread:
            self._thread.join(timeout=2.0)
        self._thread = None
        self._loop = None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# MCP session (lazy, shared across tool calls)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class _CuaDriverSession:
    """Holds the mcp ClientSession used to talk to cua-driver.

    The session is spawned lazily by the first :meth:`start` call and shared
    by every later :meth:`call_tool`. After :meth:`stop`, calling
    :meth:`start` again spawns a fresh session.
    """

    def __init__(self, bridge: _AsyncBridge) -> None:
        self._bridge = bridge
        self._session = None  # mcp.ClientSession
        self._exit_stack = None  # AsyncExitStack for stdio_client + ClientSession
        # Serialises start()/stop() so only one caller spawns or tears down.
        self._lock = threading.Lock()
        self._started = False

    def _require_started(self) -> None:
        """Raise RuntimeError unless start() has completed successfully."""
        if not self._started:
            raise RuntimeError("cua-driver session not started")

    async def _aenter(self) -> None:
        """Spawn ``cua-driver mcp`` over stdio and initialise the MCP session.

        Raises:
            RuntimeError: with the install hint when the binary cannot be
                resolved.
            Exception: anything raised while opening the transport or
                initialising the session; partially-entered contexts are
                closed before the error propagates.
        """
        # Imported lazily so this module can be imported without `mcp`.
        from contextlib import AsyncExitStack
        from mcp import ClientSession, StdioServerParameters
        from mcp.client.stdio import stdio_client

        if not cua_driver_binary_available():
            raise RuntimeError(cua_driver_install_hint())

        params = StdioServerParameters(
            command=_CUA_DRIVER_CMD,
            args=_CUA_DRIVER_ARGS,
            env={**os.environ},  # cua-driver needs HOME / TMPDIR
        )
        stack = AsyncExitStack()
        try:
            read, write = await stack.enter_async_context(stdio_client(params))
            session = await stack.enter_async_context(ClientSession(read, write))
            await session.initialize()
        except BaseException:
            # Unwind whatever was entered so far (including the spawned
            # subprocess transport) instead of leaking it; this also covers
            # cancellation of the start-up task.
            await stack.aclose()
            raise
        self._exit_stack = stack
        self._session = session

    async def _aexit(self) -> None:
        """Close the exit stack (if any) and forget the session.

        Shutdown errors are logged as warnings and not re-raised.
        """
        # NOTE(review): each bridge.run() call schedules its own task, so this
        # close happens in a different task than _aenter — confirm the mcp
        # stdio context managers tolerate being closed that way.
        if self._exit_stack is not None:
            try:
                await self._exit_stack.aclose()
            except Exception as e:  # pragma: no cover
                logger.warning("cua-driver shutdown error: %s", e)
        self._exit_stack = None
        self._session = None

    def start(self) -> None:
        """Start the bridge and open the session; no-op when already started.

        Session start-up is given 15 seconds on the bridge.
        """
        with self._lock:
            if self._started:
                return
            self._bridge.start()
            self._bridge.run(self._aenter(), timeout=15.0)
            self._started = True

    def stop(self) -> None:
        """Close the session; no-op when not started.

        ``_started`` is cleared even if the shutdown call raises or times out.
        """
        with self._lock:
            if not self._started:
                return
            try:
                self._bridge.run(self._aexit(), timeout=5.0)
            finally:
                self._started = False

    # ── Tool invocation ──────────────────────────────────────────────
    async def _call_tool_async(self, name: str, args: Dict[str, Any]) -> Dict[str, Any]:
        """Invoke the MCP tool *name* and flatten the result to a plain dict."""
        result = await self._session.call_tool(name, args)
        # Normalize: mcp returns content parts. We want a dict.
        return _extract_tool_result(result)

    def call_tool(self, name: str, args: Dict[str, Any], timeout: float = 30.0) -> Dict[str, Any]:
        """Synchronously invoke the MCP tool *name* through the bridge.

        Args:
            name: MCP tool name.
            args: tool arguments.
            timeout: seconds to wait for the tool result.

        Returns:
            The dict produced by ``_extract_tool_result``.

        Raises:
            RuntimeError: if the session has not been started.
        """
        self._require_started()
        return self._bridge.run(self._call_tool_async(name, args), timeout=timeout)
|
|
|
|
|
|
def _extract_tool_result(mcp_result: Any) -> Dict[str, Any]:
    """Flatten an mcp CallToolResult into a plain dict.

    The result's ``content`` parts are split by ``type``: text parts are
    collected and joined with newlines, image parts contribute their
    ``data`` payload. The returned dict has the shape
        {"data": <parsed json or raw text or None>, "images": [b64, ...], "isError": bool}
    where ``data`` is JSON-decoded only when the joined text starts with
    ``{`` or ``[`` and decodes cleanly; otherwise the joined text is kept.
    """
    failed = bool(getattr(mcp_result, "isError", False))
    texts: List[str] = []
    pictures: List[str] = []

    for item in getattr(mcp_result, "content", None) or []:
        kind = getattr(item, "type", None)
        if kind == "image":
            encoded = getattr(item, "data", None)
            if encoded:
                pictures.append(encoded)
        elif kind == "text":
            texts.append(getattr(item, "text", "") or "")

    payload: Any = None
    if texts:
        # Empty chunks are dropped before joining.
        merged = "\n".join(filter(None, texts))
        payload = merged
        if merged.strip().startswith(("{", "[")):
            try:
                payload = json.loads(merged)
            except json.JSONDecodeError:
                payload = merged

    return {"data": payload, "images": pictures, "isError": failed}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# The backend itself
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class CuaDriverBackend(ComputerUseBackend):
    """Default computer-use backend. macOS-only via cua-driver MCP.

    Every operation is forwarded as an MCP tool call through a lazily started
    ``_CuaDriverSession`` running on a private ``_AsyncBridge``.
    """

    def __init__(self) -> None:
        self._bridge = _AsyncBridge()
        self._session = _CuaDriverSession(self._bridge)

    # ── Lifecycle ──────────────────────────────────────────────────
    def start(self) -> None:
        """Start the cua-driver session (which also starts the bridge)."""
        self._session.start()

    def stop(self) -> None:
        """Stop the session, then the bridge; the bridge is stopped even if the session stop raises."""
        try:
            self._session.stop()
        finally:
            self._bridge.stop()

    def is_available(self) -> bool:
        """Return True only on macOS with a resolvable cua-driver binary."""
        if not _is_macos():
            return False
        return cua_driver_binary_available()

    # ── Capture ────────────────────────────────────────────────────
    def capture(self, mode: str = "som", app: Optional[str] = None) -> CaptureResult:
        """Request a capture from cua-driver and convert it to a CaptureResult.

        Args:
            mode: forwarded as the ``mode`` tool argument.
            app: forwarded as the ``app`` tool argument when truthy.

        Returns:
            A CaptureResult built from the reply's JSON data and its first
            image (if any). Missing or null fields fall back to 0 / "" / [].

        Raises:
            Exception: errors from the session call propagate; unlike the
                action methods they are not converted into a result object.
        """
        args: Dict[str, Any] = {"mode": mode}
        if app:
            args["app"] = app
        out = self._session.call_tool("capture", args)
        data = out["data"] if isinstance(out["data"], dict) else {}
        # ``or 0`` guards against explicit nulls, which int() would reject.
        width = int(data.get("width", 0) or 0)
        height = int(data.get("height", 0) or 0)
        elements_raw = data.get("elements", []) or []
        elements = [_parse_element(e) for e in elements_raw if isinstance(e, dict)]

        png_b64: Optional[str] = None
        png_bytes_len = 0
        if out["images"]:
            png_b64 = out["images"][0]
            try:
                png_bytes_len = len(base64.b64decode(png_b64, validate=False))
            except (ValueError, TypeError):
                # Undecodable payload: estimate the size from the base64
                # length (4 encoded chars ≈ 3 bytes).
                png_bytes_len = len(png_b64) * 3 // 4

        return CaptureResult(
            mode=mode,
            width=width,
            height=height,
            png_b64=png_b64,
            elements=elements,
            app=str(data.get("app", "") or ""),
            window_title=str(data.get("window_title", "") or ""),
            png_bytes_len=png_bytes_len,
        )

    # ── Pointer ────────────────────────────────────────────────────
    def click(
        self,
        *,
        element: Optional[int] = None,
        x: Optional[int] = None,
        y: Optional[int] = None,
        button: str = "left",
        click_count: int = 1,
        modifiers: Optional[List[str]] = None,
    ) -> ActionResult:
        """Click an element index or an x/y point.

        ``element`` takes precedence over ``x``/``y``. When neither an element
        nor a complete x/y pair is given, a failed ActionResult is returned
        without contacting cua-driver.
        """
        args: Dict[str, Any] = {"button": button, "clickCount": click_count}
        if element is not None:
            args["element"] = int(element)
        elif x is not None and y is not None:
            args["x"] = int(x)
            args["y"] = int(y)
        else:
            return ActionResult(ok=False, action="click",
                                message="click requires element= or x/y")
        if modifiers:
            args["modifiers"] = modifiers
        return self._action("click", args)

    def drag(
        self,
        *,
        from_element: Optional[int] = None,
        to_element: Optional[int] = None,
        from_xy: Optional[Tuple[int, int]] = None,
        to_xy: Optional[Tuple[int, int]] = None,
        button: str = "left",
        modifiers: Optional[List[str]] = None,
    ) -> ActionResult:
        """Drag from a source to a destination.

        Each end may be an element index or an (x, y) pair; the element form
        wins when both are supplied. A missing source or destination yields a
        failed ActionResult without contacting cua-driver.
        """
        args: Dict[str, Any] = {"button": button}
        if from_element is not None:
            args["fromElement"] = int(from_element)
        elif from_xy is not None:
            args["fromX"], args["fromY"] = int(from_xy[0]), int(from_xy[1])
        else:
            return ActionResult(ok=False, action="drag", message="drag requires a source")
        if to_element is not None:
            args["toElement"] = int(to_element)
        elif to_xy is not None:
            args["toX"], args["toY"] = int(to_xy[0]), int(to_xy[1])
        else:
            return ActionResult(ok=False, action="drag", message="drag requires a destination")
        if modifiers:
            args["modifiers"] = modifiers
        return self._action("drag", args)

    def scroll(
        self,
        *,
        direction: str,
        amount: int = 3,
        element: Optional[int] = None,
        x: Optional[int] = None,
        y: Optional[int] = None,
        modifiers: Optional[List[str]] = None,
    ) -> ActionResult:
        """Scroll in *direction* by *amount*.

        A target (element index, or x/y point) is optional; when none is given
        the call is sent without one.
        """
        args: Dict[str, Any] = {"direction": direction, "amount": int(amount)}
        if element is not None:
            args["element"] = int(element)
        elif x is not None and y is not None:
            args["x"] = int(x)
            args["y"] = int(y)
        if modifiers:
            args["modifiers"] = modifiers
        return self._action("scroll", args)

    # ── Keyboard ───────────────────────────────────────────────────
    def type_text(self, text: str) -> ActionResult:
        """Send *text* via the ``type`` tool."""
        return self._action("type", {"text": text})

    def key(self, keys: str) -> ActionResult:
        """Send the key combination *keys* via the ``key`` tool."""
        return self._action("key", {"keys": keys})

    # ── Introspection ──────────────────────────────────────────────
    def list_apps(self) -> List[Dict[str, Any]]:
        """Return the app list reported by the ``listApps`` tool.

        Accepts either a bare list or a dict with an ``apps`` key; any other
        payload yields an empty list. Session errors propagate.
        """
        out = self._session.call_tool("listApps", {})
        data = out["data"] if isinstance(out["data"], (list, dict)) else []
        if isinstance(data, dict):
            data = data.get("apps", [])
        return list(data or [])

    def focus_app(self, app: str, raise_window: bool = False) -> ActionResult:
        """Focus *app* via the ``focusApp`` tool; ``raise_window`` is sent as ``raise``."""
        return self._action("focusApp", {"app": app, "raise": bool(raise_window)})

    # ── Internal ───────────────────────────────────────────────────
    def _action(self, name: str, args: Dict[str, Any]) -> ActionResult:
        """Call tool *name* and wrap the outcome in an ActionResult.

        Exceptions from the session are logged and converted into a failed
        result rather than raised. ``ok`` mirrors the reply's ``isError``
        flag; ``message`` is the reply's ``message`` field (dict payload) or
        the raw text (string payload); ``meta`` carries the dict payload.
        """
        try:
            out = self._session.call_tool(name, args)
        except Exception as e:
            logger.exception("cua-driver %s call failed", name)
            return ActionResult(ok=False, action=name, message=f"cua-driver error: {e}")
        ok = not out["isError"]
        message = ""
        data = out["data"]
        if isinstance(data, dict):
            # ``or ""`` keeps a null message from being rendered as "None".
            message = str(data.get("message", "") or "")
        elif isinstance(data, str):
            message = data
        return ActionResult(ok=ok, action=name, message=message,
                            meta=data if isinstance(data, dict) else {})
|
|
|
|
|
|
def _parse_element(d: Dict[str, Any]) -> UIElement:
    """Build a UIElement from one raw element dict of a capture reply.

    ``bounds`` may be a dict (``x``/``y`` plus ``w``/``h`` or
    ``width``/``height``) or a 4-item list/tuple; anything else becomes
    ``(0, 0, 0, 0)``. Null numeric fields are treated as 0 so a single
    incomplete element does not abort parsing of the whole capture. Keys
    that are not mapped to a dedicated field are kept in ``attributes``.
    """
    bounds = d.get("bounds") or (0, 0, 0, 0)
    if isinstance(bounds, dict):
        bounds = (
            int(bounds.get("x", 0) or 0),
            int(bounds.get("y", 0) or 0),
            int(bounds.get("w", bounds.get("width", 0)) or 0),
            int(bounds.get("h", bounds.get("height", 0)) or 0),
        )
    elif isinstance(bounds, (list, tuple)) and len(bounds) == 4:
        bounds = tuple(int(v or 0) for v in bounds)
    else:
        bounds = (0, 0, 0, 0)
    return UIElement(
        # Same null guard as ``pid`` / ``windowId`` below.
        index=int(d.get("index", 0) or 0),
        role=str(d.get("role", "") or ""),
        label=str(d.get("label", "") or ""),
        bounds=bounds,  # type: ignore[arg-type]
        app=str(d.get("app", "") or ""),
        pid=int(d.get("pid", 0) or 0),
        window_id=int(d.get("windowId", 0) or 0),
        attributes={k: v for k, v in d.items()
                    if k not in ("index", "role", "label", "bounds", "app", "pid", "windowId")},
    )
|