"""Schema for the generic `computer_use` tool. Model-agnostic. Any tool-calling model can drive this. Vision-capable models should prefer `capture(mode='som')` then `click(element=N)` — much more reliable than pixel coordinates. Pixel coordinates remain supported for models that were trained on them (e.g. Claude's computer-use RL). """ from __future__ import annotations from typing import Any, Dict # One consolidated tool with an `action` discriminator. Keeps the schema # compact and the per-turn token cost low. COMPUTER_USE_SCHEMA: Dict[str, Any] = { "name": "computer_use", "description": ( "Drive the macOS desktop in the background — screenshots, mouse, " "keyboard, scroll, drag — without stealing the user's cursor, " "keyboard focus, or Space. Preferred workflow: call with " "action='capture' (mode='som' gives numbered element overlays), " "then click by `element` index for reliability. Pixel coordinates " "are supported for models trained on them. Works on any window — " "hidden, minimized, on another Space, or behind another app. " "macOS only; requires cua-driver to be installed." ), "parameters": { "type": "object", "properties": { "action": { "type": "string", "enum": [ "capture", "click", "double_click", "right_click", "middle_click", "drag", "scroll", "type", "key", "wait", "list_apps", "focus_app", ], "description": ( "Which action to perform. `capture` is free (no side " "effects). All other actions require approval unless " "auto-approved." ), }, # ── capture ──────────────────────────────────────────── "mode": { "type": "string", "enum": ["som", "vision", "ax"], "description": ( "Capture mode. `som` (default) is a screenshot with " "numbered overlays on every interactable element plus " "the AX tree — best for vision models, lets you click " "by element index. `vision` is a plain screenshot. " "`ax` is the accessibility tree only (no image; useful " "for text-only models)." ), }, "app": { "type": "string", "description": ( "Optional. Limit capture/action to a specific app " "(by name, e.g. 'Safari', or bundle ID, " "'com.apple.Safari'). If omitted, operates on the " "frontmost app's window or the whole screen." ), }, # ── click / drag / scroll targeting ──────────────────── "element": { "type": "integer", "description": ( "The 1-based SOM index returned by the last " "`capture(mode='som')` call. Strongly preferred over " "raw coordinates." ), }, "coordinate": { "type": "array", "items": {"type": "integer"}, "minItems": 2, "maxItems": 2, "description": ( "Pixel coordinates [x, y] in logical screen space (as " "returned by capture width/height). Only use this if " "no element index is available." ), }, "button": { "type": "string", "enum": ["left", "right", "middle"], "description": "Mouse button. Defaults to left.", }, "modifiers": { "type": "array", "items": { "type": "string", "enum": ["cmd", "shift", "option", "alt", "ctrl", "fn"], }, "description": "Modifier keys held during the action.", }, # ── drag ─────────────────────────────────────────────── "from_element": {"type": "integer", "description": "Source element index (drag)."}, "to_element": {"type": "integer", "description": "Target element index (drag)."}, "from_coordinate": { "type": "array", "items": {"type": "integer"}, "minItems": 2, "maxItems": 2, "description": "Source [x,y] (drag; use when no element available).", }, "to_coordinate": { "type": "array", "items": {"type": "integer"}, "minItems": 2, "maxItems": 2, "description": "Target [x,y] (drag; use when no element available).", }, # ── scroll ───────────────────────────────────────────── "direction": { "type": "string", "enum": ["up", "down", "left", "right"], "description": "Scroll direction.", }, "amount": { "type": "integer", "description": "Scroll wheel ticks. Default 3.", }, # ── type / key / wait ────────────────────────────────── "text": { "type": "string", "description": "Text to type (respects the current layout).", }, "keys": { "type": "string", "description": ( "Key combo, e.g. 'cmd+s', 'ctrl+alt+t', 'return', " "'escape', 'tab'. Use '+' to combine." ), }, "seconds": { "type": "number", "description": "Seconds to wait. Max 30.", }, # ── focus_app ────────────────────────────────────────── "raise_window": { "type": "boolean", "description": ( "Only for action='focus_app'. If true, brings the " "window to front (DISRUPTS the user). Default false " "— input is routed to the app without raising, " "matching the background co-work model." ), }, # ── return shape ─────────────────────────────────────── "capture_after": { "type": "boolean", "description": ( "If true, take a follow-up capture after the action " "and include it in the response. Saves a round-trip " "when you need to verify an action's effect." ), }, }, "required": ["action"], }, } def get_computer_use_schema() -> Dict[str, Any]: """Return the generic OpenAI function-calling schema.""" return COMPUTER_USE_SCHEMA