hermes-agent/pyproject.toml
Teknium b07791db05
feat(computer-use): cua-driver backend, universal any-model schema
Background macOS desktop control via cua-driver MCP — does NOT steal the
user's cursor or keyboard focus, works with any tool-capable model.

Replaces the Anthropic-native `computer_20251124` approach from the
abandoned #4562 with a generic OpenAI function-calling schema plus SOM
(set-of-mark) captures so Claude, GPT, Gemini, and open models can all
drive the desktop via numbered element indices.

## What this adds

- `tools/computer_use/` package — swappable ComputerUseBackend ABC +
  CuaDriverBackend (stdio MCP client to trycua/cua's cua-driver binary).
- Universal `computer_use` tool with one schema for all providers.
  Actions: capture (som/vision/ax), click, double_click, right_click,
  middle_click, drag, scroll, type, key, wait, list_apps, focus_app.
- Multimodal tool-result envelope (`_multimodal=True`, OpenAI-style
  `content: [text, image_url]` parts) that flows through
  handle_function_call into the tool message. Anthropic adapter converts
  into native `tool_result` image blocks; OpenAI-compatible providers
  get the parts list directly.
- Image eviction in convert_messages_to_anthropic: only the 3 most
  recent screenshots carry real image data; older ones become text
  placeholders to cap per-turn token cost.
- Context compressor image pruning: old multimodal tool results have
  their image parts stripped instead of being skipped.
- Image-aware token estimation: each image counts as a flat 1500 tokens
  instead of its base64 char length (~1MB would have registered as
  ~250K tokens before).
- COMPUTER_USE_GUIDANCE system-prompt block — injected when the toolset
  is active.
- Session DB persistence strips base64 from multimodal tool messages.
- Trajectory saver normalises multimodal messages to text-only.
- `hermes tools` post-setup installs cua-driver via the upstream script
  and prints permission-grant instructions.
- CLI approval callback wired so destructive computer_use actions go
  through the same prompt_toolkit approval dialog as terminal commands.
- Hard safety guards at the tool level: blocked type patterns
  (curl|bash, sudo rm -rf, fork bomb), blocked key combos (empty trash,
  force delete, lock screen, log out).
- Skill `apple/macos-computer-use/SKILL.md` — universal (model-agnostic)
  workflow guide.
- Docs: `user-guide/features/computer-use.md` plus reference catalog
  entries.

## Tests

44 new tests in tests/tools/test_computer_use.py covering schema
shape (universal, not Anthropic-native), dispatch routing, safety
guards, multimodal envelope, Anthropic adapter conversion, screenshot
eviction, context compressor pruning, image-aware token estimation,
run_agent helpers, and universality guarantees.

469/469 pass across tests/tools/test_computer_use.py + the affected
agent/ test suites.

## Not in this PR

- `model_tools.py` provider-gating: the tool is available to every
  provider. Providers without multi-part tool message support will see
  text-only tool results (graceful degradation via `text_summary`).
- Anthropic server-side `clear_tool_uses_20250919` — deferred;
  client-side eviction + compressor pruning cover the same cost ceiling
  without a beta header.

## Caveats

- macOS only. cua-driver uses private SkyLight SPIs
  (SLEventPostToPid, SLPSPostEventRecordTo,
  _AXObserverAddNotificationAndCheckRemote) that can break on any macOS
  update. Pin with HERMES_CUA_DRIVER_VERSION.
- Requires Accessibility + Screen Recording permissions — the post-setup
  prints the Settings path.

Supersedes PR #4562 (pyautogui/Quartz foreground backend, Anthropic-
native schema). Credit @0xbyt4 for the original #3816 groundwork whose
context/eviction/token design is preserved here in generic form.
2026-04-23 16:44:24 -07:00

166 lines
5.7 KiB
TOML

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "hermes-agent"
version = "0.11.0"
description = "The self-improving AI agent — creates skills from experience, improves them during use, and runs anywhere"
readme = "README.md"
requires-python = ">=3.11"
authors = [{ name = "Nous Research" }]
license = { text = "MIT" }
dependencies = [
# Core — pinned to known-good ranges to limit supply chain attack surface
"openai>=2.21.0,<3",
"anthropic>=0.39.0,<1",
"python-dotenv>=1.2.1,<2",
"fire>=0.7.1,<1",
"httpx[socks]>=0.28.1,<1",
"rich>=14.3.3,<15",
"tenacity>=9.1.4,<10",
"pyyaml>=6.0.2,<7",
"requests>=2.33.0,<3", # CVE-2026-25645
"jinja2>=3.1.5,<4",
"pydantic>=2.12.5,<3",
# Interactive CLI (prompt_toolkit is used directly by cli.py)
"prompt_toolkit>=3.0.52,<4",
# Tools
"exa-py>=2.9.0,<3",
"firecrawl-py>=4.16.0,<5",
"parallel-web>=0.4.2,<1",
"fal-client>=0.13.1,<1",
# Text-to-speech (Edge TTS is free, no API key needed)
"edge-tts>=7.2.7,<8",
# Skills Hub (GitHub App JWT auth — optional, only needed for bot identity)
"PyJWT[crypto]>=2.12.0,<3", # CVE-2026-32597
]
[project.optional-dependencies]
modal = ["modal>=1.0.0,<2"]
daytona = ["daytona>=0.148.0,<1"]
dev = ["debugpy>=1.8.0,<2", "pytest>=9.0.2,<10", "pytest-asyncio>=1.3.0,<2", "pytest-xdist>=3.0,<4", "mcp>=1.2.0,<2", "ty>=0.0.1a29,<0.0.22", "ruff"]
messaging = ["python-telegram-bot[webhooks]>=22.6,<23", "discord.py[voice]>=2.7.1,<3", "aiohttp>=3.13.3,<4", "slack-bolt>=1.18.0,<2", "slack-sdk>=3.27.0,<4", "qrcode>=7.0,<8"]
cron = ["croniter>=6.0.0,<7"]
slack = ["slack-bolt>=1.18.0,<2", "slack-sdk>=3.27.0,<4"]
matrix = ["mautrix[encryption]>=0.20,<1", "Markdown>=3.6,<4", "aiosqlite>=0.20", "asyncpg>=0.29"]
cli = ["simple-term-menu>=1.0,<2"]
tts-premium = ["elevenlabs>=1.0,<2"]
voice = [
# Local STT pulls in wheel-only transitive deps (ctranslate2, onnxruntime),
# so keep it out of the base install for source-build packagers like Homebrew.
"faster-whisper>=1.0.0,<2",
"sounddevice>=0.4.6,<1",
"numpy>=1.24.0,<3",
]
pty = [
"ptyprocess>=0.7.0,<1; sys_platform != 'win32'",
"pywinpty>=2.0.0,<3; sys_platform == 'win32'",
]
honcho = ["honcho-ai>=2.0.1,<3"]
mcp = ["mcp>=1.2.0,<2"]
homeassistant = ["aiohttp>=3.9.0,<4"]
sms = ["aiohttp>=3.9.0,<4"]
# Computer use — macOS background desktop control via cua-driver (MCP stdio).
# The cua-driver binary itself is installed via `hermes tools` post-setup
# (curl install script); this extra just pins the MCP client used to talk
# to it, which is already provided by the `mcp` extra.
computer-use = ["mcp>=1.2.0,<2"]
acp = ["agent-client-protocol>=0.9.0,<1.0"]
mistral = ["mistralai>=2.3.0,<3"]
bedrock = ["boto3>=1.35.0,<2"]
termux = [
# Tested Android / Termux path: keeps the core CLI feature-rich while
# avoiding extras that currently depend on non-Android wheels (notably
# faster-whisper -> ctranslate2 via the voice extra).
"python-telegram-bot[webhooks]>=22.6,<23",
"hermes-agent[cron]",
"hermes-agent[cli]",
"hermes-agent[pty]",
"hermes-agent[mcp]",
"hermes-agent[honcho]",
"hermes-agent[acp]",
]
dingtalk = ["dingtalk-stream>=0.20,<1", "alibabacloud-dingtalk>=2.0.0", "qrcode>=7.0,<8"]
feishu = ["lark-oapi>=1.5.3,<2", "qrcode>=7.0,<8"]
web = ["fastapi>=0.104.0,<1", "uvicorn[standard]>=0.24.0,<1"]
rl = [
"atroposlib @ git+https://github.com/NousResearch/atropos.git@c20c85256e5a45ad31edf8b7276e9c5ee1995a30",
"tinker @ git+https://github.com/thinking-machines-lab/tinker.git@30517b667f18a3dfb7ef33fb56cf686d5820ba2b",
"fastapi>=0.104.0,<1",
"uvicorn[standard]>=0.24.0,<1",
"wandb>=0.15.0,<1",
]
yc-bench = ["yc-bench @ git+https://github.com/collinear-ai/yc-bench.git@bfb0c88062450f46341bd9a5298903fc2e952a5c ; python_version >= '3.12'"]
all = [
"hermes-agent[modal]",
"hermes-agent[daytona]",
"hermes-agent[messaging]",
# matrix: python-olm (required by matrix-nio[e2e]) is upstream-broken on
# modern macOS (archived libolm, C++ errors with Clang 21+). On Linux the
# [matrix] extra's own marker pulls in the [e2e] variant automatically.
"hermes-agent[matrix]; sys_platform == 'linux'",
"hermes-agent[cron]",
"hermes-agent[cli]",
"hermes-agent[dev]",
"hermes-agent[tts-premium]",
"hermes-agent[slack]",
"hermes-agent[pty]",
"hermes-agent[honcho]",
"hermes-agent[mcp]",
"hermes-agent[homeassistant]",
"hermes-agent[sms]",
"hermes-agent[acp]",
"hermes-agent[voice]",
"hermes-agent[dingtalk]",
"hermes-agent[feishu]",
"hermes-agent[mistral]",
"hermes-agent[bedrock]",
"hermes-agent[web]",
]
[project.scripts]
hermes = "hermes_cli.main:main"
hermes-agent = "run_agent:main"
hermes-acp = "acp_adapter.entry:main"
[tool.setuptools]
py-modules = ["run_agent", "model_tools", "toolsets", "batch_runner", "trajectory_compressor", "toolset_distributions", "cli", "hermes_constants", "hermes_state", "hermes_time", "hermes_logging", "rl_cli", "utils"]
[tool.setuptools.package-data]
hermes_cli = ["web_dist/**/*"]
[tool.setuptools.packages.find]
include = ["agent", "agent.*", "tools", "tools.*", "hermes_cli", "gateway", "gateway.*", "tui_gateway", "tui_gateway.*", "cron", "acp_adapter", "plugins", "plugins.*"]
[tool.pytest.ini_options]
testpaths = ["tests"]
markers = [
"integration: marks tests requiring external services (API keys, Modal, etc.)",
]
addopts = "-m 'not integration' -n auto"
[tool.ty.environment]
python-version = "3.13"
[tool.ty.rules]
unknown-argument = "warn"
redundant-cast = "ignore"
[tool.ty.src]
exclude = ["**"]
[[tool.ty.overrides]]
include = ["**"]
[tool.ty.overrides.rules]
unresolved-import = "ignore"
invalid-method-override = "ignore"
invalid-assignment = "ignore"
not-iterable = "ignore"
[tool.ruff]
exclude = ["*"]
[tool.uv]
exclude-newer = "7 days"