mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
Merge pull request #50994 from NousResearch/hermes/hermes-9fb04abd
fix(computer-use): working vision capture + whole-screen/desktop target on Windows
This commit is contained in:
commit
672ea1f894
4 changed files with 186 additions and 4 deletions
|
|
@ -45,6 +45,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
|
|||
|
||||
# Auto-extracted from noreply emails + manual overrides
|
||||
AUTHOR_MAP = {
|
||||
"jeevesassistant00@gmail.com": "jeeves-assistant", # PR #50771 (computer-use CuaDriver vision capture routing)
|
||||
"21178861+ScotterMonk@users.noreply.github.com": "ScotterMonk", # PR #50145 salvage (cron output truncation: adapter-aware chunking, #50126)
|
||||
"rrandqua@gmail.com": "TutkuEroglu", # PR #50481 salvage (AGENTS.md stale token-lock adapter path)
|
||||
"f@trycua.com": "f-trycua", # PR #50507 salvage (cross-platform computer_use; supersedes #44221/#30660)
|
||||
|
|
|
|||
|
|
@ -2139,6 +2139,123 @@ class TestStructuredElementsConsumption:
|
|||
# Markdown surface doesn't carry bounds — lossy by design.
|
||||
assert cap.elements[0].bounds == (0, 0, 0, 0)
|
||||
|
||||
def test_vision_capture_falls_back_to_get_window_state_when_screenshot_dropped(self):
|
||||
"""cua-driver >=0.5.x dropped the standalone `screenshot` MCP tool and
|
||||
folded full-window PNG capture into `get_window_state`. When the driver
|
||||
no longer advertises `screenshot`, vision capture must route through
|
||||
`get_window_state` (discarding the AX tree) and still return a PNG."""
|
||||
from tools.computer_use.cua_backend import CuaDriverBackend
|
||||
|
||||
backend = CuaDriverBackend()
|
||||
backend._session = MagicMock()
|
||||
# Modern driver: capabilities discovered, `screenshot` not advertised.
|
||||
backend._session._has_tool.return_value = False
|
||||
backend._session.capabilities_discovered = True
|
||||
|
||||
windows_payload = {
|
||||
"windows": [{
|
||||
"app_name": "Demo", "pid": 9, "window_id": 1,
|
||||
"is_on_screen": True, "title": "Demo", "z_index": 0,
|
||||
}],
|
||||
}
|
||||
png_b64 = (
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m"
|
||||
"NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
|
||||
)
|
||||
|
||||
def fake_call_tool(name, args):
|
||||
if name == "list_windows":
|
||||
return {"data": "", "images": [], "image_mime_types": [],
|
||||
"structuredContent": windows_payload, "isError": False}
|
||||
if name == "get_window_state":
|
||||
return {"data": "", "images": [png_b64],
|
||||
"image_mime_types": ["image/png"],
|
||||
"structuredContent": None, "isError": False}
|
||||
if name == "screenshot":
|
||||
raise AssertionError("driver dropped screenshot; must not be called")
|
||||
return {"data": "", "images": [], "image_mime_types": [],
|
||||
"structuredContent": None, "isError": False}
|
||||
|
||||
backend._session.call_tool.side_effect = fake_call_tool
|
||||
cap = backend.capture(mode="vision")
|
||||
|
||||
tool_names = [call.args[0] for call in backend._session.call_tool.call_args_list]
|
||||
assert tool_names == ["list_windows", "get_window_state"]
|
||||
assert cap.png_b64 == png_b64
|
||||
assert cap.image_mime_type == "image/png"
|
||||
assert cap.width == 1
|
||||
assert cap.height == 1
|
||||
# Vision mode stays free of AX element noise.
|
||||
assert cap.elements == []
|
||||
|
||||
def test_capture_app_screen_targets_desktop_window(self):
|
||||
"""capture(app='screen') resolves to the OS shell/desktop window
|
||||
(Windows Progman) rather than an application window, so 'show me my
|
||||
screen' works on cua-driver's window-oriented capture surface."""
|
||||
from tools.computer_use.cua_backend import CuaDriverBackend
|
||||
|
||||
backend = CuaDriverBackend()
|
||||
backend._session = MagicMock()
|
||||
|
||||
windows_payload = {
|
||||
"windows": [
|
||||
{"app_name": "Code", "pid": 11, "window_id": 1,
|
||||
"is_on_screen": True, "title": "editor", "z_index": 0},
|
||||
{"app_name": "Progman", "pid": 4, "window_id": 99,
|
||||
"is_on_screen": True, "title": "Program Manager", "z_index": 5},
|
||||
{"app_name": "Shell_TrayWnd", "pid": 4, "window_id": 50,
|
||||
"is_on_screen": True, "title": "Taskbar", "z_index": 4},
|
||||
],
|
||||
}
|
||||
|
||||
def fake_call_tool(name, args):
|
||||
if name == "list_windows":
|
||||
return {"data": "", "images": [], "image_mime_types": [],
|
||||
"structuredContent": windows_payload, "isError": False}
|
||||
if name == "get_window_state":
|
||||
# Should be invoked against the desktop backdrop, not Code.
|
||||
assert args["window_id"] == 99
|
||||
return {"data": "✅ Desktop — 0 elements", "images": [],
|
||||
"image_mime_types": [], "structuredContent": None,
|
||||
"isError": False}
|
||||
return {"data": "", "images": [], "image_mime_types": [],
|
||||
"structuredContent": None, "isError": False}
|
||||
|
||||
backend._session.call_tool.side_effect = fake_call_tool
|
||||
cap = backend.capture(mode="ax", app="screen")
|
||||
|
||||
assert backend._active_window_id == 99
|
||||
assert cap.app == "Progman"
|
||||
|
||||
def test_capture_app_screen_no_desktop_window_surfaces_limitation(self):
|
||||
"""When no desktop/shell window is present, capture(app='screen')
|
||||
returns a clear message about cua-driver's per-window capture limit
|
||||
instead of silently grabbing the frontmost app."""
|
||||
from tools.computer_use.cua_backend import CuaDriverBackend
|
||||
|
||||
backend = CuaDriverBackend()
|
||||
backend._session = MagicMock()
|
||||
|
||||
windows_payload = {
|
||||
"windows": [
|
||||
{"app_name": "Code", "pid": 11, "window_id": 1,
|
||||
"is_on_screen": True, "title": "editor", "z_index": 0},
|
||||
],
|
||||
}
|
||||
|
||||
def fake_call_tool(name, args):
|
||||
if name == "list_windows":
|
||||
return {"data": "", "images": [], "image_mime_types": [],
|
||||
"structuredContent": windows_payload, "isError": False}
|
||||
raise AssertionError(f"unexpected tool {name} — should short-circuit")
|
||||
|
||||
backend._session.call_tool.side_effect = fake_call_tool
|
||||
cap = backend.capture(mode="vision", app="desktop")
|
||||
|
||||
assert cap.width == 0 and cap.height == 0
|
||||
assert cap.png_b64 is None
|
||||
assert "captures one window at a time" in cap.window_title
|
||||
|
||||
|
||||
class TestCapabilityDiscovery:
|
||||
"""Surface 4 (NousResearch/hermes-agent#47072): the wrapper learns
|
||||
|
|
|
|||
|
|
@ -78,6 +78,29 @@ _CUA_DRIVER_ARGS = ["mcp"] # stdio MCP transport (fallback when the
|
|||
# driver doesn't expose `manifest` — see
|
||||
# `_resolve_mcp_invocation` below)
|
||||
|
||||
# Whole-screen / desktop capture. cua-driver is a window-oriented driver —
|
||||
# its `get_window_state` / `screenshot` tools capture a single window (by
|
||||
# pid + window_id), and there is no MCP tool that captures the entire virtual
|
||||
# desktop or an arbitrary monitor as one image. But the OS shell surfaces
|
||||
# themselves (the desktop backdrop and the taskbar/menu-bar) are real windows
|
||||
# that show up in `list_windows`, so "show me my screen" / "click the taskbar"
|
||||
# is reachable by targeting those windows. When `app` is one of these
|
||||
# sentinels, capture() resolves to the desktop/shell window instead of an
|
||||
# application window.
|
||||
_SCREEN_CAPTURE_SENTINELS = {"screen", "desktop", "fullscreen", "full screen", "all"}
|
||||
|
||||
# Known shell/desktop window identifiers across platforms. Matched
|
||||
# case-insensitively as a substring against both the window's app_name and
|
||||
# its title (cua-driver surfaces the Win32 class name / app name here).
|
||||
# Windows: Progman / WorkerW back the desktop; Shell_TrayWnd is the taskbar.
|
||||
# macOS: Finder owns the desktop; the menu bar / Dock are the shell.
|
||||
_DESKTOP_WINDOW_NAMES = (
|
||||
"progman", "workerw", "program manager", # Windows desktop
|
||||
"shell_traywnd", "taskbar", # Windows taskbar
|
||||
"finder", "desktop", "dock", # macOS desktop / shell
|
||||
)
|
||||
|
||||
|
||||
# Env var cua-driver reads to gate its anonymous usage telemetry (PostHog).
|
||||
# Setting it to "0" disables telemetry; absence => the binary's own default
|
||||
# (telemetry ON upstream).
|
||||
|
|
@ -1029,7 +1052,43 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
# returned by list_windows is the localized name (e.g. "計算機"), so
|
||||
# `app="Calculator"` legitimately matches no windows on a non-English
|
||||
# system and the caller needs to retry with the localized name.
|
||||
if app:
|
||||
if app and app.strip().lower() in _SCREEN_CAPTURE_SENTINELS:
|
||||
# Whole-screen / desktop request. cua-driver has no virtual-desktop
|
||||
# capture tool, so resolve to the OS shell/desktop window (the
|
||||
# desktop backdrop or the taskbar/menu-bar), which list_windows
|
||||
# does surface. This makes "show me my screen" and "click the
|
||||
# taskbar" work; a single image still can't span multiple monitors
|
||||
# — that's a driver limitation, not a wrapper one.
|
||||
def _is_desktop_window(w: Dict[str, Any]) -> bool:
|
||||
haystack = f"{w.get('app_name', '')} {w.get('title', '')}".lower()
|
||||
return any(name in haystack for name in _DESKTOP_WINDOW_NAMES)
|
||||
|
||||
desktop = [w for w in windows if _is_desktop_window(w)]
|
||||
if not desktop:
|
||||
return CaptureResult(
|
||||
mode=mode, width=0, height=0, png_b64=None,
|
||||
elements=[], app="",
|
||||
window_title=(
|
||||
f"<no desktop/shell window found for app={app!r}; "
|
||||
f"cua-driver captures one window at a time and exposes "
|
||||
f"no whole-virtual-desktop or per-monitor capture. "
|
||||
f"Call list_apps / capture(app='<AppName>') to target a "
|
||||
f"specific window instead. On Windows the taskbar is "
|
||||
f"'Shell_TrayWnd' and the desktop is 'Progman'.>"
|
||||
),
|
||||
png_bytes_len=0,
|
||||
)
|
||||
# Prefer the desktop backdrop (Progman/WorkerW/Finder) over the
|
||||
# taskbar when both are present, so a bare "screen" capture shows
|
||||
# the full desktop rather than just the task strip.
|
||||
windows = sorted(
|
||||
desktop,
|
||||
key=lambda w: 0 if any(
|
||||
n in f"{w.get('app_name', '')} {w.get('title', '')}".lower()
|
||||
for n in ("progman", "workerw", "program manager", "finder", "desktop")
|
||||
) else 1,
|
||||
)
|
||||
elif app:
|
||||
app_lower = app.lower()
|
||||
filtered = [w for w in windows if app_lower in w["app_name"].lower()]
|
||||
if not filtered:
|
||||
|
|
|
|||
|
|
@ -71,9 +71,14 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
|
|||
"type": "string",
|
||||
"description": (
|
||||
"Optional. Limit capture/action to a specific app "
|
||||
"(by name, e.g. 'Safari' or 'Notepad', or bundle ID "
|
||||
"where the platform supports it). If omitted, operates "
|
||||
"on the frontmost app's window or the whole screen."
|
||||
"(by name, e.g. 'Safari', or bundle ID, "
|
||||
"'com.apple.Safari'). If omitted, operates on the "
|
||||
"frontmost app's window. Pass app='screen' (or "
|
||||
"'desktop') to capture the OS desktop/shell surface — "
|
||||
"e.g. to see the wallpaper or click the taskbar. Note: "
|
||||
"capture is per-window; a single image cannot span "
|
||||
"multiple monitors, so on a multi-screen setup capture "
|
||||
"one window or display at a time."
|
||||
),
|
||||
},
|
||||
"max_elements": {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue