From bb694bad426a296855748f852e697d36859928a9 Mon Sep 17 00:00:00 2001 From: briandevans <252620095+briandevans@users.noreply.github.com> Date: Sat, 9 May 2026 17:17:53 -0700 Subject: [PATCH] fix(computer-use): cap AX `elements` array to prevent context blowup (#22865) `computer_use(action='capture', mode='ax')` returned the full AX element list verbatim in the JSON response. Dense Electron / Obsidian / JetBrains UIs publish 500+ AX nodes (one reproduction in #22865 returned 597 elements against Obsidian), so a single capture could consume enough context to trigger compression failures or render the session unusable. The human-readable `_format_elements` summary is already capped at 40 lines, so the unbounded JSON `elements` array was easy to miss for anyone reading only the summary output. Add a `max_elements` argument to the tool schema, default 100, that trims the AX `elements` array. The JSON response now always reports `total_elements`; when the cap fires it also reports `truncated_elements` and appends a "raise max_elements or pass app= to narrow" hint to the summary so the model knows the JSON view is partial and can re-issue with a tighter scope. Validation is centralized in `_coerce_max_elements`: missing inputs, inputs that `int()` cannot convert, and sub-1 inputs fall back to the default cap (numeric strings and floats are coerced with `int()`), so the protection can never be silently disabled by a malformed tool-call argument. The cap only affects the text-path JSON (AX mode, or any capture without an image); `mode='som'` and `mode='vision'` keep returning a screenshot + image-aware summary unchanged. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/tools/test_computer_use.py | 106 +++++++++++++++++++++++++++++++ tools/computer_use/schema.py | 17 +++++ tools/computer_use/tool.py | 52 +++++++++++++-- 3 files changed, 168 insertions(+), 7 deletions(-) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index 7afaa7b57de..bbaaeb8fcea 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -76,6 +76,13 @@ class TestSchema: modes = set(COMPUTER_USE_SCHEMA["parameters"]["properties"]["mode"]["enum"]) assert modes == {"som", "vision", "ax"} + def test_schema_exposes_max_elements_cap_for_capture(self): + from tools.computer_use.schema import COMPUTER_USE_SCHEMA + props = COMPUTER_USE_SCHEMA["parameters"]["properties"] + assert "max_elements" in props + assert props["max_elements"]["type"] == "integer" + assert props["max_elements"].get("minimum", 1) >= 1 + class TestRegistration: def test_tool_registers_with_registry(self): @@ -337,6 +344,105 @@ class TestCaptureResponse: assert "AXButton" in text_part["text"] assert "AXTextField" in text_part["text"] + def _ax_backend_with(self, count: int): + """Construct a fake backend that yields ``count`` AX elements.""" + from tools.computer_use.backend import CaptureResult, UIElement + + elements = [ + UIElement(index=i + 1, role="AXButton", label=f"el-{i}", bounds=(0, 0, 1, 1)) + for i in range(count) + ] + + class FakeBackend: + def start(self): pass + def stop(self): pass + def is_available(self): return True + def capture(self, mode="som", app=None): + return CaptureResult( + mode=mode, width=800, height=600, + png_b64="", + elements=list(elements), + app="Obsidian", + ) + def click(self, **kw): ... + def drag(self, **kw): ... + def scroll(self, **kw): ... + def type_text(self, text): ... + def key(self, keys): ... + def list_apps(self): return [] + def focus_app(self, app, raise_window=False): ... 
+ + return FakeBackend() + + def test_capture_ax_caps_elements_at_default_for_dense_trees(self): + """Regression for #22865: an Electron-style 600-element AX tree must + not emit the entire array verbatim into the tool result. + """ + from tools.computer_use import tool as cu_tool + + fake_backend = self._ax_backend_with(600) + cu_tool.reset_backend_for_tests() + with patch.object(cu_tool, "_get_backend", return_value=fake_backend): + out = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"}) + + parsed = json.loads(out) + assert parsed["mode"] == "ax" + assert parsed["total_elements"] == 600 + assert len(parsed["elements"]) == cu_tool._DEFAULT_MAX_ELEMENTS + assert parsed["truncated_elements"] == 600 - cu_tool._DEFAULT_MAX_ELEMENTS + # Truncation must be visible in the human summary so the model knows + # the JSON view is partial and can re-issue with a tighter scope. + assert "truncated to" in parsed["summary"] + + def test_capture_ax_honors_explicit_max_elements_override(self): + from tools.computer_use import tool as cu_tool + + fake_backend = self._ax_backend_with(600) + cu_tool.reset_backend_for_tests() + with patch.object(cu_tool, "_get_backend", return_value=fake_backend): + out = cu_tool.handle_computer_use( + {"action": "capture", "mode": "ax", "max_elements": 250} + ) + + parsed = json.loads(out) + assert len(parsed["elements"]) == 250 + assert parsed["truncated_elements"] == 350 + + def test_capture_ax_below_cap_is_unchanged(self): + """Backwards-compat: small captures keep the full elements array and + do not surface a `truncated_elements` field. 
+ """ + from tools.computer_use import tool as cu_tool + + fake_backend = self._ax_backend_with(5) + cu_tool.reset_backend_for_tests() + with patch.object(cu_tool, "_get_backend", return_value=fake_backend): + out = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"}) + + parsed = json.loads(out) + assert len(parsed["elements"]) == 5 + assert parsed["total_elements"] == 5 + assert "truncated_elements" not in parsed + assert "truncated to" not in parsed["summary"] + + def test_capture_ax_invalid_max_elements_falls_back_to_default(self): + """Malformed `max_elements` (string, negative, zero) must not silently + disable the cap and re-introduce the original unbounded behavior. + """ + from tools.computer_use import tool as cu_tool + + fake_backend = self._ax_backend_with(600) + cu_tool.reset_backend_for_tests() + for bad in ("not-a-number", 0, -10): + with patch.object(cu_tool, "_get_backend", return_value=fake_backend): + out = cu_tool.handle_computer_use( + {"action": "capture", "mode": "ax", "max_elements": bad} + ) + parsed = json.loads(out) + assert len(parsed["elements"]) == cu_tool._DEFAULT_MAX_ELEMENTS, ( + f"bad max_elements={bad!r} disabled the cap" + ) + # --------------------------------------------------------------------------- # Anthropic adapter: multimodal tool-result conversion diff --git a/tools/computer_use/schema.py b/tools/computer_use/schema.py index d8928d0dc56..e716387a6c0 100644 --- a/tools/computer_use/schema.py +++ b/tools/computer_use/schema.py @@ -75,6 +75,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = { "frontmost app's window or the whole screen." ), }, + "max_elements": { + "type": "integer", + "description": ( + "Optional cap on the AX `elements` array returned by " + "`action='capture'`. Default 100. Dense UIs (Electron " + "apps such as Obsidian or VS Code, JetBrains IDEs) can " + "publish 500+ AX nodes — capping prevents a single " + "capture from blowing session context. 
When the cap " + "trims the response, `total_elements` and " + "`truncated_elements` are surfaced in the result so " + "you can re-call with `app=` to narrow scope or raise " + "`max_elements` when the full tree is required. Has no " + "effect on `mode='som'` / `mode='vision'` (those return " + "a screenshot, not the elements array)." + ), + "minimum": 1, + }, # ── click / drag / scroll targeting ──────────────────── "element": { "type": "integer", diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py index 4912b0f979a..029963eeeac 100644 --- a/tools/computer_use/tool.py +++ b/tools/computer_use/tool.py @@ -317,7 +317,7 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) -> if mode not in {"som", "vision", "ax"}: return json.dumps({"error": f"bad mode {mode!r}; use som|vision|ax"}) cap = backend.capture(mode=mode, app=args.get("app")) - return _capture_response(cap) + return _capture_response(cap, max_elements=_coerce_max_elements(args.get("max_elements"))) if action == "wait": seconds = float(args.get("seconds", 1.0)) @@ -416,16 +416,50 @@ def _text_response(res: ActionResult) -> str: return json.dumps(payload) -def _capture_response(cap: CaptureResult) -> Any: +# Default cap for the AX `elements` array returned by capture. Dense UIs +# (Electron apps, Obsidian, JetBrains IDEs) can publish 500+ AX nodes, which +# can exhaust session context after a single capture. The model-facing +# `max_elements` argument lets callers raise this when they need the full tree. +_DEFAULT_MAX_ELEMENTS = 100 + + +def _coerce_max_elements(value: Any) -> int: + """Validate the caller-supplied ``max_elements``. + + Falls back to :data:`_DEFAULT_MAX_ELEMENTS` for missing / non-integer / + sub-1 inputs so the cap can never be silently disabled by a malformed + tool-call argument. 
+ """ + if value is None: + return _DEFAULT_MAX_ELEMENTS + try: + n = int(value) + except (TypeError, ValueError): + return _DEFAULT_MAX_ELEMENTS + if n < 1: + return _DEFAULT_MAX_ELEMENTS + return n + + +def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEMENTS) -> Any: + total_elements = len(cap.elements) + visible_elements = cap.elements[:max_elements] + truncated_elements = max(0, total_elements - len(visible_elements)) + element_index = _format_elements(cap.elements) summary_lines = [ f"capture mode={cap.mode} {cap.width}x{cap.height}" + (f" app={cap.app}" if cap.app else "") + (f" window={cap.window_title!r}" if cap.window_title else ""), - f"{len(cap.elements)} interactable element(s):", + f"{total_elements} interactable element(s):", ] if element_index: summary_lines.extend(element_index) + if truncated_elements: + summary_lines.append( + f" (response truncated to {len(visible_elements)} of {total_elements} elements; " + f"raise max_elements or pass app= to narrow)" + ) summary = "\n".join(summary_lines) if cap.png_b64 and cap.mode != "ax": @@ -458,18 +492,22 @@ def _capture_response(cap: CaptureResult) -> Any: ], "text_summary": summary, "meta": {"mode": cap.mode, "width": cap.width, "height": cap.height, - "elements": len(cap.elements), "png_bytes": cap.png_bytes_len}, + "elements": total_elements, "png_bytes": cap.png_bytes_len}, } # AX-only (or image missing): text path. - return json.dumps({ + payload: Dict[str, Any] = { "mode": cap.mode, "width": cap.width, "height": cap.height, "app": cap.app, "window_title": cap.window_title, - "elements": [_element_to_dict(e) for e in cap.elements], + "elements": [_element_to_dict(e) for e in visible_elements], + "total_elements": total_elements, "summary": summary, - }) + } + if truncated_elements: + payload["truncated_elements"] = truncated_elements + return json.dumps(payload) # ---------------------------------------------------------------------------