fix(computer-use): cap AX elements array to prevent context blowup (#22865)

`computer_use(action='capture', mode='ax')` returned the full AX element list verbatim in the JSON response. Dense Electron / Obsidian / JetBrains UIs publish 500+ AX nodes (one reproduction in #22865 returned 597 elements against Obsidian), so a single capture could consume enough context to trigger compression failures or render the session unusable. The human-readable `_format_elements` summary is already capped at 40 lines, so the size of the uncapped JSON `elements` array was invisible to anyone reading only the summary output. Add a `max_elements` argument to the tool schema, default 100, that trims the AX `elements` array. When the cap fires, the response surfaces `total_elements` and `truncated_elements` and appends a "raise max_elements or pass app= to narrow" hint to the summary so the model knows the JSON view is partial and can re-issue with a tighter scope. Validation is centralized in `_coerce_max_elements`: missing, non-numeric, or sub-1 inputs fall back to the default cap (anything else is coerced with `int()`, so numeric strings and floats are accepted), so the protection can never be silently disabled by a malformed tool-call argument. The cap only affects AX-mode JSON; `mode='som'` and `mode='vision'` keep returning a screenshot + image-aware summary unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-29 06:31:32 +00:00 · 2026-05-09 17:17:53 -07:00 · 2026-05-09 17:17:53 -07:00 · bb694bad42
commit bb694bad42
parent 9e30ef224d
3 changed files with 168 additions and 7 deletions
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@ -76,6 +76,13 @@ class TestSchema:
        modes = set(COMPUTER_USE_SCHEMA["parameters"]["properties"]["mode"]["enum"])
        assert modes == {"som", "vision", "ax"}

+    def test_schema_exposes_max_elements_cap_for_capture(self):
+        """The schema declares an integer `max_elements`; any `minimum` is >= 1."""
+        from tools.computer_use.schema import COMPUTER_USE_SCHEMA
+        properties = COMPUTER_USE_SCHEMA["parameters"]["properties"]
+        assert "max_elements" in properties
+        cap_spec = properties["max_elements"]
+        assert cap_spec["type"] == "integer"
+        # A missing `minimum` is tolerated (read as 1); a declared one must be >= 1.
+        assert cap_spec.get("minimum", 1) >= 1
+

 class TestRegistration:
    def test_tool_registers_with_registry(self):
@ -337,6 +344,105 @@ class TestCaptureResponse:
        assert "AXButton" in text_part["text"]
        assert "AXTextField" in text_part["text"]

+    def _ax_backend_with(self, count: int):
+        """Return a stub backend whose ``capture`` reports ``count`` AX buttons.
+
+        Elements are numbered from 1 and labelled ``el-0`` .. ``el-<count-1>``.
+        All action methods are no-ops; ``is_available`` is always true and
+        ``list_apps`` is always empty.
+        """
+        from tools.computer_use.backend import CaptureResult, UIElement
+
+        buttons = []
+        for offset in range(count):
+            buttons.append(
+                UIElement(index=offset + 1, role="AXButton", label=f"el-{offset}", bounds=(0, 0, 1, 1))
+            )
+
+        class FakeBackend:
+            def start(self):
+                return None
+
+            def stop(self):
+                return None
+
+            def is_available(self):
+                return True
+
+            def capture(self, mode="som", app=None):
+                # Hand out a fresh copy on every call so the shared list is never mutated.
+                snapshot = list(buttons)
+                return CaptureResult(
+                    mode=mode,
+                    width=800,
+                    height=600,
+                    png_b64="",
+                    elements=snapshot,
+                    app="Obsidian",
+                )
+
+            def click(self, **kwargs):
+                return None
+
+            def drag(self, **kwargs):
+                return None
+
+            def scroll(self, **kwargs):
+                return None
+
+            def type_text(self, text):
+                return None
+
+            def key(self, keys):
+                return None
+
+            def list_apps(self):
+                return []
+
+            def focus_app(self, app, raise_window=False):
+                return None
+
+        stub = FakeBackend()
+        return stub
+
+    def test_capture_ax_caps_elements_at_default_for_dense_trees(self):
+        """Regression for #22865: a 600-element AX tree (Electron-style) is
+        trimmed to the default cap instead of being emitted verbatim.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        total = 600
+        default_cap = cu_tool._DEFAULT_MAX_ELEMENTS
+        backend = self._ax_backend_with(total)
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})
+
+        result = json.loads(raw)
+        assert result["mode"] == "ax"
+        assert result["total_elements"] == total
+        assert len(result["elements"]) == default_cap
+        assert result["truncated_elements"] == total - default_cap
+        # The summary has to announce the truncation, otherwise the model cannot
+        # tell the JSON view is partial and would not know to narrow the scope.
+        assert "truncated to" in result["summary"]
+
+    def test_capture_ax_honors_explicit_max_elements_override(self):
+        """An explicit `max_elements=250` replaces the default cap."""
+        from tools.computer_use import tool as cu_tool
+
+        request = {"action": "capture", "mode": "ax", "max_elements": 250}
+        backend = self._ax_backend_with(600)
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use(request)
+
+        result = json.loads(raw)
+        # 600 elements captured, 250 kept, so 350 are reported as dropped.
+        assert len(result["elements"]) == 250
+        assert result["truncated_elements"] == 350
+
+    def test_capture_ax_below_cap_is_unchanged(self):
+        """Backwards-compat: a capture under the cap returns every element and
+        carries no truncation marker in either the payload or the summary.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        small_tree = 5
+        backend = self._ax_backend_with(small_tree)
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=backend):
+            raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})
+
+        result = json.loads(raw)
+        assert len(result["elements"]) == small_tree
+        assert result["total_elements"] == small_tree
+        assert "truncated_elements" not in result
+        assert "truncated to" not in result["summary"]
+
+    def test_capture_ax_invalid_max_elements_falls_back_to_default(self):
+        """A malformed `max_elements` (non-numeric string, zero, negative) must
+        leave the default cap in force rather than lifting the limit.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        backend = self._ax_backend_with(600)
+        cu_tool.reset_backend_for_tests()
+        malformed_values = ("not-a-number", 0, -10)
+        for bad in malformed_values:
+            request = {"action": "capture", "mode": "ax", "max_elements": bad}
+            with patch.object(cu_tool, "_get_backend", return_value=backend):
+                raw = cu_tool.handle_computer_use(request)
+            returned = json.loads(raw)["elements"]
+            assert len(returned) == cu_tool._DEFAULT_MAX_ELEMENTS, (
+                f"bad max_elements={bad!r} disabled the cap"
+            )
+

 # ---------------------------------------------------------------------------
 # Anthropic adapter: multimodal tool-result conversion
--- a/tools/computer_use/schema.py
+++ b/tools/computer_use/schema.py
@ -75,6 +75,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
                    "frontmost app's window or the whole screen."
                ),
            },
+            "max_elements": {
+                "type": "integer",
+                "description": (
+                    "Optional cap on the AX `elements` array returned by "
+                    "`action='capture'`. Default 100. Dense UIs (Electron "
+                    "apps such as Obsidian or VS Code, JetBrains IDEs) can "
+                    "publish 500+ AX nodes — capping prevents a single "
+                    "capture from blowing session context. When the cap "
+                    "trims the response, `total_elements` and "
+                    "`truncated_elements` are surfaced in the result so "
+                    "you can re-call with `app=` to narrow scope or raise "
+                    "`max_elements` when the full tree is required. Has no "
+                    "effect on `mode='som'` / `mode='vision'` (those return "
+                    "a screenshot, not the elements array)."
+                ),
+                "minimum": 1,
+            },
            # ── click / drag / scroll targeting ────────────────────
            "element": {
                "type": "integer",
--- a/tools/computer_use/tool.py
+++ b/tools/computer_use/tool.py
@ -317,7 +317,7 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) ->
        if mode not in {"som", "vision", "ax"}:
            return json.dumps({"error": f"bad mode {mode!r}; use som|vision|ax"})
        cap = backend.capture(mode=mode, app=args.get("app"))
-        return _capture_response(cap)
+        return _capture_response(cap, max_elements=_coerce_max_elements(args.get("max_elements")))

    if action == "wait":
        seconds = float(args.get("seconds", 1.0))
@ -416,16 +416,50 @@ def _text_response(res: ActionResult) -> str:
    return json.dumps(payload)


-def _capture_response(cap: CaptureResult) -> Any:
+# Default cap for the AX `elements` array returned by capture. Dense UIs
+# (Electron apps such as Obsidian, JetBrains IDEs) can publish 500+ AX nodes,
+# which can exhaust session context after a single capture. The model-facing
+# `max_elements` argument lets callers raise or lower this cap per call.
+_DEFAULT_MAX_ELEMENTS = 100
+
+
+def _coerce_max_elements(value: Any) -> int:
+    """Return a usable element cap from the caller-supplied ``max_elements``.
+
+    ``value`` is coerced with ``int()``, so integers, numeric strings and
+    finite floats (truncated toward zero) are accepted. Anything else --
+    ``None``, non-numeric strings or objects, NaN / infinite floats, and
+    results below 1 -- yields :data:`_DEFAULT_MAX_ELEMENTS`, so the cap can
+    never be silently disabled by a malformed tool-call argument.
+
+    Args:
+        value: Raw ``max_elements`` entry taken from the tool-call arguments.
+
+    Returns:
+        A positive integer cap.
+    """
+    if value is None:
+        return _DEFAULT_MAX_ELEMENTS
+    try:
+        n = int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError covers int(float("inf")), e.g. a JSON ``1e999``
+        # literal; without it the error would propagate out of this helper
+        # instead of falling back to the default cap.
+        return _DEFAULT_MAX_ELEMENTS
+    return n if n >= 1 else _DEFAULT_MAX_ELEMENTS
+
+
+def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEMENTS) -> Any:
+    total_elements = len(cap.elements)
+    visible_elements = cap.elements[:max_elements]
+    truncated_elements = max(0, total_elements - len(visible_elements))
+
    element_index = _format_elements(cap.elements)
    summary_lines = [
        f"capture mode={cap.mode} {cap.width}x{cap.height}"
        + (f" app={cap.app}" if cap.app else "")
        + (f" window={cap.window_title!r}" if cap.window_title else ""),
-        f"{len(cap.elements)} interactable element(s):",
+        f"{total_elements} interactable element(s):",
    ]
    if element_index:
        summary_lines.extend(element_index)
+    if truncated_elements:
+        summary_lines.append(
+            f"  (response truncated to {len(visible_elements)} of {total_elements} elements; "
+            f"raise max_elements or pass app= to narrow)"
+        )
    summary = "\n".join(summary_lines)

    if cap.png_b64 and cap.mode != "ax":
@ -458,18 +492,22 @@ def _capture_response(cap: CaptureResult) -> Any:
            ],
            "text_summary": summary,
            "meta": {"mode": cap.mode, "width": cap.width, "height": cap.height,
-                     "elements": len(cap.elements), "png_bytes": cap.png_bytes_len},
+                     "elements": total_elements, "png_bytes": cap.png_bytes_len},
        }
    # AX-only (or image missing): text path.
-    return json.dumps({
+    payload: Dict[str, Any] = {
        "mode": cap.mode,
        "width": cap.width,
        "height": cap.height,
        "app": cap.app,
        "window_title": cap.window_title,
-        "elements": [_element_to_dict(e) for e in cap.elements],
+        "elements": [_element_to_dict(e) for e in visible_elements],
+        "total_elements": total_elements,
        "summary": summary,
-    })
+    }
+    if truncated_elements:
+        payload["truncated_elements"] = truncated_elements
+    return json.dumps(payload)


 # ---------------------------------------------------------------------------