From 280dd4513a838791462088120d24a45aedcb1c74 Mon Sep 17 00:00:00 2001
From: briandevans <252620095+briandevans@users.noreply.github.com>
Date: Mon, 11 May 2026 02:11:37 -0700
Subject: [PATCH] fix(computer-use): address Copilot review on max_elements cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four findings from Copilot's review on PR #22891, all in the AX
elements-array cap added by 22fa1ed:

1. The truncation note ("response truncated to N of M elements") was
   appended unconditionally — including in the som/vision multimodal
   path, whose response carries a screenshot rather than an `elements`
   array. The note described a payload field that wasn't present.
   Moved the note into the AX-text branch where the array actually
   appears.

2. `_format_elements(cap.elements)` ran on the full untrimmed list with
   its own `max_lines=40` cap, so a caller passing `max_elements=10`
   would see summary lines referencing `#11..#40` even though the JSON
   `elements` array only held #1..#10. Format on `visible_elements`
   instead so the summary indices always exist in the response.

3. `_coerce_max_elements` enforced a lower bound but no upper bound,
   so `max_elements=10_000_000` silently disabled the safeguard and
   reintroduced the original context-blow-up. Added a hard cap
   (`_MAX_ALLOWED_MAX_ELEMENTS = 1000`) that clamps oversized values.

4. The schema string said "Default 100" but the property carried no
   `default` field, and claimed `max_elements` had no effect on som/
   vision while the image-missing fallback path can still return an
   elements array. Added `"default": 100`, `"maximum": 1000`, and
   clarified the fallback-path wording.

Each finding gets a regression test:

- test_capture_ax_clamps_oversized_max_elements_to_hard_cap
- test_capture_ax_summary_indices_match_returned_elements
- test_capture_multimodal_summary_omits_truncation_note
- test_schema_max_elements_documents_default_and_upper_bound

Verified with `pytest tests/tools/test_computer_use.py` (53 passed,
including the 5 new cases). Confirmed each new test fails on the
pre-fix code path before applying the production change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/tools/test_computer_use.py | 102 +++++++++++++++++++++++++++++++
 tools/computer_use/schema.py     |  25 +++++---
 tools/computer_use/tool.py       |  35 ++++++++---
 3 files changed, 143 insertions(+), 19 deletions(-)

diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py
index bbaaeb8fcea..4a140c36be3 100644
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@@ -83,6 +83,20 @@ class TestSchema:
         assert props["max_elements"]["type"] == "integer"
         assert props["max_elements"].get("minimum", 1) >= 1
 
+    def test_schema_max_elements_documents_default_and_upper_bound(self):
+        """Schema description must agree with the runtime. The original PR
+        text said "Default 100" without a corresponding `default` field, and
+        had no upper bound — both Copilot findings.
+        """
+        from tools.computer_use.schema import COMPUTER_USE_SCHEMA
+        from tools.computer_use.tool import (
+            _DEFAULT_MAX_ELEMENTS,
+            _MAX_ALLOWED_MAX_ELEMENTS,
+        )
+        prop = COMPUTER_USE_SCHEMA["parameters"]["properties"]["max_elements"]
+        assert prop.get("default") == _DEFAULT_MAX_ELEMENTS
+        assert prop.get("maximum") == _MAX_ALLOWED_MAX_ELEMENTS
+
 
 class TestRegistration:
     def test_tool_registers_with_registry(self):
@@ -443,6 +457,94 @@ class TestCaptureResponse:
                 f"bad max_elements={bad!r} disabled the cap"
             )
 
+    def test_capture_ax_clamps_oversized_max_elements_to_hard_cap(self):
+        """A caller passing a very large `max_elements` must not be able to
+        disable the safeguard. The cap is clamped to a hard upper bound so
+        the context-blow-up protection cannot be bypassed by argument.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        fake_backend = self._ax_backend_with(5000)
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=fake_backend):
+            out = cu_tool.handle_computer_use(
+                {"action": "capture", "mode": "ax", "max_elements": 10_000}
+            )
+        parsed = json.loads(out)
+        assert len(parsed["elements"]) == cu_tool._MAX_ALLOWED_MAX_ELEMENTS
+        assert parsed["total_elements"] == 5000
+        assert parsed["truncated_elements"] == 5000 - cu_tool._MAX_ALLOWED_MAX_ELEMENTS
+
+    def test_capture_ax_summary_indices_match_returned_elements(self):
+        """When `max_elements` is below the human-summary's own line cap, the
+        summary must not index elements that aren't in the returned array.
+        Otherwise the model sees `#15` in the summary and finds no matching
+        entry in `elements`.
+        """
+        from tools.computer_use import tool as cu_tool
+
+        fake_backend = self._ax_backend_with(600)
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=fake_backend):
+            out = cu_tool.handle_computer_use(
+                {"action": "capture", "mode": "ax", "max_elements": 5}
+            )
+        parsed = json.loads(out)
+        returned_indices = {e["index"] for e in parsed["elements"]}
+        summary_lines = parsed["summary"].splitlines()
+        indexed_lines = [ln for ln in summary_lines if ln.lstrip().startswith("#")]
+        for ln in indexed_lines:
+            idx_token = ln.lstrip().split()[0].lstrip("#")
+            idx = int(idx_token)
+            assert idx in returned_indices, (
+                f"summary references #{idx} but it is absent from elements payload "
+                f"(returned: {sorted(returned_indices)})"
+            )
+
+    def test_capture_multimodal_summary_omits_truncation_note(self):
+        """The som/vision multimodal envelope returns a screenshot, not an
+        `elements` array — so a "response truncated to N of M elements"
+        claim in the summary would be inaccurate.
+        """
+        from tools.computer_use.backend import CaptureResult, UIElement
+        from tools.computer_use import tool as cu_tool
+
+        fake_png = "iVBORw0KGgo="
+        elements = [
+            UIElement(index=i + 1, role="AXButton", label=f"el-{i}", bounds=(0, 0, 1, 1))
+            for i in range(600)
+        ]
+
+        class FakeBackend:
+            def start(self): pass
+            def stop(self): pass
+            def is_available(self): return True
+            def capture(self, mode="som", app=None):
+                return CaptureResult(
+                    mode=mode, width=800, height=600,
+                    png_b64=fake_png, elements=list(elements),
+                    app="Obsidian",
+                )
+            def click(self, **kw): ...
+            def drag(self, **kw): ...
+            def scroll(self, **kw): ...
+            def type_text(self, text): ...
+            def key(self, keys): ...
+            def list_apps(self): return []
+            def focus_app(self, app, raise_window=False): ...
+
+        cu_tool.reset_backend_for_tests()
+        with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()):
+            out = cu_tool.handle_computer_use({"action": "capture", "mode": "som"})
+
+        assert isinstance(out, dict) and out["_multimodal"] is True
+        text_part = next(p for p in out["content"] if p.get("type") == "text")
+        assert "truncated to" not in text_part["text"], (
+            "multimodal response carries an image, not an elements array; "
+            "the truncation note describes a payload field that isn't present"
+        )
+        assert "truncated to" not in out["text_summary"]
+
 
 # ---------------------------------------------------------------------------
 # Anthropic adapter: multimodal tool-result conversion
diff --git a/tools/computer_use/schema.py b/tools/computer_use/schema.py
index e716387a6c0..b39ccf06aa9 100644
--- a/tools/computer_use/schema.py
+++ b/tools/computer_use/schema.py
@@ -79,18 +79,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
                 "type": "integer",
                 "description": (
                     "Optional cap on the AX `elements` array returned by "
-                    "`action='capture'`. Default 100. Dense UIs (Electron "
-                    "apps such as Obsidian or VS Code, JetBrains IDEs) can "
-                    "publish 500+ AX nodes — capping prevents a single "
-                    "capture from blowing session context. When the cap "
-                    "trims the response, `total_elements` and "
-                    "`truncated_elements` are surfaced in the result so "
-                    "you can re-call with `app=` to narrow scope or raise "
-                    "`max_elements` when the full tree is required. Has no "
-                    "effect on `mode='som'` / `mode='vision'` (those return "
-                    "a screenshot, not the elements array)."
+                    "`action='capture'`. Default 100, hard maximum 1000. "
+                    "Dense UIs (Electron apps such as Obsidian or VS Code, "
+                    "JetBrains IDEs) can publish 500+ AX nodes — capping "
+                    "prevents a single capture from blowing session "
+                    "context. When the cap trims the response, "
+                    "`total_elements` and `truncated_elements` are "
+                    "surfaced in the result so you can re-call with "
+                    "`app=` to narrow scope or raise `max_elements` when "
+                    "the full tree is required. Has no effect on "
+                    "`mode='som'` / `mode='vision'` when a screenshot is "
+                    "included in the response; only the rare image-"
+                    "missing fallback returns an `elements` array and is "
+                    "subject to the cap."
                 ),
+                "default": 100,
                 "minimum": 1,
+                "maximum": 1000,
             },
             # ── click / drag / scroll targeting ────────────────────
             "element": {
diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py
index 029963eeeac..ab9339f4b6c 100644
--- a/tools/computer_use/tool.py
+++ b/tools/computer_use/tool.py
@@ -421,6 +421,10 @@ def _text_response(res: ActionResult) -> str:
 # can exhaust session context after a single capture. The model-facing
 # `max_elements` argument lets callers raise this when they need the full tree.
 _DEFAULT_MAX_ELEMENTS = 100
+# Hard upper bound on caller-supplied `max_elements`. Without this, a tool
+# call passing a very large integer would silently disable the safeguard and
+# reintroduce the original unbounded behavior.
+_MAX_ALLOWED_MAX_ELEMENTS = 1000
 
 
 def _coerce_max_elements(value: Any) -> int:
@@ -428,7 +432,9 @@ def _coerce_max_elements(value: Any) -> int:
 
     Falls back to :data:`_DEFAULT_MAX_ELEMENTS` for missing / non-integer /
     sub-1 inputs so the cap can never be silently disabled by a malformed
-    tool-call argument.
+    tool-call argument. Clamps oversized values to
+    :data:`_MAX_ALLOWED_MAX_ELEMENTS` so a caller cannot bypass the
+    safeguard by passing a very large integer.
     """
     if value is None:
         return _DEFAULT_MAX_ELEMENTS
@@ -438,6 +444,8 @@ def _coerce_max_elements(value: Any) -> int:
         return _DEFAULT_MAX_ELEMENTS
     if n < 1:
         return _DEFAULT_MAX_ELEMENTS
+    if n > _MAX_ALLOWED_MAX_ELEMENTS:
+        return _MAX_ALLOWED_MAX_ELEMENTS
     return n
 
 
@@ -446,7 +454,11 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
     visible_elements = cap.elements[:max_elements]
     truncated_elements = max(0, total_elements - len(visible_elements))
 
-    element_index = _format_elements(cap.elements)
+    # Index only what's actually surfaced in the response — otherwise the
+    # human-readable summary references element indices the model cannot
+    # find in the JSON `elements` array (e.g. max_elements=10 vs the default
+    # 40-line index window).
+    element_index = _format_elements(visible_elements)
     summary_lines = [
         f"capture mode={cap.mode} {cap.width}x{cap.height}"
         + (f" app={cap.app}" if cap.app else "")
@@ -455,12 +467,6 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
     ]
     if element_index:
         summary_lines.extend(element_index)
-    if truncated_elements:
-        summary_lines.append(
-            f"  (response truncated to {len(visible_elements)} of {total_elements} elements; "
-            f"raise max_elements or pass app= to narrow)"
-        )
-    summary = "\n".join(summary_lines)
 
     if cap.png_b64 and cap.mode != "ax":
         # Decide whether to hand the screenshot to the auxiliary.vision
@@ -483,6 +489,10 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
         # JPEG: base64 starts with /9j/   PNG: starts with iVBOR
         _b64_prefix = cap.png_b64[:8]
         _mime = "image/jpeg" if _b64_prefix.startswith("/9j/") else "image/png"
+        # The multimodal response carries the screenshot, not the AX
+        # elements array, so a "response truncated to N of M elements"
+        # note would be inaccurate — skip it on this branch.
+        summary = "\n".join(summary_lines)
         return {
             "_multimodal": True,
             "content": [
@@ -494,7 +504,14 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
             "meta": {"mode": cap.mode, "width": cap.width, "height": cap.height,
                      "elements": total_elements, "png_bytes": cap.png_bytes_len},
         }
-    # AX-only (or image missing): text path.
+    # AX-only (or image-missing fallback): text path actually carries the
+    # `elements` array, so the truncation note applies here.
+    if truncated_elements:
+        summary_lines.append(
+            f"  (response truncated to {len(visible_elements)} of {total_elements} elements; "
+            f"raise max_elements or pass app= to narrow)"
+        )
+    summary = "\n".join(summary_lines)
     payload: Dict[str, Any] = {
         "mode": cap.mode,
         "width": cap.width,