From 280dd4513a838791462088120d24a45aedcb1c74 Mon Sep 17 00:00:00 2001 From: briandevans <252620095+briandevans@users.noreply.github.com> Date: Mon, 11 May 2026 02:11:37 -0700 Subject: [PATCH] fix(computer-use): address Copilot review on max_elements cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four findings from Copilot's review on PR #22891, all in the AX elements-array cap added by 22fa1ed: 1. The truncation note ("response truncated to N of M elements") was appended unconditionally — including in the som/vision multimodal path, whose response carries a screenshot rather than an `elements` array. The note described a payload field that wasn't present. Moved the note into the AX-text branch where the array actually appears. 2. `_format_elements(cap.elements)` ran on the full untrimmed list with its own `max_lines=40` cap, so a caller passing `max_elements=10` would see summary lines referencing `#11..#40` even though the JSON `elements` array only held #1..#10. Format on `visible_elements` instead so the summary indices always exist in the response. 3. `_coerce_max_elements` enforced a lower bound but no upper bound, so `max_elements=10_000_000` silently disabled the safeguard and reintroduced the original context-blow-up. Added a hard cap (`_MAX_ALLOWED_MAX_ELEMENTS = 1000`) that clamps oversized values. 4. The schema string said "Default 100" but the property carried no `default` field, and claimed `max_elements` had no effect on som/ vision while the image-missing fallback path can still return an elements array. Added `"default": 100`, `"maximum": 1000`, and clarified the fallback-path wording. Each finding gets a regression test: - test_capture_ax_clamps_oversized_max_elements_to_hard_cap - test_capture_ax_summary_indices_match_returned_elements - test_capture_multimodal_summary_omits_truncation_note - test_schema_max_elements_documents_default_and_upper_bound Verified with `pytest tests/tools/test_computer_use.py` (53 passed, including the 5 new cases). Confirmed each new test fails on the pre-fix code path before applying the production change. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/tools/test_computer_use.py | 102 +++++++++++++++++++++++++++++++ tools/computer_use/schema.py | 25 +++++--- tools/computer_use/tool.py | 35 ++++++++--- 3 files changed, 143 insertions(+), 19 deletions(-) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index bbaaeb8fcea..4a140c36be3 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -83,6 +83,20 @@ class TestSchema: assert props["max_elements"]["type"] == "integer" assert props["max_elements"].get("minimum", 1) >= 1 + def test_schema_max_elements_documents_default_and_upper_bound(self): + """Schema description must agree with the runtime. The original PR + text said "Default 100" without a corresponding `default` field, and + had no upper bound — both Copilot findings. + """ + from tools.computer_use.schema import COMPUTER_USE_SCHEMA + from tools.computer_use.tool import ( + _DEFAULT_MAX_ELEMENTS, + _MAX_ALLOWED_MAX_ELEMENTS, + ) + prop = COMPUTER_USE_SCHEMA["parameters"]["properties"]["max_elements"] + assert prop.get("default") == _DEFAULT_MAX_ELEMENTS + assert prop.get("maximum") == _MAX_ALLOWED_MAX_ELEMENTS + class TestRegistration: def test_tool_registers_with_registry(self): @@ -443,6 +457,94 @@ class TestCaptureResponse: f"bad max_elements={bad!r} disabled the cap" ) + def test_capture_ax_clamps_oversized_max_elements_to_hard_cap(self): + """A caller passing a very large `max_elements` must not be able to + disable the safeguard. The cap is clamped to a hard upper bound so + the context-blow-up protection cannot be bypassed by argument. + """ + from tools.computer_use import tool as cu_tool + + fake_backend = self._ax_backend_with(5000) + cu_tool.reset_backend_for_tests() + with patch.object(cu_tool, "_get_backend", return_value=fake_backend): + out = cu_tool.handle_computer_use( + {"action": "capture", "mode": "ax", "max_elements": 10_000} + ) + parsed = json.loads(out) + assert len(parsed["elements"]) == cu_tool._MAX_ALLOWED_MAX_ELEMENTS + assert parsed["total_elements"] == 5000 + assert parsed["truncated_elements"] == 5000 - cu_tool._MAX_ALLOWED_MAX_ELEMENTS + + def test_capture_ax_summary_indices_match_returned_elements(self): + """When `max_elements` is below the human-summary's own line cap, the + summary must not index elements that aren't in the returned array. + Otherwise the model sees `#15` in the summary and finds no matching + entry in `elements`. + """ + from tools.computer_use import tool as cu_tool + + fake_backend = self._ax_backend_with(600) + cu_tool.reset_backend_for_tests() + with patch.object(cu_tool, "_get_backend", return_value=fake_backend): + out = cu_tool.handle_computer_use( + {"action": "capture", "mode": "ax", "max_elements": 5} + ) + parsed = json.loads(out) + returned_indices = {e["index"] for e in parsed["elements"]} + summary_lines = parsed["summary"].splitlines() + indexed_lines = [ln for ln in summary_lines if ln.lstrip().startswith("#")] + for ln in indexed_lines: + idx_token = ln.lstrip().split()[0].lstrip("#") + idx = int(idx_token) + assert idx in returned_indices, ( + f"summary references #{idx} but it is absent from elements payload " + f"(returned: {sorted(returned_indices)})" + ) + + def test_capture_multimodal_summary_omits_truncation_note(self): + """The som/vision multimodal envelope returns a screenshot, not an + `elements` array — so a "response truncated to N of M elements" + claim in the summary would be inaccurate. + """ + from tools.computer_use.backend import CaptureResult, UIElement + from tools.computer_use import tool as cu_tool + + fake_png = "iVBORw0KGgo=" + elements = [ + UIElement(index=i + 1, role="AXButton", label=f"el-{i}", bounds=(0, 0, 1, 1)) + for i in range(600) + ] + + class FakeBackend: + def start(self): pass + def stop(self): pass + def is_available(self): return True + def capture(self, mode="som", app=None): + return CaptureResult( + mode=mode, width=800, height=600, + png_b64=fake_png, elements=list(elements), + app="Obsidian", + ) + def click(self, **kw): ... + def drag(self, **kw): ... + def scroll(self, **kw): ... + def type_text(self, text): ... + def key(self, keys): ... + def list_apps(self): return [] + def focus_app(self, app, raise_window=False): ... + + cu_tool.reset_backend_for_tests() + with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()): + out = cu_tool.handle_computer_use({"action": "capture", "mode": "som"}) + + assert isinstance(out, dict) and out["_multimodal"] is True + text_part = next(p for p in out["content"] if p.get("type") == "text") + assert "truncated to" not in text_part["text"], ( + "multimodal response carries an image, not an elements array; " + "the truncation note describes a payload field that isn't present" + ) + assert "truncated to" not in out["text_summary"] + # --------------------------------------------------------------------------- # Anthropic adapter: multimodal tool-result conversion diff --git a/tools/computer_use/schema.py b/tools/computer_use/schema.py index e716387a6c0..b39ccf06aa9 100644 --- a/tools/computer_use/schema.py +++ b/tools/computer_use/schema.py @@ -79,18 +79,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = { "type": "integer", "description": ( "Optional cap on the AX `elements` array returned by " - "`action='capture'`. Default 100. Dense UIs (Electron " - "apps such as Obsidian or VS Code, JetBrains IDEs) can " - "publish 500+ AX nodes — capping prevents a single " - "capture from blowing session context. When the cap " - "trims the response, `total_elements` and " - "`truncated_elements` are surfaced in the result so " - "you can re-call with `app=` to narrow scope or raise " - "`max_elements` when the full tree is required. Has no " - "effect on `mode='som'` / `mode='vision'` (those return " - "a screenshot, not the elements array)." + "`action='capture'`. Default 100, hard maximum 1000. " + "Dense UIs (Electron apps such as Obsidian or VS Code, " + "JetBrains IDEs) can publish 500+ AX nodes — capping " + "prevents a single capture from blowing session " + "context. When the cap trims the response, " + "`total_elements` and `truncated_elements` are " + "surfaced in the result so you can re-call with " + "`app=` to narrow scope or raise `max_elements` when " + "the full tree is required. Has no effect on " + "`mode='som'` / `mode='vision'` when a screenshot is " + "included in the response; only the rare image-" + "missing fallback returns an `elements` array and is " + "subject to the cap." ), + "default": 100, "minimum": 1, + "maximum": 1000, }, # ── click / drag / scroll targeting ──────────────────── "element": { diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py index 029963eeeac..ab9339f4b6c 100644 --- a/tools/computer_use/tool.py +++ b/tools/computer_use/tool.py @@ -421,6 +421,10 @@ def _text_response(res: ActionResult) -> str: # can exhaust session context after a single capture. The model-facing # `max_elements` argument lets callers raise this when they need the full tree. _DEFAULT_MAX_ELEMENTS = 100 +# Hard upper bound on caller-supplied `max_elements`. Without this, a tool +# call passing a very large integer would silently disable the safeguard and +# reintroduce the original unbounded behavior. +_MAX_ALLOWED_MAX_ELEMENTS = 1000 def _coerce_max_elements(value: Any) -> int: @@ -428,7 +432,9 @@ def _coerce_max_elements(value: Any) -> int: Falls back to :data:`_DEFAULT_MAX_ELEMENTS` for missing / non-integer / sub-1 inputs so the cap can never be silently disabled by a malformed - tool-call argument. + tool-call argument. Clamps oversized values to + :data:`_MAX_ALLOWED_MAX_ELEMENTS` so a caller cannot bypass the + safeguard by passing a very large integer. """ if value is None: return _DEFAULT_MAX_ELEMENTS @@ -438,6 +444,8 @@ def _coerce_max_elements(value: Any) -> int: return _DEFAULT_MAX_ELEMENTS if n < 1: return _DEFAULT_MAX_ELEMENTS + if n > _MAX_ALLOWED_MAX_ELEMENTS: + return _MAX_ALLOWED_MAX_ELEMENTS return n @@ -446,7 +454,11 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME visible_elements = cap.elements[:max_elements] truncated_elements = max(0, total_elements - len(visible_elements)) - element_index = _format_elements(cap.elements) + # Index only what's actually surfaced in the response — otherwise the + # human-readable summary references element indices the model cannot + # find in the JSON `elements` array (e.g. max_elements=10 vs the default + # 40-line index window). + element_index = _format_elements(visible_elements) summary_lines = [ f"capture mode={cap.mode} {cap.width}x{cap.height}" + (f" app={cap.app}" if cap.app else "") @@ -455,12 +467,6 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME ] if element_index: summary_lines.extend(element_index) - if truncated_elements: - summary_lines.append( - f" (response truncated to {len(visible_elements)} of {total_elements} elements; " - f"raise max_elements or pass app= to narrow)" - ) - summary = "\n".join(summary_lines) if cap.png_b64 and cap.mode != "ax": # Decide whether to hand the screenshot to the auxiliary.vision @@ -483,6 +489,10 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME # JPEG: base64 starts with /9j/ PNG: starts with iVBOR _b64_prefix = cap.png_b64[:8] _mime = "image/jpeg" if _b64_prefix.startswith("/9j/") else "image/png" + # The multimodal response carries the screenshot, not the AX + # elements array, so a "response truncated to N of M elements" + # note would be inaccurate — skip it on this branch. + summary = "\n".join(summary_lines) return { "_multimodal": True, "content": [ @@ -494,7 +504,14 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME "meta": {"mode": cap.mode, "width": cap.width, "height": cap.height, "elements": total_elements, "png_bytes": cap.png_bytes_len}, } - # AX-only (or image missing): text path. + # AX-only (or image-missing fallback): text path actually carries the + # `elements` array, so the truncation note applies here. + if truncated_elements: + summary_lines.append( + f" (response truncated to {len(visible_elements)} of {total_elements} elements; " + f"raise max_elements or pass app= to narrow)" + ) + summary = "\n".join(summary_lines) payload: Dict[str, Any] = { "mode": cap.mode, "width": cap.width,