fix(computer-use): cap AX elements array to prevent context blowup (#22865)

`computer_use(action='capture', mode='ax')` returned the full AX element
list verbatim in the JSON response. Dense Electron / Obsidian / JetBrains
UIs publish 500+ AX nodes (one reproduction in #22865 returned 597
elements against Obsidian), so a single capture could consume enough
context to trigger compression failures or render the session unusable.
The human-readable `_format_elements` summary is already capped at 40
lines, so the truncation gap was invisible to anyone reading the summary
output.

Add a `max_elements` argument to the tool schema, default 100, that
trims the AX `elements` array. When the cap fires, the response surfaces
`total_elements` and `truncated_elements` and appends a "raise
max_elements or pass app= to narrow" hint to the summary so the model
knows the JSON view is partial and can re-issue with a higher cap or a
tighter scope.

Validation is centralized in `_coerce_max_elements`: missing /
non-integer / sub-1 inputs fall back to the default cap, so the
protection can never be silently disabled by a malformed tool-call
argument. The cap only affects AX-mode JSON; `mode='som'` and
`mode='vision'` keep returning a screenshot + image-aware summary
unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
briandevans 2026-05-09 17:17:53 -07:00 committed by Teknium
parent 9e30ef224d
commit bb694bad42
3 changed files with 168 additions and 7 deletions

View file

@ -76,6 +76,13 @@ class TestSchema:
modes = set(COMPUTER_USE_SCHEMA["parameters"]["properties"]["mode"]["enum"])
assert modes == {"som", "vision", "ax"}
def test_schema_exposes_max_elements_cap_for_capture(self):
    """The tool schema must advertise an integer ``max_elements`` property.

    A declared ``minimum`` is optional, but when present it must be >= 1.
    """
    from tools.computer_use.schema import COMPUTER_USE_SCHEMA

    properties = COMPUTER_USE_SCHEMA["parameters"]["properties"]
    assert "max_elements" in properties

    max_elements_spec = properties["max_elements"]
    assert max_elements_spec["type"] == "integer"
    # A missing "minimum" is tolerated: the lookup defaults to 1.
    assert max_elements_spec.get("minimum", 1) >= 1
class TestRegistration:
def test_tool_registers_with_registry(self):
@ -337,6 +344,105 @@ class TestCaptureResponse:
assert "AXButton" in text_part["text"]
assert "AXTextField" in text_part["text"]
def _ax_backend_with(self, count: int):
    """Build a stub backend whose ``capture`` reports ``count`` AX elements.

    Every element is an ``AXButton`` labelled ``el-<n>`` (zero-based) with a
    one-based ``index`` and a 1x1 bounding box at the origin.
    """
    from tools.computer_use.backend import CaptureResult, UIElement

    ax_nodes = []
    for position in range(count):
        ax_nodes.append(
            UIElement(
                index=position + 1,
                role="AXButton",
                label=f"el-{position}",
                bounds=(0, 0, 1, 1),
            )
        )

    class FakeBackend:
        def start(self):
            pass

        def stop(self):
            pass

        def is_available(self):
            return True

        def capture(self, mode="som", app=None):
            # Each capture returns its own copy of the element list and no
            # screenshot payload (empty ``png_b64``).
            return CaptureResult(
                mode=mode,
                width=800,
                height=600,
                png_b64="",
                elements=list(ax_nodes),
                app="Obsidian",
            )

        # The remaining backend methods are inert stubs.
        def click(self, **kw):
            ...

        def drag(self, **kw):
            ...

        def scroll(self, **kw):
            ...

        def type_text(self, text):
            ...

        def key(self, keys):
            ...

        def list_apps(self):
            return []

        def focus_app(self, app, raise_window=False):
            ...

    return FakeBackend()
def test_capture_ax_caps_elements_at_default_for_dense_trees(self):
    """Regression for #22865: a 600-element AX tree (Electron-style) must be
    trimmed to the default cap instead of being emitted verbatim.
    """
    from tools.computer_use import tool as cu_tool

    total = 600
    backend = self._ax_backend_with(total)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=backend):
        raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})

    payload = json.loads(raw)
    default_cap = cu_tool._DEFAULT_MAX_ELEMENTS
    assert payload["mode"] == "ax"
    assert payload["total_elements"] == total
    assert len(payload["elements"]) == default_cap
    assert payload["truncated_elements"] == total - default_cap
    # The human-readable summary has to flag the truncation too, so the
    # model can tell the JSON view is partial and re-issue the capture.
    assert "truncated to" in payload["summary"]
def test_capture_ax_honors_explicit_max_elements_override(self):
    """An explicit ``max_elements`` replaces the default cap."""
    from tools.computer_use import tool as cu_tool

    backend = self._ax_backend_with(600)
    cu_tool.reset_backend_for_tests()
    request = {"action": "capture", "mode": "ax", "max_elements": 250}
    with patch.object(cu_tool, "_get_backend", return_value=backend):
        raw = cu_tool.handle_computer_use(request)

    payload = json.loads(raw)
    # 600 available, 250 requested -> 350 left out of the response.
    assert len(payload["elements"]) == 250
    assert payload["truncated_elements"] == 350
def test_capture_ax_below_cap_is_unchanged(self):
    """Captures smaller than the cap are returned whole, with no
    ``truncated_elements`` key and no truncation note in the summary.
    """
    from tools.computer_use import tool as cu_tool

    small_count = 5
    backend = self._ax_backend_with(small_count)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=backend):
        raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})

    payload = json.loads(raw)
    assert len(payload["elements"]) == small_count
    assert payload["total_elements"] == small_count
    assert "truncated_elements" not in payload
    assert "truncated to" not in payload["summary"]
def test_capture_ax_invalid_max_elements_falls_back_to_default(self):
    """A malformed ``max_elements`` (non-numeric string, zero, negative)
    must leave the default cap in force rather than lifting it.
    """
    from tools.computer_use import tool as cu_tool

    backend = self._ax_backend_with(600)
    cu_tool.reset_backend_for_tests()
    malformed_values = ("not-a-number", 0, -10)
    for bad in malformed_values:
        request = {"action": "capture", "mode": "ax", "max_elements": bad}
        with patch.object(cu_tool, "_get_backend", return_value=backend):
            raw = cu_tool.handle_computer_use(request)
        payload = json.loads(raw)
        returned = len(payload["elements"])
        assert returned == cu_tool._DEFAULT_MAX_ELEMENTS, (
            f"bad max_elements={bad!r} disabled the cap"
        )
# ---------------------------------------------------------------------------
# Anthropic adapter: multimodal tool-result conversion

View file

@ -75,6 +75,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
"frontmost app's window or the whole screen."
),
},
"max_elements": {
"type": "integer",
"description": (
"Optional cap on the AX `elements` array returned by "
"`action='capture'`. Default 100. Dense UIs (Electron "
"apps such as Obsidian or VS Code, JetBrains IDEs) can "
"publish 500+ AX nodes — capping prevents a single "
"capture from blowing session context. When the cap "
"trims the response, `total_elements` and "
"`truncated_elements` are surfaced in the result so "
"you can re-call with `app=` to narrow scope or raise "
"`max_elements` when the full tree is required. Has no "
"effect on `mode='som'` / `mode='vision'` (those return "
"a screenshot, not the elements array)."
),
"minimum": 1,
},
# ── click / drag / scroll targeting ────────────────────
"element": {
"type": "integer",

View file

@ -317,7 +317,7 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) ->
if mode not in {"som", "vision", "ax"}:
return json.dumps({"error": f"bad mode {mode!r}; use som|vision|ax"})
cap = backend.capture(mode=mode, app=args.get("app"))
return _capture_response(cap)
return _capture_response(cap, max_elements=_coerce_max_elements(args.get("max_elements")))
if action == "wait":
seconds = float(args.get("seconds", 1.0))
@ -416,16 +416,50 @@ def _text_response(res: ActionResult) -> str:
return json.dumps(payload)
def _capture_response(cap: CaptureResult) -> Any:
# Default cap for the AX `elements` array returned by capture. Dense UIs
# (Electron apps, Obsidian, JetBrains IDEs) can publish 500+ AX nodes, which
# can exhaust session context after a single capture. The model-facing
# `max_elements` argument lets callers raise this when they need the full tree.
_DEFAULT_MAX_ELEMENTS = 100


def _coerce_max_elements(value: Any) -> int:
    """Validate the caller-supplied ``max_elements``.

    Falls back to :data:`_DEFAULT_MAX_ELEMENTS` for missing, non-numeric,
    non-finite, or sub-1 inputs, so a malformed tool-call argument can
    neither disable the cap nor crash the capture.

    Args:
        value: Raw ``max_elements`` argument from the tool call; may be
            ``None`` or any other value.

    Returns:
        ``int(value)`` when that conversion succeeds and yields at least 1,
        otherwise :data:`_DEFAULT_MAX_ELEMENTS`.
    """
    if value is None:
        return _DEFAULT_MAX_ELEMENTS
    try:
        n = int(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (list, dict, ...).
        # ValueError: non-numeric string, or float NaN.
        # OverflowError: float +/-infinity, which ``json.loads`` accepts
        # as the literal ``Infinity``.
        return _DEFAULT_MAX_ELEMENTS
    return n if n >= 1 else _DEFAULT_MAX_ELEMENTS
def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEMENTS) -> Any:
total_elements = len(cap.elements)
visible_elements = cap.elements[:max_elements]
truncated_elements = max(0, total_elements - len(visible_elements))
element_index = _format_elements(cap.elements)
summary_lines = [
f"capture mode={cap.mode} {cap.width}x{cap.height}"
+ (f" app={cap.app}" if cap.app else "")
+ (f" window={cap.window_title!r}" if cap.window_title else ""),
f"{len(cap.elements)} interactable element(s):",
f"{total_elements} interactable element(s):",
]
if element_index:
summary_lines.extend(element_index)
if truncated_elements:
summary_lines.append(
f" (response truncated to {len(visible_elements)} of {total_elements} elements; "
f"raise max_elements or pass app= to narrow)"
)
summary = "\n".join(summary_lines)
if cap.png_b64 and cap.mode != "ax":
@ -458,18 +492,22 @@ def _capture_response(cap: CaptureResult) -> Any:
],
"text_summary": summary,
"meta": {"mode": cap.mode, "width": cap.width, "height": cap.height,
"elements": len(cap.elements), "png_bytes": cap.png_bytes_len},
"elements": total_elements, "png_bytes": cap.png_bytes_len},
}
# AX-only (or image missing): text path.
return json.dumps({
payload: Dict[str, Any] = {
"mode": cap.mode,
"width": cap.width,
"height": cap.height,
"app": cap.app,
"window_title": cap.window_title,
"elements": [_element_to_dict(e) for e in cap.elements],
"elements": [_element_to_dict(e) for e in visible_elements],
"total_elements": total_elements,
"summary": summary,
})
}
if truncated_elements:
payload["truncated_elements"] = truncated_elements
return json.dumps(payload)
# ---------------------------------------------------------------------------