mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(computer-use): cap AX elements array to prevent context blowup (#22865)
`computer_use(action='capture', mode='ax')` returned the full AX element list verbatim in the JSON response. Dense Electron / Obsidian / JetBrains UIs publish 500+ AX nodes (one reproduction in #22865 returned 597 elements against Obsidian), so a single capture could consume enough context to trigger compression failures or render the session unusable. The human-readable `_format_elements` summary is already capped at 40 lines, so the truncation gap was invisible to anyone reading the summary output. Add a `max_elements` argument to the tool schema, default 100, that trims the AX `elements` array. When the cap fires, the response surfaces `total_elements` and `truncated_elements` and appends a "raise max_elements or pass app= to narrow" hint to the summary so the model knows the JSON view is partial and can re-issue with a tighter scope. Validation is centralized in `_coerce_max_elements`: missing / non-integer / sub-1 inputs fall back to the default cap, so the protection can never be silently disabled by a malformed tool-call argument. The cap only affects AX-mode JSON; `mode='som'` and `mode='vision'` keep returning a screenshot + image-aware summary unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
9e30ef224d
commit
bb694bad42
3 changed files with 168 additions and 7 deletions
|
|
@ -76,6 +76,13 @@ class TestSchema:
|
|||
modes = set(COMPUTER_USE_SCHEMA["parameters"]["properties"]["mode"]["enum"])
|
||||
assert modes == {"som", "vision", "ax"}
|
||||
|
||||
def test_schema_exposes_max_elements_cap_for_capture(self):
    """The tool schema declares ``max_elements`` as an integer property.

    If the schema declares a ``minimum`` for it, that minimum must be at
    least 1; an absent ``minimum`` is tolerated.
    """
    from tools.computer_use.schema import COMPUTER_USE_SCHEMA

    properties = COMPUTER_USE_SCHEMA["parameters"]["properties"]
    assert "max_elements" in properties

    spec = properties["max_elements"]
    assert spec["type"] == "integer"
    # Missing "minimum" defaults to 1 here, so only a declared value < 1 fails.
    assert spec.get("minimum", 1) >= 1
|
||||
|
||||
|
||||
class TestRegistration:
|
||||
def test_tool_registers_with_registry(self):
|
||||
|
|
@ -337,6 +344,105 @@ class TestCaptureResponse:
|
|||
assert "AXButton" in text_part["text"]
|
||||
assert "AXTextField" in text_part["text"]
|
||||
|
||||
def _ax_backend_with(self, count: int):
    """Build a stub backend whose ``capture`` reports ``count`` AX buttons.

    Elements are indexed from 1 and labelled ``el-0`` .. ``el-{count-1}``.
    Every other backend method is a no-op.
    """
    from tools.computer_use.backend import CaptureResult, UIElement

    nodes = []
    for offset in range(count):
        nodes.append(
            UIElement(
                index=offset + 1,
                role="AXButton",
                label=f"el-{offset}",
                bounds=(0, 0, 1, 1),
            )
        )

    class FakeBackend:
        def start(self):
            return None

        def stop(self):
            return None

        def is_available(self):
            return True

        def capture(self, mode="som", app=None):
            # Hand out a fresh list each call so callers cannot mutate the
            # shared fixture.
            return CaptureResult(
                mode=mode,
                width=800,
                height=600,
                png_b64="",
                elements=nodes.copy(),
                app="Obsidian",
            )

        def click(self, **kw):
            return None

        def drag(self, **kw):
            return None

        def scroll(self, **kw):
            return None

        def type_text(self, text):
            return None

        def key(self, keys):
            return None

        def list_apps(self):
            return []

        def focus_app(self, app, raise_window=False):
            return None

    return FakeBackend()
|
||||
|
||||
def test_capture_ax_caps_elements_at_default_for_dense_trees(self):
    """Regression for #22865: a 600-element AX capture is trimmed to the
    default cap instead of being emitted verbatim in the tool result.
    """
    from tools.computer_use import tool as cu_tool

    dense_total = 600
    backend = self._ax_backend_with(dense_total)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=backend):
        raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})

    result = json.loads(raw)
    default_cap = cu_tool._DEFAULT_MAX_ELEMENTS

    assert result["mode"] == "ax"
    assert result["total_elements"] == dense_total
    assert len(result["elements"]) == default_cap
    assert result["truncated_elements"] == dense_total - default_cap
    # The human-readable summary has to announce the truncation so the model
    # knows the JSON view is partial and can re-issue with a tighter scope.
    assert "truncated to" in result["summary"]
|
||||
|
||||
def test_capture_ax_honors_explicit_max_elements_override(self):
    """An explicit ``max_elements`` replaces the default cap: 250 of 600
    elements are returned and the remaining 350 are reported as truncated.
    """
    from tools.computer_use import tool as cu_tool

    request = {"action": "capture", "mode": "ax", "max_elements": 250}
    backend = self._ax_backend_with(600)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=backend):
        raw = cu_tool.handle_computer_use(request)

    result = json.loads(raw)
    assert len(result["elements"]) == 250
    assert result["truncated_elements"] == 350
|
||||
|
||||
def test_capture_ax_below_cap_is_unchanged(self):
    """Backwards-compat: a capture smaller than the cap returns every
    element, omits ``truncated_elements``, and adds no truncation hint to
    the summary.
    """
    from tools.computer_use import tool as cu_tool

    small_total = 5
    backend = self._ax_backend_with(small_total)
    cu_tool.reset_backend_for_tests()
    with patch.object(cu_tool, "_get_backend", return_value=backend):
        raw = cu_tool.handle_computer_use({"action": "capture", "mode": "ax"})

    result = json.loads(raw)
    assert len(result["elements"]) == small_total
    assert result["total_elements"] == small_total
    assert "truncated_elements" not in result
    assert "truncated to" not in result["summary"]
|
||||
|
||||
def test_capture_ax_invalid_max_elements_falls_back_to_default(self):
    """A malformed ``max_elements`` (non-numeric string, zero, negative)
    must leave the default cap in force rather than disabling it.
    """
    from tools.computer_use import tool as cu_tool

    backend = self._ax_backend_with(600)
    cu_tool.reset_backend_for_tests()

    malformed_values = ("not-a-number", 0, -10)
    for bad in malformed_values:
        request = {"action": "capture", "mode": "ax", "max_elements": bad}
        with patch.object(cu_tool, "_get_backend", return_value=backend):
            raw = cu_tool.handle_computer_use(request)
        returned = json.loads(raw)["elements"]
        assert len(returned) == cu_tool._DEFAULT_MAX_ELEMENTS, (
            f"bad max_elements={bad!r} disabled the cap"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Anthropic adapter: multimodal tool-result conversion
|
||||
|
|
|
|||
|
|
@ -75,6 +75,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
|
|||
"frontmost app's window or the whole screen."
|
||||
),
|
||||
},
|
||||
"max_elements": {
|
||||
"type": "integer",
|
||||
"description": (
|
||||
"Optional cap on the AX `elements` array returned by "
|
||||
"`action='capture'`. Default 100. Dense UIs (Electron "
|
||||
"apps such as Obsidian or VS Code, JetBrains IDEs) can "
|
||||
"publish 500+ AX nodes — capping prevents a single "
|
||||
"capture from blowing session context. When the cap "
|
||||
"trims the response, `total_elements` and "
|
||||
"`truncated_elements` are surfaced in the result so "
|
||||
"you can re-call with `app=` to narrow scope or raise "
|
||||
"`max_elements` when the full tree is required. Has no "
|
||||
"effect on `mode='som'` / `mode='vision'` (those return "
|
||||
"a screenshot, not the elements array)."
|
||||
),
|
||||
"minimum": 1,
|
||||
},
|
||||
# ── click / drag / scroll targeting ────────────────────
|
||||
"element": {
|
||||
"type": "integer",
|
||||
|
|
|
|||
|
|
@ -317,7 +317,7 @@ def _dispatch(backend: ComputerUseBackend, action: str, args: Dict[str, Any]) ->
|
|||
if mode not in {"som", "vision", "ax"}:
|
||||
return json.dumps({"error": f"bad mode {mode!r}; use som|vision|ax"})
|
||||
cap = backend.capture(mode=mode, app=args.get("app"))
|
||||
return _capture_response(cap)
|
||||
return _capture_response(cap, max_elements=_coerce_max_elements(args.get("max_elements")))
|
||||
|
||||
if action == "wait":
|
||||
seconds = float(args.get("seconds", 1.0))
|
||||
|
|
@ -416,16 +416,50 @@ def _text_response(res: ActionResult) -> str:
|
|||
return json.dumps(payload)
|
||||
|
||||
|
||||
def _capture_response(cap: CaptureResult) -> Any:
|
||||
# Default cap for the AX `elements` array returned by capture. Dense UIs
|
||||
# (Electron apps, Obsidian, JetBrains IDEs) can publish 500+ AX nodes, which
|
||||
# can exhaust session context after a single capture. The model-facing
|
||||
# `max_elements` argument lets callers raise this when they need the full tree.
|
||||
_DEFAULT_MAX_ELEMENTS = 100
|
||||
|
||||
|
||||
def _coerce_max_elements(value: Any) -> int:
|
||||
"""Validate the caller-supplied ``max_elements``.
|
||||
|
||||
Falls back to :data:`_DEFAULT_MAX_ELEMENTS` for missing / non-integer /
|
||||
sub-1 inputs so the cap can never be silently disabled by a malformed
|
||||
tool-call argument.
|
||||
"""
|
||||
if value is None:
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
try:
|
||||
n = int(value)
|
||||
except (TypeError, ValueError):
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
if n < 1:
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
return n
|
||||
|
||||
|
||||
def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEMENTS) -> Any:
|
||||
total_elements = len(cap.elements)
|
||||
visible_elements = cap.elements[:max_elements]
|
||||
truncated_elements = max(0, total_elements - len(visible_elements))
|
||||
|
||||
element_index = _format_elements(cap.elements)
|
||||
summary_lines = [
|
||||
f"capture mode={cap.mode} {cap.width}x{cap.height}"
|
||||
+ (f" app={cap.app}" if cap.app else "")
|
||||
+ (f" window={cap.window_title!r}" if cap.window_title else ""),
|
||||
f"{len(cap.elements)} interactable element(s):",
|
||||
f"{total_elements} interactable element(s):",
|
||||
]
|
||||
if element_index:
|
||||
summary_lines.extend(element_index)
|
||||
if truncated_elements:
|
||||
summary_lines.append(
|
||||
f" (response truncated to {len(visible_elements)} of {total_elements} elements; "
|
||||
f"raise max_elements or pass app= to narrow)"
|
||||
)
|
||||
summary = "\n".join(summary_lines)
|
||||
|
||||
if cap.png_b64 and cap.mode != "ax":
|
||||
|
|
@ -458,18 +492,22 @@ def _capture_response(cap: CaptureResult) -> Any:
|
|||
],
|
||||
"text_summary": summary,
|
||||
"meta": {"mode": cap.mode, "width": cap.width, "height": cap.height,
|
||||
"elements": len(cap.elements), "png_bytes": cap.png_bytes_len},
|
||||
"elements": total_elements, "png_bytes": cap.png_bytes_len},
|
||||
}
|
||||
# AX-only (or image missing): text path.
|
||||
return json.dumps({
|
||||
payload: Dict[str, Any] = {
|
||||
"mode": cap.mode,
|
||||
"width": cap.width,
|
||||
"height": cap.height,
|
||||
"app": cap.app,
|
||||
"window_title": cap.window_title,
|
||||
"elements": [_element_to_dict(e) for e in cap.elements],
|
||||
"elements": [_element_to_dict(e) for e in visible_elements],
|
||||
"total_elements": total_elements,
|
||||
"summary": summary,
|
||||
})
|
||||
}
|
||||
if truncated_elements:
|
||||
payload["truncated_elements"] = truncated_elements
|
||||
return json.dumps(payload)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue