mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-02 07:11:49 +00:00
fix(computer-use): address Copilot review on max_elements cap
Four findings from Copilot's review on PR #22891, all in the AX elements-array cap added by 22fa1ed: 1. The truncation note ("response truncated to N of M elements") was appended unconditionally — including in the som/vision multimodal path, whose response carries a screenshot rather than an `elements` array. The note described a payload field that wasn't present. Moved the note into the AX-text branch where the array actually appears. 2. `_format_elements(cap.elements)` ran on the full untrimmed list with its own `max_lines=40` cap, so a caller passing `max_elements=10` would see summary lines referencing `#11..#40` even though the JSON `elements` array only held #1..#10. Format on `visible_elements` instead so the summary indices always exist in the response. 3. `_coerce_max_elements` enforced a lower bound but no upper bound, so `max_elements=10_000_000` silently disabled the safeguard and reintroduced the original context-blow-up. Added a hard cap (`_MAX_ALLOWED_MAX_ELEMENTS = 1000`) that clamps oversized values. 4. The schema string said "Default 100" but the property carried no `default` field, and claimed `max_elements` had no effect on som/ vision while the image-missing fallback path can still return an elements array. Added `"default": 100`, `"maximum": 1000`, and clarified the fallback-path wording. Each finding gets a regression test: - test_capture_ax_clamps_oversized_max_elements_to_hard_cap - test_capture_ax_summary_indices_match_returned_elements - test_capture_multimodal_summary_omits_truncation_note - test_schema_max_elements_documents_default_and_upper_bound Verified with `pytest tests/tools/test_computer_use.py` (53 passed, including the 5 new cases). Confirmed each new test fails on the pre-fix code path before applying the production change. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bb694bad42
commit
280dd4513a
3 changed files with 143 additions and 19 deletions
|
|
@ -83,6 +83,20 @@ class TestSchema:
|
|||
assert props["max_elements"]["type"] == "integer"
|
||||
assert props["max_elements"].get("minimum", 1) >= 1
|
||||
|
||||
def test_schema_max_elements_documents_default_and_upper_bound(self):
|
||||
"""Schema description must agree with the runtime. The original PR
|
||||
text said "Default 100" without a corresponding `default` field, and
|
||||
had no upper bound — both Copilot findings.
|
||||
"""
|
||||
from tools.computer_use.schema import COMPUTER_USE_SCHEMA
|
||||
from tools.computer_use.tool import (
|
||||
_DEFAULT_MAX_ELEMENTS,
|
||||
_MAX_ALLOWED_MAX_ELEMENTS,
|
||||
)
|
||||
prop = COMPUTER_USE_SCHEMA["parameters"]["properties"]["max_elements"]
|
||||
assert prop.get("default") == _DEFAULT_MAX_ELEMENTS
|
||||
assert prop.get("maximum") == _MAX_ALLOWED_MAX_ELEMENTS
|
||||
|
||||
|
||||
class TestRegistration:
|
||||
def test_tool_registers_with_registry(self):
|
||||
|
|
@ -443,6 +457,94 @@ class TestCaptureResponse:
|
|||
f"bad max_elements={bad!r} disabled the cap"
|
||||
)
|
||||
|
||||
def test_capture_ax_clamps_oversized_max_elements_to_hard_cap(self):
|
||||
"""A caller passing a very large `max_elements` must not be able to
|
||||
disable the safeguard. The cap is clamped to a hard upper bound so
|
||||
the context-blow-up protection cannot be bypassed by argument.
|
||||
"""
|
||||
from tools.computer_use import tool as cu_tool
|
||||
|
||||
fake_backend = self._ax_backend_with(5000)
|
||||
cu_tool.reset_backend_for_tests()
|
||||
with patch.object(cu_tool, "_get_backend", return_value=fake_backend):
|
||||
out = cu_tool.handle_computer_use(
|
||||
{"action": "capture", "mode": "ax", "max_elements": 10_000}
|
||||
)
|
||||
parsed = json.loads(out)
|
||||
assert len(parsed["elements"]) == cu_tool._MAX_ALLOWED_MAX_ELEMENTS
|
||||
assert parsed["total_elements"] == 5000
|
||||
assert parsed["truncated_elements"] == 5000 - cu_tool._MAX_ALLOWED_MAX_ELEMENTS
|
||||
|
||||
def test_capture_ax_summary_indices_match_returned_elements(self):
|
||||
"""When `max_elements` is below the human-summary's own line cap, the
|
||||
summary must not index elements that aren't in the returned array.
|
||||
Otherwise the model sees `#15` in the summary and finds no matching
|
||||
entry in `elements`.
|
||||
"""
|
||||
from tools.computer_use import tool as cu_tool
|
||||
|
||||
fake_backend = self._ax_backend_with(600)
|
||||
cu_tool.reset_backend_for_tests()
|
||||
with patch.object(cu_tool, "_get_backend", return_value=fake_backend):
|
||||
out = cu_tool.handle_computer_use(
|
||||
{"action": "capture", "mode": "ax", "max_elements": 5}
|
||||
)
|
||||
parsed = json.loads(out)
|
||||
returned_indices = {e["index"] for e in parsed["elements"]}
|
||||
summary_lines = parsed["summary"].splitlines()
|
||||
indexed_lines = [ln for ln in summary_lines if ln.lstrip().startswith("#")]
|
||||
for ln in indexed_lines:
|
||||
idx_token = ln.lstrip().split()[0].lstrip("#")
|
||||
idx = int(idx_token)
|
||||
assert idx in returned_indices, (
|
||||
f"summary references #{idx} but it is absent from elements payload "
|
||||
f"(returned: {sorted(returned_indices)})"
|
||||
)
|
||||
|
||||
def test_capture_multimodal_summary_omits_truncation_note(self):
|
||||
"""The som/vision multimodal envelope returns a screenshot, not an
|
||||
`elements` array — so a "response truncated to N of M elements"
|
||||
claim in the summary would be inaccurate.
|
||||
"""
|
||||
from tools.computer_use.backend import CaptureResult, UIElement
|
||||
from tools.computer_use import tool as cu_tool
|
||||
|
||||
fake_png = "iVBORw0KGgo="
|
||||
elements = [
|
||||
UIElement(index=i + 1, role="AXButton", label=f"el-{i}", bounds=(0, 0, 1, 1))
|
||||
for i in range(600)
|
||||
]
|
||||
|
||||
class FakeBackend:
|
||||
def start(self): pass
|
||||
def stop(self): pass
|
||||
def is_available(self): return True
|
||||
def capture(self, mode="som", app=None):
|
||||
return CaptureResult(
|
||||
mode=mode, width=800, height=600,
|
||||
png_b64=fake_png, elements=list(elements),
|
||||
app="Obsidian",
|
||||
)
|
||||
def click(self, **kw): ...
|
||||
def drag(self, **kw): ...
|
||||
def scroll(self, **kw): ...
|
||||
def type_text(self, text): ...
|
||||
def key(self, keys): ...
|
||||
def list_apps(self): return []
|
||||
def focus_app(self, app, raise_window=False): ...
|
||||
|
||||
cu_tool.reset_backend_for_tests()
|
||||
with patch.object(cu_tool, "_get_backend", return_value=FakeBackend()):
|
||||
out = cu_tool.handle_computer_use({"action": "capture", "mode": "som"})
|
||||
|
||||
assert isinstance(out, dict) and out["_multimodal"] is True
|
||||
text_part = next(p for p in out["content"] if p.get("type") == "text")
|
||||
assert "truncated to" not in text_part["text"], (
|
||||
"multimodal response carries an image, not an elements array; "
|
||||
"the truncation note describes a payload field that isn't present"
|
||||
)
|
||||
assert "truncated to" not in out["text_summary"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Anthropic adapter: multimodal tool-result conversion
|
||||
|
|
|
|||
|
|
@ -79,18 +79,23 @@ COMPUTER_USE_SCHEMA: Dict[str, Any] = {
|
|||
"type": "integer",
|
||||
"description": (
|
||||
"Optional cap on the AX `elements` array returned by "
|
||||
"`action='capture'`. Default 100. Dense UIs (Electron "
|
||||
"apps such as Obsidian or VS Code, JetBrains IDEs) can "
|
||||
"publish 500+ AX nodes — capping prevents a single "
|
||||
"capture from blowing session context. When the cap "
|
||||
"trims the response, `total_elements` and "
|
||||
"`truncated_elements` are surfaced in the result so "
|
||||
"you can re-call with `app=` to narrow scope or raise "
|
||||
"`max_elements` when the full tree is required. Has no "
|
||||
"effect on `mode='som'` / `mode='vision'` (those return "
|
||||
"a screenshot, not the elements array)."
|
||||
"`action='capture'`. Default 100, hard maximum 1000. "
|
||||
"Dense UIs (Electron apps such as Obsidian or VS Code, "
|
||||
"JetBrains IDEs) can publish 500+ AX nodes — capping "
|
||||
"prevents a single capture from blowing session "
|
||||
"context. When the cap trims the response, "
|
||||
"`total_elements` and `truncated_elements` are "
|
||||
"surfaced in the result so you can re-call with "
|
||||
"`app=` to narrow scope or raise `max_elements` when "
|
||||
"the full tree is required. Has no effect on "
|
||||
"`mode='som'` / `mode='vision'` when a screenshot is "
|
||||
"included in the response; only the rare image-"
|
||||
"missing fallback returns an `elements` array and is "
|
||||
"subject to the cap."
|
||||
),
|
||||
"default": 100,
|
||||
"minimum": 1,
|
||||
"maximum": 1000,
|
||||
},
|
||||
# ── click / drag / scroll targeting ────────────────────
|
||||
"element": {
|
||||
|
|
|
|||
|
|
@ -421,6 +421,10 @@ def _text_response(res: ActionResult) -> str:
|
|||
# can exhaust session context after a single capture. The model-facing
|
||||
# `max_elements` argument lets callers raise this when they need the full tree.
|
||||
_DEFAULT_MAX_ELEMENTS = 100
|
||||
# Hard upper bound on caller-supplied `max_elements`. Without this, a tool
|
||||
# call passing a very large integer would silently disable the safeguard and
|
||||
# reintroduce the original unbounded behavior.
|
||||
_MAX_ALLOWED_MAX_ELEMENTS = 1000
|
||||
|
||||
|
||||
def _coerce_max_elements(value: Any) -> int:
|
||||
|
|
@ -428,7 +432,9 @@ def _coerce_max_elements(value: Any) -> int:
|
|||
|
||||
Falls back to :data:`_DEFAULT_MAX_ELEMENTS` for missing / non-integer /
|
||||
sub-1 inputs so the cap can never be silently disabled by a malformed
|
||||
tool-call argument.
|
||||
tool-call argument. Clamps oversized values to
|
||||
:data:`_MAX_ALLOWED_MAX_ELEMENTS` so a caller cannot bypass the
|
||||
safeguard by passing a very large integer.
|
||||
"""
|
||||
if value is None:
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
|
|
@ -438,6 +444,8 @@ def _coerce_max_elements(value: Any) -> int:
|
|||
return _DEFAULT_MAX_ELEMENTS
|
||||
if n < 1:
|
||||
return _DEFAULT_MAX_ELEMENTS
|
||||
if n > _MAX_ALLOWED_MAX_ELEMENTS:
|
||||
return _MAX_ALLOWED_MAX_ELEMENTS
|
||||
return n
|
||||
|
||||
|
||||
|
|
@ -446,7 +454,11 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
|
|||
visible_elements = cap.elements[:max_elements]
|
||||
truncated_elements = max(0, total_elements - len(visible_elements))
|
||||
|
||||
element_index = _format_elements(cap.elements)
|
||||
# Index only what's actually surfaced in the response — otherwise the
|
||||
# human-readable summary references element indices the model cannot
|
||||
# find in the JSON `elements` array (e.g. max_elements=10 vs the default
|
||||
# 40-line index window).
|
||||
element_index = _format_elements(visible_elements)
|
||||
summary_lines = [
|
||||
f"capture mode={cap.mode} {cap.width}x{cap.height}"
|
||||
+ (f" app={cap.app}" if cap.app else "")
|
||||
|
|
@ -455,12 +467,6 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
|
|||
]
|
||||
if element_index:
|
||||
summary_lines.extend(element_index)
|
||||
if truncated_elements:
|
||||
summary_lines.append(
|
||||
f" (response truncated to {len(visible_elements)} of {total_elements} elements; "
|
||||
f"raise max_elements or pass app= to narrow)"
|
||||
)
|
||||
summary = "\n".join(summary_lines)
|
||||
|
||||
if cap.png_b64 and cap.mode != "ax":
|
||||
# Decide whether to hand the screenshot to the auxiliary.vision
|
||||
|
|
@ -483,6 +489,10 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
|
|||
# JPEG: base64 starts with /9j/ PNG: starts with iVBOR
|
||||
_b64_prefix = cap.png_b64[:8]
|
||||
_mime = "image/jpeg" if _b64_prefix.startswith("/9j/") else "image/png"
|
||||
# The multimodal response carries the screenshot, not the AX
|
||||
# elements array, so a "response truncated to N of M elements"
|
||||
# note would be inaccurate — skip it on this branch.
|
||||
summary = "\n".join(summary_lines)
|
||||
return {
|
||||
"_multimodal": True,
|
||||
"content": [
|
||||
|
|
@ -494,7 +504,14 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME
|
|||
"meta": {"mode": cap.mode, "width": cap.width, "height": cap.height,
|
||||
"elements": total_elements, "png_bytes": cap.png_bytes_len},
|
||||
}
|
||||
# AX-only (or image missing): text path.
|
||||
# AX-only (or image-missing fallback): text path actually carries the
|
||||
# `elements` array, so the truncation note applies here.
|
||||
if truncated_elements:
|
||||
summary_lines.append(
|
||||
f" (response truncated to {len(visible_elements)} of {total_elements} elements; "
|
||||
f"raise max_elements or pass app= to narrow)"
|
||||
)
|
||||
summary = "\n".join(summary_lines)
|
||||
payload: Dict[str, Any] = {
|
||||
"mode": cap.mode,
|
||||
"width": cap.width,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue