From 0e2873a77d221649f613cca266612406e63e5870 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 21 May 2026 18:07:09 -0700 Subject: [PATCH] fix(computer_use): build summary once before aux-vision routing branch The cherry-pick of #22891 (max_elements cap) reshuffled _capture_response so summary was assigned inside both the multimodal and AX branches, but #30126's aux-vision routing call (_route_capture_through_aux_vision) fires BEFORE either branch and references the not-yet-bound name. Compute summary once up-front, keep the AX-branch rebuild for the truncation note. --- tools/computer_use/tool.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py index ab9339f4b6c..82c6792eeb9 100644 --- a/tools/computer_use/tool.py +++ b/tools/computer_use/tool.py @@ -467,6 +467,12 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME ] if element_index: summary_lines.extend(element_index) + # Multimodal and AX paths both reference `summary`; build it once up-front + # so the aux-vision routing branch (which fires before either path is + # selected) has a valid value to hand to _route_capture_through_aux_vision. + # The AX path appends the "truncated to N of M" note to summary_lines + # below and rebuilds; the multimodal path keeps this version untouched. + summary = "\n".join(summary_lines) if cap.png_b64 and cap.mode != "ax": # Decide whether to hand the screenshot to the auxiliary.vision @@ -492,7 +498,6 @@ def _capture_response(cap: CaptureResult, max_elements: int = _DEFAULT_MAX_ELEME # The multimodal response carries the screenshot, not the AX # elements array, so a "response truncated to N of M elements" # note would be inaccurate — skip it on this branch. - summary = "\n".join(summary_lines) return { "_multimodal": True, "content": [