From fbdca64f73476001b8909e542a4343e50f242e85 Mon Sep 17 00:00:00 2001 From: Rodrigo Date: Sat, 9 May 2026 15:19:26 -0300 Subject: [PATCH] fix(computer-use): skip capture_after when action failed (ok=False) _maybe_follow_capture() issued a follow-up screenshot unconditionally when capture_after=True, even when res.ok=False. The model then received a normal-looking screenshot alongside an error message, and in practice it often ignored ok=False and proceeded as if the action had succeeded. Fix: return _text_response(res) early when res.ok is False so the model receives only the error and can decide how to recover. Tests added: - test_capture_after_skipped_when_action_failed: patches click to return ok=False and asserts no capture call is issued. - test_capture_after_fires_when_action_succeeds: ensures the happy path still triggers the follow-up capture. --- tests/tools/test_computer_use.py | 33 ++++++++++++++++++++++++++++++++ tools/computer_use/tool.py | 5 +++++ 2 files changed, 38 insertions(+) diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py index 49d5bbdfbf9..44a97db47ac 100644 --- a/tests/tools/test_computer_use.py +++ b/tests/tools/test_computer_use.py @@ -240,6 +240,39 @@ class TestDispatch: out = handle_computer_use({"action": "set_value"}) parsed = json.loads(out) assert "error" in parsed + def test_capture_after_skipped_when_action_failed(self, noop_backend): + """capture_after must not fire when res.ok=False (regression guard). + + A follow-up screenshot after a failed action shows the screen in a + normal state, misleading the model into thinking the action succeeded. + """ + from unittest.mock import patch + from tools.computer_use.backend import ActionResult + from tools.computer_use.tool import handle_computer_use + + # Make click() return a failure. + with patch.object(noop_backend, "click", + return_value=ActionResult(ok=False, action="click", + message="element not found")): + out = handle_computer_use({"action": "click", "element": 99, + "capture_after": True}) + + parsed = json.loads(out) + # Should return the error, not a multimodal capture. + assert parsed.get("ok") is False + assert parsed.get("action") == "click" + # No follow-up capture should have been issued. + capture_calls = [c for c in noop_backend.calls if c[0] == "capture"] + assert len(capture_calls) == 0, "capture must not be called after a failed action" + + def test_capture_after_fires_when_action_succeeds(self, noop_backend): + """capture_after must trigger for successful actions.""" + from tools.computer_use.tool import handle_computer_use + out = handle_computer_use({"action": "click", "element": 1, + "capture_after": True}) + # Noop backend returns ok=True, so capture should have been called. + capture_calls = [c for c in noop_backend.calls if c[0] == "capture"] + assert len(capture_calls) == 1 # --------------------------------------------------------------------------- diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py index 41308733df1..abb14ebd878 100644 --- a/tools/computer_use/tool.py +++ b/tools/computer_use/tool.py @@ -675,6 +675,11 @@ def _maybe_follow_capture( ) -> Any: if not do_capture: return _text_response(res) + # Skip the follow-up capture when the action itself failed: showing a + # normal-looking screenshot after a failure misleads the model into thinking + # the action succeeded. Return the error text instead. + if not res.ok: + return _text_response(res) try: # Preserve the app context established by the preceding capture/focus_app so # that capture_after=True re-captures the same app rather than the frontmost