From fbdca64f73476001b8909e542a4343e50f242e85 Mon Sep 17 00:00:00 2001
From: Rodrigo <fabianoeq@gmail.com>
Date: Sat, 9 May 2026 15:19:26 -0300
Subject: [PATCH] fix(computer-use): skip capture_after when action failed
 (ok=False)

_maybe_follow_capture() issued a follow-up screenshot unconditionally
when capture_after=True, even when res.ok=False. The model then received
a normal-looking screenshot alongside an error message, and in practice
it often ignored ok=False and proceeded as if the action had succeeded.

Fix: return _text_response(res) early when res.ok is False so the model
receives only the error and can decide how to recover.

Tests added:
- test_capture_after_skipped_when_action_failed: patches click to return
  ok=False and asserts no capture call is issued.
- test_capture_after_fires_when_action_succeeds: ensures the happy path
  still triggers the follow-up capture.
---
 tests/tools/test_computer_use.py | 33 ++++++++++++++++++++++++++++++++
 tools/computer_use/tool.py       |  5 +++++
 2 files changed, 38 insertions(+)

diff --git a/tests/tools/test_computer_use.py b/tests/tools/test_computer_use.py
index 49d5bbdfbf9..44a97db47ac 100644
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@@ -240,6 +240,39 @@ class TestDispatch:
         out = handle_computer_use({"action": "set_value"})
         parsed = json.loads(out)
         assert "error" in parsed
+    def test_capture_after_skipped_when_action_failed(self, noop_backend):
+        """capture_after must not fire when res.ok=False (regression guard).
+
+        A follow-up screenshot after a failed action shows the screen in a
+        normal state, misleading the model into thinking the action succeeded.
+        """
+        from unittest.mock import patch
+        from tools.computer_use.backend import ActionResult
+        from tools.computer_use.tool import handle_computer_use
+
+        # Make click() return a failure.
+        with patch.object(noop_backend, "click",
+                          return_value=ActionResult(ok=False, action="click",
+                                                    message="element not found")):
+            out = handle_computer_use({"action": "click", "element": 99,
+                                       "capture_after": True})
+
+        parsed = json.loads(out)
+        # Should return the error, not a multimodal capture.
+        assert parsed.get("ok") is False
+        assert parsed.get("action") == "click"
+        # No follow-up capture should have been issued.
+        capture_calls = [c for c in noop_backend.calls if c[0] == "capture"]
+        assert len(capture_calls) == 0, "capture must not be called after a failed action"
+
+    def test_capture_after_fires_when_action_succeeds(self, noop_backend):
+        """capture_after must trigger for successful actions."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({"action": "click", "element": 1,
+                                   "capture_after": True})
+        # Noop backend returns ok=True, so capture should have been called.
+        capture_calls = [c for c in noop_backend.calls if c[0] == "capture"]
+        assert len(capture_calls) == 1
 
 
 # ---------------------------------------------------------------------------
diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py
index 41308733df1..abb14ebd878 100644
--- a/tools/computer_use/tool.py
+++ b/tools/computer_use/tool.py
@@ -675,6 +675,11 @@ def _maybe_follow_capture(
 ) -> Any:
     if not do_capture:
         return _text_response(res)
+    # Skip the follow-up capture when the action itself failed: showing a
+    # normal-looking screenshot after a failure misleads the model into thinking
+    # the action succeeded. Return the error text instead.
+    if not res.ok:
+        return _text_response(res)
     try:
         # Preserve the app context established by the preceding capture/focus_app so
         # that capture_after=True re-captures the same app rather than the frontmost