fix(computer-use): skip capture_after when action failed (ok=False)

_maybe_follow_capture() issued a follow-up screenshot unconditionally when capture_after=True, even when res.ok=False. The model then received a normal-looking screenshot alongside an error message, and in practice it often ignored ok=False and proceeded as if the action had succeeded. Fix: return _text_response(res) early when res.ok is False so the model receives only the error and can decide how to recover. Tests added: - test_capture_after_skipped_when_action_failed: patches click to return ok=False and asserts no capture call is issued. - test_capture_after_fires_when_action_succeeds: ensures the happy path still triggers the follow-up capture.
2026-07-21 16:18:55 +00:00 · 2026-05-09 15:19:26 -03:00 · 2026-05-09 15:19:26 -03:00 · fbdca64f73
commit fbdca64f73
parent 07b7cf6fe4
2 changed files with 38 additions and 0 deletions
--- a/tests/tools/test_computer_use.py
+++ b/tests/tools/test_computer_use.py
@ -240,6 +240,39 @@ class TestDispatch:
        out = handle_computer_use({"action": "set_value"})
        parsed = json.loads(out)
        assert "error" in parsed
+    def test_capture_after_skipped_when_action_failed(self, noop_backend):
+        """capture_after must not fire when res.ok=False (regression guard).
+
+        A follow-up screenshot after a failed action shows the screen in a
+        normal state, misleading the model into thinking the action succeeded.
+        """
+        from unittest.mock import patch
+        from tools.computer_use.backend import ActionResult
+        from tools.computer_use.tool import handle_computer_use
+
+        # Make click() return a failure.
+        with patch.object(noop_backend, "click",
+                          return_value=ActionResult(ok=False, action="click",
+                                                    message="element not found")):
+            out = handle_computer_use({"action": "click", "element": 99,
+                                       "capture_after": True})
+
+        parsed = json.loads(out)
+        # Should return the error, not a multimodal capture.
+        assert parsed.get("ok") is False
+        assert parsed.get("action") == "click"
+        # No follow-up capture should have been issued.
+        capture_calls = [c for c in noop_backend.calls if c[0] == "capture"]
+        assert len(capture_calls) == 0, "capture must not be called after a failed action"
+
+    def test_capture_after_fires_when_action_succeeds(self, noop_backend):
+        """capture_after must trigger for successful actions."""
+        from tools.computer_use.tool import handle_computer_use
+        out = handle_computer_use({"action": "click", "element": 1,
+                                   "capture_after": True})
+        # Noop backend returns ok=True, so capture should have been called.
+        capture_calls = [c for c in noop_backend.calls if c[0] == "capture"]
+        assert len(capture_calls) == 1


 # ---------------------------------------------------------------------------
--- a/tools/computer_use/tool.py
+++ b/tools/computer_use/tool.py
@ -675,6 +675,11 @@ def _maybe_follow_capture(
 ) -> Any:
    if not do_capture:
        return _text_response(res)
+    # Skip the follow-up capture when the action itself failed: showing a
+    # normal-looking screenshot after a failure misleads the model into thinking
+    # the action succeeded. Return the error text instead.
+    if not res.ok:
+        return _text_response(res)
    try:
        # Preserve the app context established by the preceding capture/focus_app so
        # that capture_after=True re-captures the same app rather than the frontmost