diff --git a/agent/display.py b/agent/display.py index e9a19ff6192..6c5c970aeff 100644 --- a/agent/display.py +++ b/agent/display.py @@ -14,6 +14,7 @@ from difflib import unified_diff from pathlib import Path from utils import safe_json_loads +from agent.tool_result_classification import file_mutation_result_landed # ANSI escape codes for coloring tool failure indicators _RED = "\033[31m" @@ -810,6 +811,8 @@ def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str] """ if result is None: return False, "" + if file_mutation_result_landed(tool_name, result): + return False, "" if tool_name == "terminal": data = safe_json_loads(result) diff --git a/agent/tool_guardrails.py b/agent/tool_guardrails.py index 3c85d782090..5a9ddd507ba 100644 --- a/agent/tool_guardrails.py +++ b/agent/tool_guardrails.py @@ -14,6 +14,7 @@ from dataclasses import dataclass, field from typing import Any, Mapping from utils import safe_json_loads +from agent.tool_result_classification import file_mutation_result_landed IDEMPOTENT_TOOL_NAMES = frozenset( @@ -196,6 +197,8 @@ def classify_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str """ if result is None: return False, "" + if file_mutation_result_landed(tool_name, result): + return False, "" if tool_name == "terminal": data = safe_json_loads(result) diff --git a/agent/tool_result_classification.py b/agent/tool_result_classification.py new file mode 100644 index 00000000000..e136e2964da --- /dev/null +++ b/agent/tool_result_classification.py @@ -0,0 +1,26 @@ +"""Shared helpers for classifying tool result payloads.""" + +from __future__ import annotations + +import json +from typing import Any + + +FILE_MUTATING_TOOL_NAMES = frozenset({"write_file", "patch"}) + + +def file_mutation_result_landed(tool_name: str, result: Any) -> bool: + """Return True when a file mutation result proves the write landed.""" + if tool_name not in FILE_MUTATING_TOOL_NAMES or not isinstance(result, str): + return False + try: + data = json.loads(result.strip()) + except Exception: + return False + if not isinstance(data, dict) or data.get("error"): + return False + if tool_name == "write_file": + return "bytes_written" in data + if tool_name == "patch": + return data.get("success") is True + return False diff --git a/run_agent.py b/run_agent.py index f0597c90880..a2185300931 100644 --- a/run_agent.py +++ b/run_agent.py @@ -181,6 +181,7 @@ from agent.tool_guardrails import ( append_toolguard_guidance, toolguard_synthetic_result, ) +from agent.tool_result_classification import file_mutation_result_landed from agent.trajectory import ( convert_scratchpad_to_think, has_incomplete_scratchpad, save_trajectory as _save_trajectory_to_file, @@ -5347,7 +5348,8 @@ class AIAgent: targets = _extract_file_mutation_targets(tool_name, args) if not targets: return - if is_error: + landed = file_mutation_result_landed(tool_name, result) + if is_error and not landed: preview = _extract_error_preview(result) for path in targets: # Keep the FIRST error we saw for a given path unless we diff --git a/tests/agent/test_display.py b/tests/agent/test_display.py index c6ad837af97..5e18fa17e0c 100644 --- a/tests/agent/test_display.py +++ b/tests/agent/test_display.py @@ -1,6 +1,7 @@ """Tests for agent/display.py — build_tool_preview() and inline diff previews.""" import os +import json import pytest from unittest.mock import MagicMock, patch @@ -149,6 +150,27 @@ class TestCuteToolMessagePreviewLength: assert path in line assert "..." not in line + def test_write_file_lint_error_result_is_not_marked_failed(self): + result = json.dumps({ + "bytes_written": 12, + "lint": {"status": "error", "output": "SyntaxError: invalid syntax"}, + }) + + line = get_cute_tool_message("write_file", {"path": "/tmp/a.py"}, 0.1, result=result) + + assert "[error]" not in line + + def test_patch_lsp_diagnostics_result_is_not_marked_failed(self): + result = json.dumps({ + "success": True, + "diff": "--- a/tmp.py\n+++ b/tmp.py\n", + "lsp_diagnostics": "ERROR [1:1] type mismatch", + }) + + line = get_cute_tool_message("patch", {"path": "/tmp/a.py"}, 0.1, result=result) + + assert "[error]" not in line + class TestEditDiffPreview: def test_extract_edit_diff_for_patch(self): diff --git a/tests/agent/test_tool_guardrails.py b/tests/agent/test_tool_guardrails.py index c50be56f43e..26593b7ef62 100644 --- a/tests/agent/test_tool_guardrails.py +++ b/tests/agent/test_tool_guardrails.py @@ -7,6 +7,7 @@ from agent.tool_guardrails import ( ToolCallGuardrailController, ToolCallSignature, canonical_tool_args, + classify_tool_failure, ) @@ -131,6 +132,21 @@ def test_success_resets_exact_signature_failure_streak(): assert controller.before_call("web_search", args).action == "allow" +def test_file_mutation_lint_error_result_is_not_a_tool_failure(): + write_result = json.dumps({ + "bytes_written": 12, + "lint": {"status": "error", "output": "SyntaxError: invalid syntax"}, + }) + patch_result = json.dumps({ + "success": True, + "diff": "--- a/tmp.py\n+++ b/tmp.py\n", + "lsp_diagnostics": "ERROR [1:1] type mismatch", + }) + + assert classify_tool_failure("write_file", write_result) == (False, "") + assert classify_tool_failure("patch", patch_result) == (False, "") + + def test_same_tool_varying_args_warns_by_default_without_halting(): controller = ToolCallGuardrailController( ToolCallGuardrailConfig(same_tool_failure_warn_after=2, same_tool_failure_halt_after=3) diff --git a/tests/agent/test_tool_result_classification.py b/tests/agent/test_tool_result_classification.py new file mode 100644 index 00000000000..2b4b5b150cf --- /dev/null +++ b/tests/agent/test_tool_result_classification.py @@ -0,0 +1,30 @@ +"""Tests for shared tool result classification helpers.""" + +import json + +from agent.tool_result_classification import file_mutation_result_landed + + +def test_write_file_with_nested_lint_error_counts_as_landed(): + result = json.dumps({ + "bytes_written": 12, + "lint": {"status": "error", "output": "SyntaxError: invalid syntax"}, + }) + + assert file_mutation_result_landed("write_file", result) is True + + +def test_patch_with_nested_lsp_diagnostics_counts_as_landed(): + result = json.dumps({ + "success": True, + "diff": "--- a/tmp.py\n+++ b/tmp.py\n", + "lsp_diagnostics": "ERROR [1:1] type mismatch", + }) + + assert file_mutation_result_landed("patch", result) is True + + +def test_top_level_file_mutation_error_does_not_count_as_landed(): + result = json.dumps({"success": True, "error": "post-write verification failed"}) + + assert file_mutation_result_landed("patch", result) is False diff --git a/tests/run_agent/test_file_mutation_verifier.py b/tests/run_agent/test_file_mutation_verifier.py index fca002d2314..73684ad1c2e 100644 --- a/tests/run_agent/test_file_mutation_verifier.py +++ b/tests/run_agent/test_file_mutation_verifier.py @@ -166,6 +166,56 @@ class TestRecordFileMutationResult: ) assert agent._turn_failed_file_mutations == {} + def test_write_file_with_lint_error_counts_as_landed(self): + agent = _bare_agent() + agent._record_file_mutation_result( + "write_file", + {"path": "/tmp/a.py", "content": "bad"}, + json.dumps({"error": "write failed"}), + is_error=True, + ) + assert "/tmp/a.py" in agent._turn_failed_file_mutations + + result = json.dumps({ + "bytes_written": 24, + "lint": {"status": "error", "output": "SyntaxError: invalid syntax"}, + }) + + agent._record_file_mutation_result( + "write_file", + {"path": "/tmp/a.py", "content": "def nope(:\n"}, + result, + is_error=True, + ) + + assert agent._turn_failed_file_mutations == {} + + def test_patch_with_lsp_diagnostics_counts_as_landed(self): + agent = _bare_agent() + agent._record_file_mutation_result( + "patch", + {"mode": "replace", "path": "/tmp/a.py", "old_string": "x", "new_string": "y"}, + json.dumps({"error": "Could not find old_string"}), + is_error=True, + ) + assert "/tmp/a.py" in agent._turn_failed_file_mutations + + result = json.dumps({ + "success": True, + "diff": "--- a/tmp.py\n+++ b/tmp.py\n", + "files_modified": ["/tmp/a.py"], + "lsp_diagnostics": "ERROR [1:1] type mismatch", + }) + + agent._record_file_mutation_result( + "patch", + {"mode": "replace", "path": "/tmp/a.py", "old_string": "x", "new_string": "y"}, + result, + is_error=True, + ) + + assert agent._turn_failed_file_mutations == {} + def test_repeated_failure_keeps_first_error(self): agent = _bare_agent() agent._record_file_mutation_result(