fix: classify landed file mutations with diagnostics

2026-05-18 04:41:56 +00:00 · 2026-05-13 11:36:07 +02:00 · 2026-05-13 11:36:07 +02:00 · da0ddbf88a
commit da0ddbf88a
parent 71c6dd0dcf
8 changed files with 153 additions and 1 deletions
--- a/agent/display.py
+++ b/agent/display.py
@ -14,6 +14,7 @@ from difflib import unified_diff
 from pathlib import Path
 from utils import safe_json_loads
 from agent.tool_result_classification import file_mutation_result_landed
 # ANSI escape codes for coloring tool failure indicators
 _RED = "\033[31m"
@ -810,6 +811,8 @@ def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str]
    """
    if result is None:
        return False, ""
    if file_mutation_result_landed(tool_name, result):
        return False, ""
    if tool_name == "terminal":
        data = safe_json_loads(result)
--- a/agent/tool_guardrails.py
+++ b/agent/tool_guardrails.py
@ -14,6 +14,7 @@ from dataclasses import dataclass, field
 from typing import Any, Mapping
 from utils import safe_json_loads
 from agent.tool_result_classification import file_mutation_result_landed
 IDEMPOTENT_TOOL_NAMES = frozenset(
@ -196,6 +197,8 @@ def classify_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str
    """
    if result is None:
        return False, ""
    if file_mutation_result_landed(tool_name, result):
        return False, ""
    if tool_name == "terminal":
        data = safe_json_loads(result)
--- a/agent/tool_result_classification.py
+++ b/agent/tool_result_classification.py
@ -0,0 +1,26 @@
 """Shared helpers for classifying tool result payloads."""
 from __future__ import annotations
 import json
 from typing import Any
 FILE_MUTATING_TOOL_NAMES = frozenset({"write_file", "patch"})
 def file_mutation_result_landed(tool_name: str, result: Any) -> bool:
    """Return True when a file mutation result proves the write landed."""
    if tool_name not in FILE_MUTATING_TOOL_NAMES or not isinstance(result, str):
        return False
    try:
        data = json.loads(result.strip())
    except Exception:
        return False
    if not isinstance(data, dict) or data.get("error"):
        return False
    if tool_name == "write_file":
        return "bytes_written" in data
    if tool_name == "patch":
        return data.get("success") is True
    return False
--- a/run_agent.py
+++ b/run_agent.py
@ -181,6 +181,7 @@ from agent.tool_guardrails import (
    append_toolguard_guidance,
    toolguard_synthetic_result,
 )
 from agent.tool_result_classification import file_mutation_result_landed
 from agent.trajectory import (
    convert_scratchpad_to_think, has_incomplete_scratchpad,
    save_trajectory as _save_trajectory_to_file,
@ -5347,7 +5348,8 @@ class AIAgent:
        targets = _extract_file_mutation_targets(tool_name, args)
        if not targets:
            return
-        if is_error:
+        landed = file_mutation_result_landed(tool_name, result)
        if is_error and not landed:
            preview = _extract_error_preview(result)
            for path in targets:
                # Keep the FIRST error we saw for a given path unless we
--- a/tests/agent/test_display.py
+++ b/tests/agent/test_display.py
@ -1,6 +1,7 @@
 """Tests for agent/display.py — build_tool_preview() and inline diff previews."""
 import os
 import json
 import pytest
 from unittest.mock import MagicMock, patch
@ -149,6 +150,27 @@ class TestCuteToolMessagePreviewLength:
        assert path in line
        assert "..." not in line
    def test_write_file_lint_error_result_is_not_marked_failed(self):
        result = json.dumps({
            "bytes_written": 12,
            "lint": {"status": "error", "output": "SyntaxError: invalid syntax"},
        })
        line = get_cute_tool_message("write_file", {"path": "/tmp/a.py"}, 0.1, result=result)
        assert "[error]" not in line
    def test_patch_lsp_diagnostics_result_is_not_marked_failed(self):
        result = json.dumps({
            "success": True,
            "diff": "--- a/tmp.py\n+++ b/tmp.py\n",
            "lsp_diagnostics": "<diagnostics>ERROR [1:1] type mismatch</diagnostics>",
        })
        line = get_cute_tool_message("patch", {"path": "/tmp/a.py"}, 0.1, result=result)
        assert "[error]" not in line
 class TestEditDiffPreview:
    def test_extract_edit_diff_for_patch(self):
--- a/tests/agent/test_tool_guardrails.py
+++ b/tests/agent/test_tool_guardrails.py
@ -7,6 +7,7 @@ from agent.tool_guardrails import (
    ToolCallGuardrailController,
    ToolCallSignature,
    canonical_tool_args,
    classify_tool_failure,
 )
@ -131,6 +132,21 @@ def test_success_resets_exact_signature_failure_streak():
    assert controller.before_call("web_search", args).action == "allow"
 def test_file_mutation_lint_error_result_is_not_a_tool_failure():
    write_result = json.dumps({
        "bytes_written": 12,
        "lint": {"status": "error", "output": "SyntaxError: invalid syntax"},
    })
    patch_result = json.dumps({
        "success": True,
        "diff": "--- a/tmp.py\n+++ b/tmp.py\n",
        "lsp_diagnostics": "<diagnostics>ERROR [1:1] type mismatch</diagnostics>",
    })
    assert classify_tool_failure("write_file", write_result) == (False, "")
    assert classify_tool_failure("patch", patch_result) == (False, "")
 def test_same_tool_varying_args_warns_by_default_without_halting():
    controller = ToolCallGuardrailController(
        ToolCallGuardrailConfig(same_tool_failure_warn_after=2, same_tool_failure_halt_after=3)
--- a/tests/agent/test_tool_result_classification.py
+++ b/tests/agent/test_tool_result_classification.py
@ -0,0 +1,30 @@
 """Tests for shared tool result classification helpers."""
 import json
 from agent.tool_result_classification import file_mutation_result_landed
 def test_write_file_with_nested_lint_error_counts_as_landed():
    result = json.dumps({
        "bytes_written": 12,
        "lint": {"status": "error", "output": "SyntaxError: invalid syntax"},
    })
    assert file_mutation_result_landed("write_file", result) is True
 def test_patch_with_nested_lsp_diagnostics_counts_as_landed():
    result = json.dumps({
        "success": True,
        "diff": "--- a/tmp.py\n+++ b/tmp.py\n",
        "lsp_diagnostics": "<diagnostics>ERROR [1:1] type mismatch</diagnostics>",
    })
    assert file_mutation_result_landed("patch", result) is True
 def test_top_level_file_mutation_error_does_not_count_as_landed():
    result = json.dumps({"success": True, "error": "post-write verification failed"})
    assert file_mutation_result_landed("patch", result) is False
--- a/tests/run_agent/test_file_mutation_verifier.py
+++ b/tests/run_agent/test_file_mutation_verifier.py
@ -166,6 +166,56 @@ class TestRecordFileMutationResult:
        )
        assert agent._turn_failed_file_mutations == {}
    def test_write_file_with_lint_error_counts_as_landed(self):
        agent = _bare_agent()
        agent._record_file_mutation_result(
            "write_file",
            {"path": "/tmp/a.py", "content": "bad"},
            json.dumps({"error": "write failed"}),
            is_error=True,
        )
        assert "/tmp/a.py" in agent._turn_failed_file_mutations
        result = json.dumps({
            "bytes_written": 24,
            "lint": {"status": "error", "output": "SyntaxError: invalid syntax"},
        })
        agent._record_file_mutation_result(
            "write_file",
            {"path": "/tmp/a.py", "content": "def nope(:\n"},
            result,
            is_error=True,
        )
        assert agent._turn_failed_file_mutations == {}
    def test_patch_with_lsp_diagnostics_counts_as_landed(self):
        agent = _bare_agent()
        agent._record_file_mutation_result(
            "patch",
            {"mode": "replace", "path": "/tmp/a.py", "old_string": "x", "new_string": "y"},
            json.dumps({"error": "Could not find old_string"}),
            is_error=True,
        )
        assert "/tmp/a.py" in agent._turn_failed_file_mutations
        result = json.dumps({
            "success": True,
            "diff": "--- a/tmp.py\n+++ b/tmp.py\n",
            "files_modified": ["/tmp/a.py"],
            "lsp_diagnostics": "<diagnostics>ERROR [1:1] type mismatch</diagnostics>",
        })
        agent._record_file_mutation_result(
            "patch",
            {"mode": "replace", "path": "/tmp/a.py", "old_string": "x", "new_string": "y"},
            result,
            is_error=True,
        )
        assert agent._turn_failed_file_mutations == {}
    def test_repeated_failure_keeps_first_error(self):
        agent = _bare_agent()
        agent._record_file_mutation_result(