fix(patch): harden V4A patch parser and fuzzy match — 9 correctness bugs

- Bug 1: replace read_file(limit=10000) with read_file_raw in _apply_update,
  preventing silent truncation of files >2000 lines and corruption of lines
  >2000 chars; add read_file_raw to FileOperations abstract interface and
  ShellFileOperations

- Bug 2: split apply_v4a_operations into validate-then-apply phases; if any
  hunk fails validation, zero writes occur (was: continue after failure,
  leaving filesystem partially modified)

- Bug 3: parse_v4a_patch now returns an error for begin-marker-with-no-ops,
  empty file paths, and moves missing a destination (was: always returned
  error=None)

- Bug 4: raise the block-anchor strategy's single-candidate similarity
  threshold from 0.10 to 0.50 and the multi-candidate threshold from 0.30
  to 0.70, eliminating false-positive matches in repetitive code

- Bug 5: add _strategy_unicode_normalized (new strategy 7) with position
  mapping via _build_orig_to_norm_map; smart quotes and em-dashes in
  LLM-generated patches now match deterministically before falling through
  to the fuzzy strategies (block anchor, context aware)

- Bug 6: extend fuzzy_find_and_replace to return 4-tuple (content, count,
  error, strategy); update all 5 call sites across patch_parser.py,
  file_operations.py, and skill_manager_tool.py

- Bug 7: guard in _apply_update returns error when addition-only context hint
  is ambiguous (>1 occurrences); validation phase errors on both 0 and >1

- Bug 8: _apply_delete returns error (not silent success) on missing file

- Bug 9: _validate_operations checks source existence and destination absence
  for MOVE operations before any write occurs
This commit is contained in:
KUSH42 2026-04-10 00:11:07 +02:00 committed by Teknium
parent 475cbce775
commit 0e939af7c2
7 changed files with 761 additions and 119 deletions

View file

@ -21,7 +21,7 @@ Multi-occurrence matching is handled via the replace_all flag.
Usage:
from tools.fuzzy_match import fuzzy_find_and_replace
new_content, match_count, error = fuzzy_find_and_replace(
new_content, match_count, strategy, error = fuzzy_find_and_replace(
content="def foo():\\n pass",
old_string="def foo():",
new_string="def bar():",
@ -48,27 +48,27 @@ def _unicode_normalize(text: str) -> str:
def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
replace_all: bool = False) -> Tuple[str, int, Optional[str]]:
replace_all: bool = False) -> Tuple[str, int, Optional[str], Optional[str]]:
"""
Find and replace text using a chain of increasingly fuzzy matching strategies.
Args:
content: The file content to search in
old_string: The text to find
new_string: The replacement text
replace_all: If True, replace all occurrences; if False, require uniqueness
Returns:
Tuple of (new_content, match_count, error_message)
- If successful: (modified_content, number_of_replacements, None)
- If failed: (original_content, 0, error_description)
Tuple of (new_content, match_count, strategy_name, error_message)
- If successful: (modified_content, number_of_replacements, strategy_used, None)
- If failed: (original_content, 0, None, error_description)
"""
if not old_string:
return content, 0, "old_string cannot be empty"
return content, 0, None, "old_string cannot be empty"
if old_string == new_string:
return content, 0, "old_string and new_string are identical"
return content, 0, None, "old_string and new_string are identical"
# Try each matching strategy in order
strategies: List[Tuple[str, Callable]] = [
("exact", _strategy_exact),
@ -77,27 +77,28 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
("indentation_flexible", _strategy_indentation_flexible),
("escape_normalized", _strategy_escape_normalized),
("trimmed_boundary", _strategy_trimmed_boundary),
("unicode_normalized", _strategy_unicode_normalized),
("block_anchor", _strategy_block_anchor),
("context_aware", _strategy_context_aware),
]
for _strategy_name, strategy_fn in strategies:
for strategy_name, strategy_fn in strategies:
matches = strategy_fn(content, old_string)
if matches:
# Found matches with this strategy
if len(matches) > 1 and not replace_all:
return content, 0, (
return content, 0, None, (
f"Found {len(matches)} matches for old_string. "
f"Provide more context to make it unique, or use replace_all=True."
)
# Perform replacement
new_content = _apply_replacements(content, matches, new_string)
return new_content, len(matches), None
return new_content, len(matches), strategy_name, None
# No strategy found a match
return content, 0, "Could not find a match for old_string in the file"
return content, 0, None, "Could not find a match for old_string in the file"
def _apply_replacements(content: str, matches: List[Tuple[int, int]], new_string: str) -> str:
@ -258,9 +259,90 @@ def _strategy_trimmed_boundary(content: str, pattern: str) -> List[Tuple[int, in
return matches
def _build_orig_to_norm_map(original: str) -> List[int]:
    """Map each original character index to its index in the normalised string.

    UNICODE_MAP replacements can expand one character into several (e.g.
    em-dash -> '--', ellipsis -> '...'), so the normalised string may be
    longer than *original*.  The returned list has ``len(original) + 1``
    entries: entry ``i`` is the normalised index of original character
    ``i``, and the final entry is a sentinel one past the last character.
    """
    mapping: List[int] = []
    cursor = 0
    for ch in original:
        mapping.append(cursor)
        replacement = UNICODE_MAP.get(ch)
        cursor += 1 if replacement is None else len(replacement)
    # Sentinel: total normalised length, one past the last character.
    mapping.append(cursor)
    return mapping
def _map_positions_norm_to_orig(
orig_to_norm: List[int],
norm_matches: List[Tuple[int, int]],
) -> List[Tuple[int, int]]:
"""Convert (start, end) positions in the normalised string to original positions."""
# Invert the map: norm_pos -> first original position with that norm_pos
norm_to_orig_start: dict[int, int] = {}
for orig_pos, norm_pos in enumerate(orig_to_norm[:-1]):
if norm_pos not in norm_to_orig_start:
norm_to_orig_start[norm_pos] = orig_pos
results: List[Tuple[int, int]] = []
orig_len = len(orig_to_norm) - 1 # number of original characters
for norm_start, norm_end in norm_matches:
if norm_start not in norm_to_orig_start:
continue
orig_start = norm_to_orig_start[norm_start]
# Walk forward until orig_to_norm[orig_end] >= norm_end
orig_end = orig_start
while orig_end < orig_len and orig_to_norm[orig_end] < norm_end:
orig_end += 1
results.append((orig_start, orig_end))
return results
def _strategy_unicode_normalized(content: str, pattern: str) -> List[Tuple[int, int]]:
    """Strategy 7: match after folding unicode variants to ASCII.

    Smart quotes, em/en-dashes, ellipses, and non-breaking spaces are
    normalised to ASCII equivalents on both sides, then exact and
    line-trimmed matching run over the normalised copies.  Hits are mapped
    back to *original* offsets via ``_build_orig_to_norm_map``, which is
    required because some replacements expand one character into several
    ASCII characters, making a naive position copy incorrect.
    """
    normalized_content = _unicode_normalize(content)
    normalized_pattern = _unicode_normalize(pattern)
    # If normalisation changed neither side, earlier strategies already
    # tried (and failed) this exact comparison — nothing new to attempt.
    if normalized_content == content and normalized_pattern == pattern:
        return []

    hits = _strategy_exact(normalized_content, normalized_pattern)
    if not hits:
        hits = _strategy_line_trimmed(normalized_content, normalized_pattern)
    if not hits:
        return []

    # Convert normalised-string offsets back into original offsets.
    return _map_positions_norm_to_orig(_build_orig_to_norm_map(content), hits)
def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
"""
Strategy 7: Match by anchoring on first and last lines.
Strategy 8: Match by anchoring on first and last lines.
Adjusted with permissive thresholds and unicode normalization.
"""
# Normalize both strings for comparison while keeping original content for offset calculation
@ -290,8 +372,10 @@ def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
matches = []
candidate_count = len(potential_matches)
# Thresholding logic: 0.10 for unique matches (max flexibility), 0.30 for multiple candidates
threshold = 0.10 if candidate_count == 1 else 0.30
# Thresholding logic: 0.50 for unique matches, 0.70 for multiple candidates.
# Previous values (0.10 / 0.30) were dangerously loose — a 10% middle-section
# similarity could match completely unrelated blocks.
threshold = 0.50 if candidate_count == 1 else 0.70
for i in potential_matches:
if pattern_line_count <= 2:
@ -314,7 +398,7 @@ def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
def _strategy_context_aware(content: str, pattern: str) -> List[Tuple[int, int]]:
"""
Strategy 8: Line-by-line similarity with 50% threshold.
Strategy 9: Line-by-line similarity with 50% threshold.
Finds blocks where at least 50% of lines have high similarity.
"""