mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(patch): harden V4A patch parser and fuzzy match — 9 correctness bugs
- Bug 1: replace read_file(limit=10000) with read_file_raw in _apply_update, preventing silent truncation of files >2000 lines and corruption of lines >2000 chars; add read_file_raw to FileOperations abstract interface and ShellFileOperations - Bug 2: split apply_v4a_operations into validate-then-apply phases; if any hunk fails validation, zero writes occur (was: continue after failure, leaving filesystem partially modified) - Bug 3: parse_v4a_patch now returns an error for begin-marker-with-no-ops, empty file paths, and moves missing a destination (was: always returned error=None) - Bug 4: raise strategy 7 (block anchor) single-candidate similarity threshold from 0.10 to 0.50, eliminating false-positive matches in repetitive code - Bug 5: add _strategy_unicode_normalized (new strategy 7) with position mapping via _build_orig_to_norm_map; smart quotes and em-dashes in LLM-generated patches now match via the new strategy instead of falling through to the looser fuzzy strategies - Bug 6: extend fuzzy_find_and_replace to return 4-tuple (content, count, error, strategy); update all 5 call sites across patch_parser.py, file_operations.py, and skill_manager_tool.py - Bug 7: guard in _apply_update returns error when addition-only context hint is ambiguous (>1 occurrences); validation phase errors on both 0 and >1 - Bug 8: _apply_delete returns error (not silent success) on missing file - Bug 9: _validate_operations checks source existence and destination absence for MOVE operations before any write occurs
This commit is contained in:
parent
475cbce775
commit
0e939af7c2
7 changed files with 761 additions and 119 deletions
|
|
@ -21,7 +21,7 @@ Multi-occurrence matching is handled via the replace_all flag.
|
|||
Usage:
|
||||
from tools.fuzzy_match import fuzzy_find_and_replace
|
||||
|
||||
new_content, match_count, error = fuzzy_find_and_replace(
|
||||
new_content, match_count, strategy, error = fuzzy_find_and_replace(
|
||||
content="def foo():\\n pass",
|
||||
old_string="def foo():",
|
||||
new_string="def bar():",
|
||||
|
|
@ -48,27 +48,27 @@ def _unicode_normalize(text: str) -> str:
|
|||
|
||||
|
||||
def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
|
||||
replace_all: bool = False) -> Tuple[str, int, Optional[str]]:
|
||||
replace_all: bool = False) -> Tuple[str, int, Optional[str], Optional[str]]:
|
||||
"""
|
||||
Find and replace text using a chain of increasingly fuzzy matching strategies.
|
||||
|
||||
|
||||
Args:
|
||||
content: The file content to search in
|
||||
old_string: The text to find
|
||||
new_string: The replacement text
|
||||
replace_all: If True, replace all occurrences; if False, require uniqueness
|
||||
|
||||
|
||||
Returns:
|
||||
Tuple of (new_content, match_count, error_message)
|
||||
- If successful: (modified_content, number_of_replacements, None)
|
||||
- If failed: (original_content, 0, error_description)
|
||||
Tuple of (new_content, match_count, strategy_name, error_message)
|
||||
- If successful: (modified_content, number_of_replacements, strategy_used, None)
|
||||
- If failed: (original_content, 0, None, error_description)
|
||||
"""
|
||||
if not old_string:
|
||||
return content, 0, "old_string cannot be empty"
|
||||
|
||||
return content, 0, None, "old_string cannot be empty"
|
||||
|
||||
if old_string == new_string:
|
||||
return content, 0, "old_string and new_string are identical"
|
||||
|
||||
return content, 0, None, "old_string and new_string are identical"
|
||||
|
||||
# Try each matching strategy in order
|
||||
strategies: List[Tuple[str, Callable]] = [
|
||||
("exact", _strategy_exact),
|
||||
|
|
@ -77,27 +77,28 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
|
|||
("indentation_flexible", _strategy_indentation_flexible),
|
||||
("escape_normalized", _strategy_escape_normalized),
|
||||
("trimmed_boundary", _strategy_trimmed_boundary),
|
||||
("unicode_normalized", _strategy_unicode_normalized),
|
||||
("block_anchor", _strategy_block_anchor),
|
||||
("context_aware", _strategy_context_aware),
|
||||
]
|
||||
|
||||
for _strategy_name, strategy_fn in strategies:
|
||||
|
||||
for strategy_name, strategy_fn in strategies:
|
||||
matches = strategy_fn(content, old_string)
|
||||
|
||||
|
||||
if matches:
|
||||
# Found matches with this strategy
|
||||
if len(matches) > 1 and not replace_all:
|
||||
return content, 0, (
|
||||
return content, 0, None, (
|
||||
f"Found {len(matches)} matches for old_string. "
|
||||
f"Provide more context to make it unique, or use replace_all=True."
|
||||
)
|
||||
|
||||
|
||||
# Perform replacement
|
||||
new_content = _apply_replacements(content, matches, new_string)
|
||||
return new_content, len(matches), None
|
||||
|
||||
return new_content, len(matches), strategy_name, None
|
||||
|
||||
# No strategy found a match
|
||||
return content, 0, "Could not find a match for old_string in the file"
|
||||
return content, 0, None, "Could not find a match for old_string in the file"
|
||||
|
||||
|
||||
def _apply_replacements(content: str, matches: List[Tuple[int, int]], new_string: str) -> str:
|
||||
|
|
@ -258,9 +259,90 @@ def _strategy_trimmed_boundary(content: str, pattern: str) -> List[Tuple[int, in
|
|||
return matches
|
||||
|
||||
|
||||
def _build_orig_to_norm_map(original: str) -> List[int]:
    """Map each character index of *original* to its index in the normalised string.

    UNICODE_MAP replacements can widen a single character (em-dash -> '--',
    ellipsis -> '...'), so the normalised text may be longer than the
    original.  The returned table lets normalised positions be translated
    back to positions in the original string.

    Returns:
        A list of ``len(original) + 1`` entries; entry ``i`` is the
        normalised index that original character ``i`` maps to.  The final
        entry is a sentinel one past the last character.
    """
    mapping: List[int] = []
    cursor = 0
    for ch in original:
        mapping.append(cursor)
        replacement = UNICODE_MAP.get(ch)
        # Unmapped characters keep width 1; mapped ones advance by the
        # width of their ASCII replacement.
        cursor += 1 if replacement is None else len(replacement)
    mapping.append(cursor)  # sentinel: one past the last character
    return mapping
|
||||
|
||||
|
||||
def _map_positions_norm_to_orig(
|
||||
orig_to_norm: List[int],
|
||||
norm_matches: List[Tuple[int, int]],
|
||||
) -> List[Tuple[int, int]]:
|
||||
"""Convert (start, end) positions in the normalised string to original positions."""
|
||||
# Invert the map: norm_pos -> first original position with that norm_pos
|
||||
norm_to_orig_start: dict[int, int] = {}
|
||||
for orig_pos, norm_pos in enumerate(orig_to_norm[:-1]):
|
||||
if norm_pos not in norm_to_orig_start:
|
||||
norm_to_orig_start[norm_pos] = orig_pos
|
||||
|
||||
results: List[Tuple[int, int]] = []
|
||||
orig_len = len(orig_to_norm) - 1 # number of original characters
|
||||
|
||||
for norm_start, norm_end in norm_matches:
|
||||
if norm_start not in norm_to_orig_start:
|
||||
continue
|
||||
orig_start = norm_to_orig_start[norm_start]
|
||||
|
||||
# Walk forward until orig_to_norm[orig_end] >= norm_end
|
||||
orig_end = orig_start
|
||||
while orig_end < orig_len and orig_to_norm[orig_end] < norm_end:
|
||||
orig_end += 1
|
||||
|
||||
results.append((orig_start, orig_end))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _strategy_unicode_normalized(content: str, pattern: str) -> List[Tuple[int, int]]:
    """Strategy 7: Unicode normalisation.

    Folds smart quotes, em/en-dashes, ellipsis, and non-breaking spaces to
    their ASCII equivalents in both *content* and *pattern*, then runs
    exact and line_trimmed matching on the normalised copies.

    Match positions are translated back to *content* coordinates with
    ``_build_orig_to_norm_map`` — required because some UNICODE_MAP
    replacements widen one character into several ASCII characters, so a
    naïve position copy would be wrong.
    """
    normalized_pattern = _unicode_normalize(pattern)
    normalized_content = _unicode_normalize(content)

    # Unicode variants may live on either side — the content may hold an
    # em-dash that should match the LLM's ASCII '--', or vice-versa.
    # Only bail out when normalisation changed nothing at all.
    if normalized_content == content and normalized_pattern == pattern:
        return []

    hits = _strategy_exact(normalized_content, normalized_pattern)
    if not hits:
        hits = _strategy_line_trimmed(normalized_content, normalized_pattern)
    if not hits:
        return []

    return _map_positions_norm_to_orig(_build_orig_to_norm_map(content), hits)
|
||||
|
||||
|
||||
def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
|
||||
"""
|
||||
Strategy 7: Match by anchoring on first and last lines.
|
||||
Strategy 8: Match by anchoring on first and last lines.
|
||||
Adjusted with permissive thresholds and unicode normalization.
|
||||
"""
|
||||
# Normalize both strings for comparison while keeping original content for offset calculation
|
||||
|
|
@ -290,8 +372,10 @@ def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
|
|||
matches = []
|
||||
candidate_count = len(potential_matches)
|
||||
|
||||
# Thresholding logic: 0.10 for unique matches (max flexibility), 0.30 for multiple candidates
|
||||
threshold = 0.10 if candidate_count == 1 else 0.30
|
||||
# Thresholding logic: 0.50 for unique matches, 0.70 for multiple candidates.
|
||||
# Previous values (0.10 / 0.30) were dangerously loose — a 10% middle-section
|
||||
# similarity could match completely unrelated blocks.
|
||||
threshold = 0.50 if candidate_count == 1 else 0.70
|
||||
|
||||
for i in potential_matches:
|
||||
if pattern_line_count <= 2:
|
||||
|
|
@ -314,7 +398,7 @@ def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
|
|||
|
||||
def _strategy_context_aware(content: str, pattern: str) -> List[Tuple[int, int]]:
|
||||
"""
|
||||
Strategy 8: Line-by-line similarity with 50% threshold.
|
||||
Strategy 9: Line-by-line similarity with 50% threshold.
|
||||
|
||||
Finds blocks where at least 50% of lines have high similarity.
|
||||
"""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue