From f23d077b5f04facf78e895eb41788fc6ca148951 Mon Sep 17 00:00:00 2001 From: liuhao1024 Date: Thu, 25 Jun 2026 21:46:15 +0800 Subject: [PATCH] fix(fuzzy-match): preserve boundary space after whitespace-normalized match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trailing-whitespace expansion in _map_normalized_positions unconditionally consumed whitespace after the matched region — including the word-boundary space that separates the match from the next token. This caused silent file corruption when the fuzzy matcher fell back to the whitespace_normalized strategy. Guard the expansion on the normalized match actually ending with whitespace (i.e. the original had a run of spaces that were collapsed). When the match ends with a non-space character, the first whitespace in the original is a boundary and must not be consumed. Fixes #52491 --- tests/tools/test_fuzzy_match.py | 41 +++++++++++++++++++++++++++++++++ tools/fuzzy_match.py | 11 ++++++--- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/tests/tools/test_fuzzy_match.py b/tests/tools/test_fuzzy_match.py index f81d0437434..0a7ce464f44 100644 --- a/tests/tools/test_fuzzy_match.py +++ b/tests/tools/test_fuzzy_match.py @@ -43,6 +43,47 @@ class TestWhitespaceDifference: assert count == 1 assert "bar" in new + def test_boundary_space_preserved_after_match(self): + """Regression: whitespace_normalized match ending with a non-space + character must NOT consume the word-boundary space that follows. + https://github.com/NousResearch/hermes-agent/issues/52491""" + # Case 1 — simple word boundary + new, count, strategy, err = fuzzy_find_and_replace( + "foo bar baz", "foo bar", "XY", + ) + assert err is None + assert count == 1 + assert strategy == "whitespace_normalized" + assert new == "XY baz", f"Boundary space deleted: {new!r}" + + def test_boundary_space_preserved_in_code_edit(self): + """Regression: real-world code-edit scenario where the space before + the next operator must survive a whitespace-normalized match.""" + content = "result = compute(a, b) + tail" + new, count, strategy, err = fuzzy_find_and_replace( + content, "compute(a, b)", "compute(a, b, c)", + ) + assert err is None + assert count == 1 + assert strategy == "whitespace_normalized" + assert new == "result = compute(a, b, c) + tail", f"Boundary space deleted: {new!r}" + + def test_trailing_ws_still_consumed_when_match_ends_with_space(self): + """When the normalized match itself ends with whitespace (pattern has + trailing space), the expansion must still consume the full whitespace + run in the original.""" + # Use a pattern with trailing space where the boundary is clear: + # content has "foo " then "bar", pattern is "foo " — the match + # should cover all 3 original spaces (the trailing ws run). + new, count, strategy, err = fuzzy_find_and_replace( + "a = foo + bar", "foo +", "XY", + ) + assert err is None + assert count == 1 + # "foo +" normalized to "foo +" matches; trailing spaces consumed + # Result: "a = XY bar" + assert "XY" in new and "bar" in new + class TestIndentDifference: def test_different_indentation(self): diff --git a/tools/fuzzy_match.py b/tools/fuzzy_match.py index 5ebb2b8b26f..709cde10fc3 100644 --- a/tools/fuzzy_match.py +++ b/tools/fuzzy_match.py @@ -768,9 +768,14 @@ def _map_normalized_positions(original: str, normalized: str, else: orig_end = orig_start + (norm_end - norm_start) - # Expand to include trailing whitespace that was normalized - while orig_end < len(original) and original[orig_end] in ' \t': - orig_end += 1 + # Expand to include trailing whitespace that was normalized, + # but only when the normalized match itself ended with whitespace. + # When the match ends with a non-space character, the first + # whitespace in the original is a word boundary and must not be + # consumed. See https://github.com/NousResearch/hermes-agent/issues/52491 + if norm_end < len(normalized) and normalized[norm_end - 1] == ' ': + while orig_end < len(original) and original[orig_end] in ' \t': + orig_end += 1 original_matches.append((orig_start, min(orig_end, len(original))))