fix(tools): unescape common sequences in new_string when escape_normalized matches

When the patch tool matches via the escape_normalized strategy, old_string
contains literal \t, \n, \r sequences that get unescaped to match real
control characters in the file. However, new_string was written as-is,
leaving literal backslash sequences in the output.

Add _unescape_common_sequences() helper and apply it to new_string when
the matching strategy is escape_normalized. This ensures LLM-generated
tab/newline sequences become real bytes in the patched file.

Fixes #33733
This commit is contained in:
liuhao1024 2026-05-28 16:10:00 +08:00 committed by Teknium
parent 10ee4a729b
commit e9f3f2b34a
2 changed files with 94 additions and 1 deletions

View file

@ -429,3 +429,75 @@ class TestFormatNoMatchHint:
)
assert result == ""
class TestEscapeNormalizedNewString:
"""Regression tests for unescaping common sequences in new_string
when the match succeeded via the escape_normalized strategy.
Issue #33733: LLMs overwhelmingly represent tabs as the two-character
sequence \\t (backslash + t) in JSON tool-call arguments. When the file
contains real tab bytes (0x09), the escape_normalized strategy matches
old_string by unescaping it but new_string was written as-is, leaving
literal \\t characters in the file.
"""
def test_tab_in_new_string_unescaped(self):
"""File has real tab, model sends literal \\t in new_string."""
content = "def hello():\n\tprint(\"before\")\n"
old_string = "def hello():\n\\tprint(\"before\")\n"
new_string = "def hello():\n\\tprint(\"after\")\n"
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
assert err is None, f"Unexpected error: {err}"
assert count == 1
assert strategy == "escape_normalized"
# Must contain a real tab, not literal \t
assert "\tprint(\"after\")" in new
assert "\\t" not in new
def test_newline_in_new_string_unescaped(self):
"""File has real newlines, model sends literal \\n in new_string."""
content = "line1\nline2\nline3\n"
old_string = "line1\\nline2\\n"
new_string = "line1\\nreplaced\\n"
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
assert err is None, f"Unexpected error: {err}"
assert count == 1
assert strategy == "escape_normalized"
assert "replaced" in new
assert "\\n" not in new
def test_carriage_return_in_new_string_unescaped(self):
"""File has real CR, model sends literal \\r in new_string."""
content = "line1\r\nline2\r\n"
old_string = "line1\\r\\nline2\\r\\n"
new_string = "replaced\\r\\n"
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
assert err is None, f"Unexpected error: {err}"
assert count == 1
assert strategy == "escape_normalized"
assert "replaced\r" in new
def test_mixed_escape_sequences_in_new_string(self):
"""Model sends \\t and \\n together in new_string."""
content = "def foo():\n\tpass\n"
old_string = "def foo():\\n\\tpass\\n"
new_string = "def bar():\\n\\treturn 1\\n"
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
assert err is None, f"Unexpected error: {err}"
assert count == 1
assert strategy == "escape_normalized"
assert "\treturn 1" in new
assert "\\t" not in new
assert "\\n" not in new
def test_exact_match_preserves_literal_escapes(self):
"""When matching via exact strategy, literal \\t in new_string should
NOT be unescaped the file genuinely contains backslash-t characters."""
content = "path = \"C:\\\\Users\\\"\n"
old_string = "path = \"C:\\\\Users\\\"\n"
new_string = "path = \"D:\\\\data\\\"\n"
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
assert err is None
assert strategy == "exact"
# Literal backslashes should be preserved
assert "\\\\" in new

View file

@ -113,8 +113,16 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
# old_string/new_string — e.g. LLM used 2-space indent but the
# file is 4-space. Shift new_string by the indentation delta so
# the replacement matches the file's actual indent pattern.
effective_new = new_string
if strategy_name == "escape_normalized":
# The escape_normalized strategy matched because old_string
# contained literal \t/\n/\r that were unescaped to match
# real control characters in the file. Apply the same
# unescaping to new_string so we don't write literal
# backslash sequences where the file has real tabs/newlines.
effective_new = _unescape_common_sequences(new_string)
new_content = _apply_replacements(
content, matches, new_string,
content, matches, effective_new,
old_string=old_string if strategy_name != "exact" else None,
)
return new_content, len(matches), strategy_name, None
@ -247,6 +255,19 @@ def _reindent_replacement(file_region: str, old_string: str, new_string: str) ->
return "\n".join(out_lines)
def _unescape_common_sequences(s: str) -> str:
"""Unescape common C-style escape sequences that LLMs produce literally.
When the model sends ``\\t`` (two characters: backslash + t) instead of a
real tab byte (0x09), the patch tool would write the literal characters.
This helper converts common escape sequences to their actual byte values.
Only call this when the matching strategy confirmed that the file already
contains real control characters (i.e. ``escape_normalized`` matched).
"""
return s.replace('\\t', '\t').replace('\\n', '\n').replace('\\r', '\r')
def _apply_replacements(content: str, matches: List[Tuple[int, int]],
new_string: str, old_string: Optional[str] = None) -> str:
"""