fix(tools): unescape common sequences in new_string when escape_normalized matches

When the patch tool matches via the escape_normalized strategy, old_string
contains literal \t, \n, \r sequences that get unescaped to match real
control characters in the file. However, new_string was written as-is,
leaving literal backslash sequences in the output.

Add a _unescape_common_sequences() helper and apply it to new_string when
the matching strategy is escape_normalized. This ensures that literal \t, \n,
and \r sequences emitted by the LLM are written to the patched file as real
tab, newline, and carriage-return characters.

Fixes #33733
This commit is contained in:
liuhao1024 2026-05-28 16:10:00 +08:00 committed by Teknium
parent 10ee4a729b
commit e9f3f2b34a
2 changed files with 94 additions and 1 deletions

View file

@ -113,8 +113,16 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
# old_string/new_string — e.g. LLM used 2-space indent but the
# file is 4-space. Shift new_string by the indentation delta so
# the replacement matches the file's actual indent pattern.
effective_new = new_string
if strategy_name == "escape_normalized":
# The escape_normalized strategy matched because old_string
# contained literal \t/\n/\r that were unescaped to match
# real control characters in the file. Apply the same
# unescaping to new_string so we don't write literal
# backslash sequences where the file has real tabs/newlines.
effective_new = _unescape_common_sequences(new_string)
new_content = _apply_replacements(
content, matches, new_string,
content, matches, effective_new,
old_string=old_string if strategy_name != "exact" else None,
)
return new_content, len(matches), strategy_name, None
@ -247,6 +255,19 @@ def _reindent_replacement(file_region: str, old_string: str, new_string: str) ->
return "\n".join(out_lines)
def _unescape_common_sequences(s: str) -> str:
"""Unescape common C-style escape sequences that LLMs produce literally.
When the model sends ``\\t`` (two characters: backslash + t) instead of a
real tab byte (0x09), the patch tool would write the literal characters.
This helper converts common escape sequences to their actual byte values.
Only call this when the matching strategy confirmed that the file already
contains real control characters (i.e. ``escape_normalized`` matched).
"""
return s.replace('\\t', '\t').replace('\\n', '\n').replace('\\r', '\r')
def _apply_replacements(content: str, matches: List[Tuple[int, int]],
new_string: str, old_string: Optional[str] = None) -> str:
"""