fix(patch): widen new_string \t/\r unescape to all match strategies (#33733)

Extends @liuhao1024's escape-normalized fix so the patch tool also recovers when old_string carries a real tab byte and matches via the `exact` strategy — which is the headline reproduction in the issue and the most common case in practice (LLMs frequently get old_string right because they re-read the file, but still serialize new_string's tabs as two-character `\t`). Instead of gating on the match strategy, decide per-sequence by looking at the *matched region of the file*: only convert `\t` -> tab and `\r` -> CR when the file region we're replacing actually contains the corresponding control byte. That mirrors the region-based heuristic in `_detect_escape_drift` and keeps legitimate writes of the literal two-character string `"\t"` (e.g. patching `sep = "\t"` in Python source) untouched — those files have a backslash+t in the matched region, not a real tab, so new_string passes through verbatim. `\n` is still excluded because newlines serialize correctly through JSON and unescaping would corrupt source escape sequences far more often than help. E2E verified against the live `patch` tool: tab-indented file + literal `\t` in new_string under both `exact` (Variant 1) and `escape_normalized` (Variant 2) strategies now produces real tab bytes; a Python source line containing `sep = "\t"` (legitimate literal backslash-t) survives a patch unchanged. Tests updated to cover both strategies and the legitimate-literal case, and to assert that `\n` is intentionally preserved. Refs #33733
2026-07-13 14:02:16 +00:00 · 2026-05-28 03:02:52 -07:00 · 2026-05-28 03:02:52 -07:00 · 78be458608
commit 78be458608
parent e9f3f2b34a
2 changed files with 130 additions and 51 deletions
--- a/tests/tools/test_fuzzy_match.py
+++ b/tests/tools/test_fuzzy_match.py
@ -431,18 +431,25 @@ class TestFormatNoMatchHint:


 class TestEscapeNormalizedNewString:
-    """Regression tests for unescaping common sequences in new_string
-    when the match succeeded via the escape_normalized strategy.
+    """Regression tests for unescaping common sequences in new_string when
+    the matched region of the file contains real control characters.

    Issue #33733: LLMs overwhelmingly represent tabs as the two-character
-    sequence \\t (backslash + t) in JSON tool-call arguments. When the file
-    contains real tab bytes (0x09), the escape_normalized strategy matches
-    old_string by unescaping it — but new_string was written as-is, leaving
-    literal \\t characters in the file.
+    sequence ``\\t`` (backslash + t) in JSON tool-call arguments. When the
+    file already contains real tab bytes (0x09), writing new_string
+    verbatim leaves literal ``\\t`` characters and corrupts the file.
+
+    The fix unescapes ``\\t`` -> tab and ``\\r`` -> CR in new_string when
+    the matched file region actually contains those control characters,
+    regardless of which match strategy fired. ``\\n`` is excluded because
+    newlines serialize correctly through JSON.
    """

-    def test_tab_in_new_string_unescaped(self):
-        """File has real tab, model sends literal \\t in new_string."""
+    def test_tab_in_new_string_unescaped_under_escape_normalized(self):
+        """File has real tab, model sends literal \\t in BOTH old and new.
+
+        Match strategy is ``escape_normalized``.
+        """
        content = "def hello():\n\tprint(\"before\")\n"
        old_string = "def hello():\n\\tprint(\"before\")\n"
        new_string = "def hello():\n\\tprint(\"after\")\n"
@ -450,21 +457,25 @@ class TestEscapeNormalizedNewString:
        assert err is None, f"Unexpected error: {err}"
        assert count == 1
        assert strategy == "escape_normalized"
-        # Must contain a real tab, not literal \t
        assert "\tprint(\"after\")" in new
        assert "\\t" not in new

-    def test_newline_in_new_string_unescaped(self):
-        """File has real newlines, model sends literal \\n in new_string."""
-        content = "line1\nline2\nline3\n"
-        old_string = "line1\\nline2\\n"
-        new_string = "line1\\nreplaced\\n"
+    def test_tab_in_new_string_unescaped_under_exact(self):
+        """File has real tab, old_string has real tab too (matches via
+        ``exact``), but new_string still arrives with literal ``\\t``.
+
+        This is the issue's headline reproduction — the previous fix that
+        gated on ``strategy_name == "escape_normalized"`` missed this case.
+        """
+        content = "def hello():\n\tprint(\"before\")\n"
+        old_string = "\tprint(\"before\")"           # real tab
+        new_string = "\\tprint(\"after\")"           # literal backslash + t
        new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
        assert err is None, f"Unexpected error: {err}"
        assert count == 1
-        assert strategy == "escape_normalized"
-        assert "replaced" in new
-        assert "\\n" not in new
+        assert strategy == "exact"
+        assert "\tprint(\"after\")" in new
+        assert "\\t" not in new

    def test_carriage_return_in_new_string_unescaped(self):
        """File has real CR, model sends literal \\r in new_string."""
@ -477,27 +488,59 @@ class TestEscapeNormalizedNewString:
        assert strategy == "escape_normalized"
        assert "replaced\r" in new

-    def test_mixed_escape_sequences_in_new_string(self):
-        """Model sends \\t and \\n together in new_string."""
+    def test_newline_in_new_string_NOT_unescaped(self):
+        """``\\n`` is intentionally left alone — newlines serialize correctly
+        through JSON, and unescaping would corrupt source-code escape
+        sequences far more often than help.
+        """
+        content = "line1\nline2\n"
+        old_string = "line1\nline2"
+        new_string = "alpha\\nbeta"                 # literal backslash + n
+        new, count, _, err = fuzzy_find_and_replace(content, old_string, new_string)
+        assert err is None, f"Unexpected error: {err}"
+        assert count == 1
+        # The literal two-character sequence ``\n`` must survive verbatim.
+        assert "alpha\\nbeta" in new
+        # And there should be no real newline added where ``\\n`` sat.
+        assert "alpha\nbeta" not in new
+
+    def test_mixed_tab_and_newline_only_tab_unescaped(self):
+        """When new_string contains both \\t and \\n, only \\t is converted."""
        content = "def foo():\n\tpass\n"
-        old_string = "def foo():\\n\\tpass\\n"
+        old_string = "def foo():\n\tpass\n"
        new_string = "def bar():\\n\\treturn 1\\n"
+        new, count, _, err = fuzzy_find_and_replace(content, old_string, new_string)
+        assert err is None, f"Unexpected error: {err}"
+        assert count == 1
+        # \t -> real tab
+        assert "\treturn 1" in new
+        assert "\\t" not in new
+        # \n preserved as literal backslash-n
+        assert "\\n" in new
+
+    def test_exact_match_preserves_literal_backslash_t_in_string_literal(self):
+        """If the matched region of the file does NOT contain a real tab,
+        new_string's literal ``\\t`` is preserved — the file genuinely uses
+        a backslash-t sequence (e.g. a Python source line ``sep = "\\t"``).
+        """
+        content = 'sep = "\\t"\n'                   # source contains backslash + t
+        old_string = 'sep = "\\t"\n'
+        new_string = 'sep = "\\tab"\n'              # still backslash + t literal
        new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
        assert err is None, f"Unexpected error: {err}"
        assert count == 1
-        assert strategy == "escape_normalized"
-        assert "\treturn 1" in new
-        assert "\\t" not in new
-        assert "\\n" not in new
-
-    def test_exact_match_preserves_literal_escapes(self):
-        """When matching via exact strategy, literal \\t in new_string should
-        NOT be unescaped — the file genuinely contains backslash-t characters."""
-        content = "path = \"C:\\\\Users\\\"\n"
-        old_string = "path = \"C:\\\\Users\\\"\n"
-        new_string = "path = \"D:\\\\data\\\"\n"
-        new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
-        assert err is None
        assert strategy == "exact"
-        # Literal backslashes should be preserved
-        assert "\\\\" in new
+        # File still has the literal two-char ``\t`` — no tab byte injected.
+        assert 'sep = "\\tab"' in new
+        assert "\t" not in new
+
+    def test_no_escape_sequences_passthrough(self):
+        """When new_string has no \\t or \\r, the helper is a no-op."""
+        content = "def foo():\n    return 1\n"
+        old_string = "def foo():\n    return 1\n"
+        new_string = "def foo():\n    return 2\n"
+        new, count, _, err = fuzzy_find_and_replace(content, old_string, new_string)
+        assert err is None
+        assert count == 1
+        assert "return 2" in new
+
--- a/tools/fuzzy_match.py
+++ b/tools/fuzzy_match.py
@ -113,14 +113,27 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
            # old_string/new_string — e.g. LLM used 2-space indent but the
            # file is 4-space. Shift new_string by the indentation delta so
            # the replacement matches the file's actual indent pattern.
-            effective_new = new_string
-            if strategy_name == "escape_normalized":
-                # The escape_normalized strategy matched because old_string
-                # contained literal \t/\n/\r that were unescaped to match
-                # real control characters in the file. Apply the same
-                # unescaping to new_string so we don't write literal
-                # backslash sequences where the file has real tabs/newlines.
-                effective_new = _unescape_common_sequences(new_string)
+            # LLMs frequently serialize tabs / carriage returns in JSON
+            # tool-call arguments as the two-character sequences ``\t`` and
+            # ``\r`` (backslash + letter) instead of the real control bytes.
+            # If we write new_string verbatim, the file ends up with literal
+            # backslash sequences where the surrounding code uses real tabs.
+            #
+            # Strategy: only unescape when the matched region of the file
+            # *actually contains* the corresponding real control character.
+            # That mirrors the region-based heuristic in
+            # ``_detect_escape_drift`` and keeps legitimate writes of the
+            # literal two-character string ``"\t"`` (e.g. patching Python
+            # source that contains a tab string literal in source text)
+            # untouched — those files have a backslash+t in the matched
+            # region, not a real tab, so we leave new_string alone.
+            #
+            # ``\n`` is intentionally excluded: newlines serialize correctly
+            # through JSON, and rewriting backslash-n would mangle escape
+            # sequences in source code constants far more often than help.
+            effective_new = _maybe_unescape_new_string(
+                new_string, content, matches,
+            )
            new_content = _apply_replacements(
                content, matches, effective_new,
                old_string=old_string if strategy_name != "exact" else None,
@ -255,17 +268,40 @@ def _reindent_replacement(file_region: str, old_string: str, new_string: str) ->
    return "\n".join(out_lines)


-def _unescape_common_sequences(s: str) -> str:
-    """Unescape common C-style escape sequences that LLMs produce literally.
+def _maybe_unescape_new_string(new_string: str,
+                               content: str,
+                               matches: List[Tuple[int, int]]) -> str:
+    """Conditionally unescape ``\\t``/``\\r`` in new_string.

-    When the model sends ``\\t`` (two characters: backslash + t) instead of a
-    real tab byte (0x09), the patch tool would write the literal characters.
-    This helper converts common escape sequences to their actual byte values.
+    LLMs frequently send the two-character sequences ``\\t`` (backslash + t)
+    and ``\\r`` (backslash + r) inside JSON tool-call arguments where they
+    meant a real tab or carriage-return byte. Writing the string verbatim
+    corrupts tab-indented files with literal backslash-letter pairs.

-    Only call this when the matching strategy confirmed that the file already
-    contains real control characters (i.e. ``escape_normalized`` matched).
+    The unescape is only applied per-sequence when the *matched region of
+    the file* actually contains the corresponding control character — that
+    is, we only convert ``\\t`` -> tab when the file region we're replacing
+    contains a real tab byte. Files that legitimately contain the literal
+    two-character string ``"\\t"`` (e.g. a Python source line that defines
+    ``sep = "\\t"``) get a backslash+t in the matched region instead of a
+    tab, so we leave new_string alone.
+
+    ``\\n`` is intentionally excluded: newlines serialize correctly through
+    JSON and rewriting backslash-n would corrupt escape sequences in
+    string literals far more often than it would help.
    """
-    return s.replace('\\t', '\t').replace('\\n', '\n').replace('\\r', '\r')
+    # Cheap pre-check — bail out unless new_string actually contains one of
+    # the suspect sequences. Keeps the common case free.
+    if "\\t" not in new_string and "\\r" not in new_string:
+        return new_string
+
+    matched_regions = "".join(content[start:end] for start, end in matches)
+    out = new_string
+    if "\\t" in out and "\t" in matched_regions:
+        out = out.replace("\\t", "\t")
+    if "\\r" in out and "\r" in matched_regions:
+        out = out.replace("\\r", "\r")
+    return out


 def _apply_replacements(content: str, matches: List[Tuple[int, int]],