mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(patch): widen new_string \t/\r unescape to all match strategies (#33733)
Extends @liuhao1024's escape-normalized fix so the patch tool also recovers when old_string carries a real tab byte and matches via the `exact` strategy — which is the headline reproduction in the issue and the most common case in practice (LLMs frequently get old_string right because they re-read the file, but still serialize new_string's tabs as two-character `\t`). Instead of gating on the match strategy, decide per-sequence by looking at the *matched region of the file*: only convert `\t` -> tab and `\r` -> CR when the file region we're replacing actually contains the corresponding control byte. That mirrors the region-based heuristic in `_detect_escape_drift` and keeps legitimate writes of the literal two-character string `"\t"` (e.g. patching `sep = "\t"` in Python source) untouched — those files have a backslash+t in the matched region, not a real tab, so new_string passes through verbatim. `\n` is still excluded because newlines serialize correctly through JSON and unescaping would corrupt source escape sequences far more often than help. E2E verified against the live `patch` tool: tab-indented file + literal `\t` in new_string under both `exact` (Variant 1) and `escape_normalized` (Variant 2) strategies now produces real tab bytes; a Python source line containing `sep = "\t"` (legitimate literal backslash-t) survives a patch unchanged. Tests updated to cover both strategies and the legitimate-literal case, and to assert that `\n` is intentionally preserved. Refs #33733
This commit is contained in:
parent
e9f3f2b34a
commit
78be458608
2 changed files with 130 additions and 51 deletions
|
|
@ -431,18 +431,25 @@ class TestFormatNoMatchHint:
|
|||
|
||||
|
||||
class TestEscapeNormalizedNewString:
|
||||
"""Regression tests for unescaping common sequences in new_string
|
||||
when the match succeeded via the escape_normalized strategy.
|
||||
"""Regression tests for unescaping common sequences in new_string when
|
||||
the matched region of the file contains real control characters.
|
||||
|
||||
Issue #33733: LLMs overwhelmingly represent tabs as the two-character
|
||||
sequence \\t (backslash + t) in JSON tool-call arguments. When the file
|
||||
contains real tab bytes (0x09), the escape_normalized strategy matches
|
||||
old_string by unescaping it — but new_string was written as-is, leaving
|
||||
literal \\t characters in the file.
|
||||
sequence ``\\t`` (backslash + t) in JSON tool-call arguments. When the
|
||||
file already contains real tab bytes (0x09), writing new_string
|
||||
verbatim leaves literal ``\\t`` characters and corrupts the file.
|
||||
|
||||
The fix unescapes ``\\t`` -> tab and ``\\r`` -> CR in new_string when
|
||||
the matched file region actually contains those control characters,
|
||||
regardless of which match strategy fired. ``\\n`` is excluded because
|
||||
newlines serialize correctly through JSON.
|
||||
"""
|
||||
|
||||
def test_tab_in_new_string_unescaped(self):
|
||||
"""File has real tab, model sends literal \\t in new_string."""
|
||||
def test_tab_in_new_string_unescaped_under_escape_normalized(self):
|
||||
"""File has real tab, model sends literal \\t in BOTH old and new.
|
||||
|
||||
Match strategy is ``escape_normalized``.
|
||||
"""
|
||||
content = "def hello():\n\tprint(\"before\")\n"
|
||||
old_string = "def hello():\n\\tprint(\"before\")\n"
|
||||
new_string = "def hello():\n\\tprint(\"after\")\n"
|
||||
|
|
@ -450,21 +457,25 @@ class TestEscapeNormalizedNewString:
|
|||
assert err is None, f"Unexpected error: {err}"
|
||||
assert count == 1
|
||||
assert strategy == "escape_normalized"
|
||||
# Must contain a real tab, not literal \t
|
||||
assert "\tprint(\"after\")" in new
|
||||
assert "\\t" not in new
|
||||
|
||||
def test_newline_in_new_string_unescaped(self):
|
||||
"""File has real newlines, model sends literal \\n in new_string."""
|
||||
content = "line1\nline2\nline3\n"
|
||||
old_string = "line1\\nline2\\n"
|
||||
new_string = "line1\\nreplaced\\n"
|
||||
def test_tab_in_new_string_unescaped_under_exact(self):
|
||||
"""File has real tab, old_string has real tab too (matches via
|
||||
``exact``), but new_string still arrives with literal ``\\t``.
|
||||
|
||||
This is the issue's headline reproduction — the previous fix that
|
||||
gated on ``strategy_name == "escape_normalized"`` missed this case.
|
||||
"""
|
||||
content = "def hello():\n\tprint(\"before\")\n"
|
||||
old_string = "\tprint(\"before\")" # real tab
|
||||
new_string = "\\tprint(\"after\")" # literal backslash + t
|
||||
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
|
||||
assert err is None, f"Unexpected error: {err}"
|
||||
assert count == 1
|
||||
assert strategy == "escape_normalized"
|
||||
assert "replaced" in new
|
||||
assert "\\n" not in new
|
||||
assert strategy == "exact"
|
||||
assert "\tprint(\"after\")" in new
|
||||
assert "\\t" not in new
|
||||
|
||||
def test_carriage_return_in_new_string_unescaped(self):
|
||||
"""File has real CR, model sends literal \\r in new_string."""
|
||||
|
|
@ -477,27 +488,59 @@ class TestEscapeNormalizedNewString:
|
|||
assert strategy == "escape_normalized"
|
||||
assert "replaced\r" in new
|
||||
|
||||
def test_mixed_escape_sequences_in_new_string(self):
|
||||
"""Model sends \\t and \\n together in new_string."""
|
||||
def test_newline_in_new_string_NOT_unescaped(self):
|
||||
"""``\\n`` is intentionally left alone — newlines serialize correctly
|
||||
through JSON, and unescaping would corrupt source-code escape
|
||||
sequences far more often than help.
|
||||
"""
|
||||
content = "line1\nline2\n"
|
||||
old_string = "line1\nline2"
|
||||
new_string = "alpha\\nbeta" # literal backslash + n
|
||||
new, count, _, err = fuzzy_find_and_replace(content, old_string, new_string)
|
||||
assert err is None, f"Unexpected error: {err}"
|
||||
assert count == 1
|
||||
# The literal two-character sequence ``\n`` must survive verbatim.
|
||||
assert "alpha\\nbeta" in new
|
||||
# And there should be no real newline added where ``\\n`` sat.
|
||||
assert "alpha\nbeta" not in new
|
||||
|
||||
def test_mixed_tab_and_newline_only_tab_unescaped(self):
|
||||
"""When new_string contains both \\t and \\n, only \\t is converted."""
|
||||
content = "def foo():\n\tpass\n"
|
||||
old_string = "def foo():\\n\\tpass\\n"
|
||||
old_string = "def foo():\n\tpass\n"
|
||||
new_string = "def bar():\\n\\treturn 1\\n"
|
||||
new, count, _, err = fuzzy_find_and_replace(content, old_string, new_string)
|
||||
assert err is None, f"Unexpected error: {err}"
|
||||
assert count == 1
|
||||
# \t -> real tab
|
||||
assert "\treturn 1" in new
|
||||
assert "\\t" not in new
|
||||
# \n preserved as literal backslash-n
|
||||
assert "\\n" in new
|
||||
|
||||
def test_exact_match_preserves_literal_backslash_t_in_string_literal(self):
|
||||
"""If the matched region of the file does NOT contain a real tab,
|
||||
new_string's literal ``\\t`` is preserved — the file genuinely uses
|
||||
a backslash-t sequence (e.g. a Python source line ``sep = "\\t"``).
|
||||
"""
|
||||
content = 'sep = "\\t"\n' # source contains backslash + t
|
||||
old_string = 'sep = "\\t"\n'
|
||||
new_string = 'sep = "\\tab"\n' # still backslash + t literal
|
||||
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
|
||||
assert err is None, f"Unexpected error: {err}"
|
||||
assert count == 1
|
||||
assert strategy == "escape_normalized"
|
||||
assert "\treturn 1" in new
|
||||
assert "\\t" not in new
|
||||
assert "\\n" not in new
|
||||
|
||||
def test_exact_match_preserves_literal_escapes(self):
|
||||
"""When matching via exact strategy, literal \\t in new_string should
|
||||
NOT be unescaped — the file genuinely contains backslash-t characters."""
|
||||
content = "path = \"C:\\\\Users\\\"\n"
|
||||
old_string = "path = \"C:\\\\Users\\\"\n"
|
||||
new_string = "path = \"D:\\\\data\\\"\n"
|
||||
new, count, strategy, err = fuzzy_find_and_replace(content, old_string, new_string)
|
||||
assert err is None
|
||||
assert strategy == "exact"
|
||||
# Literal backslashes should be preserved
|
||||
assert "\\\\" in new
|
||||
# File still has the literal two-char ``\t`` — no tab byte injected.
|
||||
assert 'sep = "\\tab"' in new
|
||||
assert "\t" not in new
|
||||
|
||||
def test_no_escape_sequences_passthrough(self):
|
||||
"""When new_string has no \\t or \\r, the helper is a no-op."""
|
||||
content = "def foo():\n return 1\n"
|
||||
old_string = "def foo():\n return 1\n"
|
||||
new_string = "def foo():\n return 2\n"
|
||||
new, count, _, err = fuzzy_find_and_replace(content, old_string, new_string)
|
||||
assert err is None
|
||||
assert count == 1
|
||||
assert "return 2" in new
|
||||
|
||||
|
|
|
|||
|
|
@ -113,14 +113,27 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
|
|||
# old_string/new_string — e.g. LLM used 2-space indent but the
|
||||
# file is 4-space. Shift new_string by the indentation delta so
|
||||
# the replacement matches the file's actual indent pattern.
|
||||
effective_new = new_string
|
||||
if strategy_name == "escape_normalized":
|
||||
# The escape_normalized strategy matched because old_string
|
||||
# contained literal \t/\n/\r that were unescaped to match
|
||||
# real control characters in the file. Apply the same
|
||||
# unescaping to new_string so we don't write literal
|
||||
# backslash sequences where the file has real tabs/newlines.
|
||||
effective_new = _unescape_common_sequences(new_string)
|
||||
# LLMs frequently serialize tabs / carriage returns in JSON
|
||||
# tool-call arguments as the two-character sequences ``\t`` and
|
||||
# ``\r`` (backslash + letter) instead of the real control bytes.
|
||||
# If we write new_string verbatim, the file ends up with literal
|
||||
# backslash sequences where the surrounding code uses real tabs.
|
||||
#
|
||||
# Strategy: only unescape when the matched region of the file
|
||||
# *actually contains* the corresponding real control character.
|
||||
# That mirrors the region-based heuristic in
|
||||
# ``_detect_escape_drift`` and keeps legitimate writes of the
|
||||
# literal two-character string ``"\t"`` (e.g. patching Python
|
||||
# source that contains a tab string literal in source text)
|
||||
# untouched — those files have a backslash+t in the matched
|
||||
# region, not a real tab, so we leave new_string alone.
|
||||
#
|
||||
# ``\n`` is intentionally excluded: newlines serialize correctly
|
||||
# through JSON, and rewriting backslash-n would mangle escape
|
||||
# sequences in source code constants far more often than help.
|
||||
effective_new = _maybe_unescape_new_string(
|
||||
new_string, content, matches,
|
||||
)
|
||||
new_content = _apply_replacements(
|
||||
content, matches, effective_new,
|
||||
old_string=old_string if strategy_name != "exact" else None,
|
||||
|
|
@ -255,17 +268,40 @@ def _reindent_replacement(file_region: str, old_string: str, new_string: str) ->
|
|||
return "\n".join(out_lines)
|
||||
|
||||
|
||||
def _unescape_common_sequences(s: str) -> str:
|
||||
"""Unescape common C-style escape sequences that LLMs produce literally.
|
||||
def _maybe_unescape_new_string(new_string: str,
|
||||
content: str,
|
||||
matches: List[Tuple[int, int]]) -> str:
|
||||
"""Conditionally unescape ``\\t``/``\\r`` in new_string.
|
||||
|
||||
When the model sends ``\\t`` (two characters: backslash + t) instead of a
|
||||
real tab byte (0x09), the patch tool would write the literal characters.
|
||||
This helper converts common escape sequences to their actual byte values.
|
||||
LLMs frequently send the two-character sequences ``\\t`` (backslash + t)
|
||||
and ``\\r`` (backslash + r) inside JSON tool-call arguments where they
|
||||
meant a real tab or carriage-return byte. Writing the string verbatim
|
||||
corrupts tab-indented files with literal backslash-letter pairs.
|
||||
|
||||
Only call this when the matching strategy confirmed that the file already
|
||||
contains real control characters (i.e. ``escape_normalized`` matched).
|
||||
The unescape is only applied per-sequence when the *matched region of
|
||||
the file* actually contains the corresponding control character — that
|
||||
is, we only convert ``\\t`` -> tab when the file region we're replacing
|
||||
contains a real tab byte. Files that legitimately contain the literal
|
||||
two-character string ``"\\t"`` (e.g. a Python source line that defines
|
||||
``sep = "\\t"``) get a backslash+t in the matched region instead of a
|
||||
tab, so we leave new_string alone.
|
||||
|
||||
``\\n`` is intentionally excluded: newlines serialize correctly through
|
||||
JSON and rewriting backslash-n would corrupt escape sequences in
|
||||
string literals far more often than it would help.
|
||||
"""
|
||||
return s.replace('\\t', '\t').replace('\\n', '\n').replace('\\r', '\r')
|
||||
# Cheap pre-check — bail out unless new_string actually contains one of
|
||||
# the suspect sequences. Keeps the common case free.
|
||||
if "\\t" not in new_string and "\\r" not in new_string:
|
||||
return new_string
|
||||
|
||||
matched_regions = "".join(content[start:end] for start, end in matches)
|
||||
out = new_string
|
||||
if "\\t" in out and "\t" in matched_regions:
|
||||
out = out.replace("\\t", "\t")
|
||||
if "\\r" in out and "\r" in matched_regions:
|
||||
out = out.replace("\\r", "\r")
|
||||
return out
|
||||
|
||||
|
||||
def _apply_replacements(content: str, matches: List[Tuple[int, int]],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue