def _escape_invalid_chars_in_json_strings(raw: str) -> str:
    """Escape raw control characters inside JSON string values.

    Scans ``raw`` one character at a time while tracking whether the
    cursor is inside a double-quoted JSON string.  Inside a string, every
    literal control character (0x00-0x1F) is replaced by its ``\\uXXXX``
    escape.  Existing backslash escapes are copied through unchanged, and
    everything outside of strings (including structural whitespace) is
    left exactly as it was.

    A backslash directly followed by a raw control character is not a
    valid JSON escape.  Copying that pair through verbatim would leave a
    raw control character in the output, so the backslash is emitted as
    an escaped backslash and the control character as ``\\uXXXX``; the
    decoded value then keeps both original characters.

    Ported from #12093.  Complements the other repair passes in
    ``_repair_tool_call_arguments`` for payloads where
    ``json.loads(strict=False)`` alone is not enough because other
    malformations are present alongside the control characters.

    Args:
        raw: Possibly malformed JSON text.

    Returns:
        A copy of ``raw`` with control characters inside string values
        escaped.  Only control characters are handled; the result is not
        guaranteed to be valid JSON, so callers should still parse it.
    """
    out: list[str] = []
    in_string = False
    i = 0
    n = len(raw)
    while i < n:
        ch = raw[i]

        if not in_string:
            # Outside strings nothing is rewritten; only watch for an
            # opening quote.
            if ch == '"':
                in_string = True
            out.append(ch)
            i += 1
            continue

        if ch == "\\" and i + 1 < n:
            nxt = raw[i + 1]
            if ord(nxt) < 0x20:
                # Invalid escape (backslash + raw control char): keep the
                # backslash as a literal and escape the control character.
                out.append("\\\\")
                out.append(f"\\u{ord(nxt):04x}")
            else:
                # Existing escape pair -- pass through as-is.  This also
                # keeps an escaped quote from terminating the string.
                out.append(ch)
                out.append(nxt)
            i += 2
            continue

        if ch == '"':
            in_string = False
            out.append(ch)
        elif ord(ch) < 0x20:
            out.append(f"\\u{ord(ch):04x}")
        else:
            # Ordinary character, or a dangling backslash at end of input.
            out.append(ch)
        i += 1
    return "".join(out)
Strip trailing commas before } or ] @@ -557,6 +616,21 @@ def _repair_tool_call_arguments(raw_args: str, tool_name: str = "?") -> str: except json.JSONDecodeError: pass + # Repair pass 4: escape unescaped control chars inside JSON strings, + # then retry. Catches cases where strict=False alone fails because + # other malformations are present too. + try: + escaped = _escape_invalid_chars_in_json_strings(fixed) + if escaped != fixed: + json.loads(escaped) + logger.warning( + "Repaired control-char-laced tool_call arguments for %s: %s → %s", + tool_name, raw_stripped[:80], escaped[:80], + ) + return escaped + except (json.JSONDecodeError, TypeError, ValueError): + pass + # Last resort: replace with empty object so the API request doesn't # crash the entire session. logger.warning( diff --git a/tests/run_agent/test_repair_tool_call_arguments.py b/tests/run_agent/test_repair_tool_call_arguments.py index 3b8d86d144..c282397fcc 100644 --- a/tests/run_agent/test_repair_tool_call_arguments.py +++ b/tests/run_agent/test_repair_tool_call_arguments.py @@ -105,3 +105,39 @@ class TestRepairToolCallArguments: result = _repair_tool_call_arguments(raw, "terminal") # Should at least be valid JSON, even if background is lost json.loads(result) + + # -- Stage 0: strict=False (literal control chars in strings) -- + # llama.cpp backends sometimes emit literal tabs/newlines inside JSON + # string values. strict=False accepts these; we re-serialise to the + # canonical wire form (#12068). 
+ + def test_literal_newline_inside_string_value(self): + raw = '{"summary": "line one\nline two"}' + result = _repair_tool_call_arguments(raw, "t") + parsed = json.loads(result) + assert parsed == {"summary": "line one\nline two"} + + def test_literal_tab_inside_string_value(self): + raw = '{"summary": "col1\tcol2"}' + result = _repair_tool_call_arguments(raw, "t") + parsed = json.loads(result) + assert parsed == {"summary": "col1\tcol2"} + + def test_literal_control_char_reserialised_to_wire_form(self): + """After repair, the output must parse under strict=True.""" + raw = '{"msg": "has\tliteral\ttabs"}' + result = _repair_tool_call_arguments(raw, "t") + # strict=True must now accept this + parsed = json.loads(result) + assert parsed["msg"] == "has\tliteral\ttabs" + + # -- Stage 4: control-char escape fallback -- + + def test_control_chars_with_trailing_comma(self): + """strict=False fails due to trailing comma, but brace-count pass + + control-char escape rescues it.""" + raw = '{"msg": "line\none",}' + result = _repair_tool_call_arguments(raw, "t") + parsed = json.loads(result) + assert "line" in parsed["msg"] +