From 627f8a5f1dab2847a5fb97fa79daa6d0bc96d8bd Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 16 May 2026 00:57:39 -0700
Subject: [PATCH] security: sanitize tool error strings before injecting into
 model context (#26823)

Adds _sanitize_tool_error() in model_tools and routes both error paths
through it: registry.dispatch's try/except (the primary path for tool
exceptions) and handle_function_call's outer except (defense in depth).

Stripping targets structural framing tokens that the model itself can
react to even though json.dumps already handles wire-layer escaping:
XML role tags (tool_call, function_call, result, response, output,
input, system, assistant, user), CDATA sections, and markdown code
fences. Caps message body at 2000 chars and wraps with [TOOL_ERROR]
prefix.

Defense-in-depth: a tool exception carrying '<tool_call>...' won't
break message framing (json escapes it), but the model still reads
those tokens and they nudge it toward role-confusion framing.

Ported from ironclaw#1639 (one piece of #3838's three-feature scout).
The truncated-tool-call (#1632) and empty-response-recovery (#1677,
#1720) pieces are skipped because main now implements both far more
thoroughly (run_agent.py L8147/L12209/L13012 for truncation retry +
length rewrite; L4500/L15090+ for empty-response scaffolding stripper,
multi-stage nudge, fallback model activation).
---
 model_tools.py                    |  45 +++++++++-
 tests/test_sanitize_tool_error.py | 137 ++++++++++++++++++++++++++++++
 tools/registry.py                 |  11 ++-
 3 files changed, 191 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_sanitize_tool_error.py

diff --git a/model_tools.py b/model_tools.py
index db19bb67e53..1cbc83096ac 100644
--- a/model_tools.py
+++ b/model_tools.py
@@ -21,6 +21,7 @@ Public API (signatures preserved from the original 2,400-line version):
 """
 
 import json
+import re
 import asyncio
 import logging
 import threading
@@ -485,6 +486,48 @@ _AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task"}
 _READ_SEARCH_TOOLS = {"read_file", "search_files"}
 
 
+# =========================================================================
+# Tool error sanitization
+# =========================================================================
+#
+# Tool exceptions can carry arbitrary text into the model's context as the
+# `tool` message content. json.dumps() handles quote/backslash escaping so a
+# raw injection of `</tool_call>` won't break message framing, but the model
+# still *reads* those tokens and they can confuse downstream tool-call
+# parsing or, in adversarial cases, nudge it toward role-confusion framing.
+#
+# This helper strips structural framing tokens (XML role tags, CDATA,
+# markdown code fences) and caps the message at a sane upper bound before it
+# becomes part of the conversation. It's defense-in-depth — the json layer
+# already prevents framing escape — but cheap and worth having.
+#
+# Ported from ironclaw#1639.
+_TOOL_ERROR_ROLE_TAG_RE = re.compile(
+    r'</?(?:tool_call|function_call|result|response|output|input|system|assistant|user)>',
+    re.IGNORECASE,
+)
+_TOOL_ERROR_FENCE_OPEN_RE = re.compile(r'^\s*```(?:json|xml|html|markdown)?\s*', re.MULTILINE)
+_TOOL_ERROR_FENCE_CLOSE_RE = re.compile(r'\s*```\s*$', re.MULTILINE)
+_TOOL_ERROR_CDATA_RE = re.compile(r'<!\[CDATA\[.*?\]\]>', re.DOTALL)
+_TOOL_ERROR_MAX_LEN = 2000
+
+
+def _sanitize_tool_error(error_msg: str) -> str:
+    """Strip structural framing tokens from a tool error before showing it to the model.
+
+    See _TOOL_ERROR_ROLE_TAG_RE docstring above for rationale.
+    """
+    if not error_msg:
+        return "[TOOL_ERROR] "
+    sanitized = _TOOL_ERROR_ROLE_TAG_RE.sub("", error_msg)
+    sanitized = _TOOL_ERROR_FENCE_OPEN_RE.sub("", sanitized)
+    sanitized = _TOOL_ERROR_FENCE_CLOSE_RE.sub("", sanitized)
+    sanitized = _TOOL_ERROR_CDATA_RE.sub("", sanitized)
+    if len(sanitized) > _TOOL_ERROR_MAX_LEN:
+        sanitized = sanitized[:_TOOL_ERROR_MAX_LEN - 3] + "..."
+    return f"[TOOL_ERROR] {sanitized}"
+
+
 # =========================================================================
 # Tool argument type coercion
 # =========================================================================
@@ -824,7 +867,7 @@ def handle_function_call(
     except Exception as e:
         error_msg = f"Error executing {function_name}: {str(e)}"
         logger.exception(error_msg)
-        return json.dumps({"error": error_msg}, ensure_ascii=False)
+        return json.dumps({"error": _sanitize_tool_error(error_msg)}, ensure_ascii=False)
 
 
 # =============================================================================
diff --git a/tests/test_sanitize_tool_error.py b/tests/test_sanitize_tool_error.py
new file mode 100644
index 00000000000..3a0685bf3d7
--- /dev/null
+++ b/tests/test_sanitize_tool_error.py
@@ -0,0 +1,137 @@
+"""Tests for `_sanitize_tool_error` in model_tools.
+
+Ported from ironclaw#1639 — defense-in-depth on tool exception strings before
+they enter the model's `tool` message content. Note that `json.dumps()` in
+`handle_function_call` already handles quote/backslash escaping at the wire
+layer; this helper exists to strip structural framing tokens the model
+itself might react to (XML role tags, CDATA, markdown code fences) and to
+cap pathological lengths.
+"""
+from __future__ import annotations
+
+from model_tools import _sanitize_tool_error, _TOOL_ERROR_MAX_LEN
+
+
+class TestRoleTagStripping:
+    def test_strips_tool_call_tags(self):
+        out = _sanitize_tool_error("bad <tool_call>injected</tool_call> happened")
+        assert "<tool_call>" not in out
+        assert "</tool_call>" not in out
+        assert "bad injected happened" in out
+
+    def test_strips_function_call_tags(self):
+        out = _sanitize_tool_error("<function_call>x</function_call>")
+        assert "<function_call>" not in out
+        assert "</function_call>" not in out
+
+    def test_strips_role_tags(self):
+        # Each of these should be stripped
+        for tag in ("system", "assistant", "user", "result", "response", "output", "input"):
+            raw = f"prefix <{tag}>hi</{tag}> suffix"
+            out = _sanitize_tool_error(raw)
+            assert f"<{tag}>" not in out, f"failed to strip <{tag}>"
+            assert f"</{tag}>" not in out, f"failed to strip </{tag}>"
+
+    def test_role_tag_strip_is_case_insensitive(self):
+        out = _sanitize_tool_error("<TOOL_CALL>x</Tool_Call>")
+        assert "<" not in out.replace("[TOOL_ERROR]", "")  # only the prefix bracket survives
+
+    def test_unrelated_xml_kept(self):
+        # We intentionally only strip the role-like tag whitelist, not all XML
+        out = _sanitize_tool_error("Error parsing <ParseError>line 5</ParseError>")
+        assert "<ParseError>" in out
+
+
+class TestCDATAStripping:
+    def test_strips_cdata(self):
+        out = _sanitize_tool_error("error: <![CDATA[malicious]]> here")
+        assert "<![CDATA[" not in out
+        assert "]]>" not in out
+
+    def test_strips_multiline_cdata(self):
+        out = _sanitize_tool_error("a\n<![CDATA[line1\nline2]]>\nb")
+        assert "CDATA" not in out
+        assert "a" in out and "b" in out
+
+
+class TestCodeFenceStripping:
+    def test_strips_leading_fence_with_lang(self):
+        out = _sanitize_tool_error("```json\n{\"x\": 1}")
+        assert not out.replace("[TOOL_ERROR] ", "").startswith("```")
+
+    def test_strips_trailing_fence(self):
+        out = _sanitize_tool_error("payload\n```")
+        assert not out.rstrip().endswith("```")
+
+    def test_strips_bare_fence(self):
+        out = _sanitize_tool_error("```\nstuff")
+        assert "```" not in out.split("\n")[0]
+
+
+class TestTruncation:
+    def test_caps_long_input(self):
+        long = "A" * (_TOOL_ERROR_MAX_LEN * 2)
+        out = _sanitize_tool_error(long)
+        # Total length is prefix + truncated body
+        body = out[len("[TOOL_ERROR] "):]
+        assert len(body) == _TOOL_ERROR_MAX_LEN
+        assert body.endswith("...")
+
+    def test_does_not_truncate_short_input(self):
+        msg = "short error"
+        out = _sanitize_tool_error(msg)
+        assert "..." not in out
+        assert msg in out
+
+
+class TestEnvelope:
+    def test_wraps_with_prefix(self):
+        out = _sanitize_tool_error("oh no")
+        assert out.startswith("[TOOL_ERROR] ")
+
+    def test_empty_input(self):
+        out = _sanitize_tool_error("")
+        assert out == "[TOOL_ERROR] "
+
+    def test_preserves_normal_error_text(self):
+        msg = "Error executing read_file: FileNotFoundError: /tmp/missing"
+        out = _sanitize_tool_error(msg)
+        assert msg in out
+
+
+class TestHandleFunctionCallIntegration:
+    """Verify handle_function_call routes exception-path errors through the sanitizer.
+
+    Note: the "Unknown tool: ..." early-return in tools/registry.py is a
+    *different* code path from `except Exception` in handle_function_call —
+    that one returns directly without sanitization (and there's nothing to
+    sanitize in a hardcoded format string anyway). This test exercises the
+    real exception path by passing args that make a known tool raise.
+    """
+
+    def test_exception_path_error_is_sanitized(self):
+        import json
+        from model_tools import handle_function_call
+        from tools.registry import registry as _registry
+
+        # Force a known tool to raise with a payload containing role tags.
+        def boom(_args, **_kwargs):
+            raise RuntimeError("<tool_call>injected</tool_call> boom")
+
+        all_tools = _registry.get_all_tool_names()
+        assert all_tools, "no tools registered — test environment broken"
+        target = all_tools[0]
+        original = _registry._tools[target].handler
+        _registry._tools[target].handler = boom
+        try:
+            result_str = handle_function_call(target, {})
+        finally:
+            _registry._tools[target].handler = original
+
+        payload = json.loads(result_str)
+        assert "error" in payload, payload
+        assert payload["error"].startswith("[TOOL_ERROR] "), payload["error"]
+        # Role-tag stripping carried through
+        assert "<tool_call>" not in payload["error"]
+        assert "</tool_call>" not in payload["error"]
+        assert "boom" in payload["error"]
diff --git a/tools/registry.py b/tools/registry.py
index 2639eac74ed..7bb92e85f96 100644
--- a/tools/registry.py
+++ b/tools/registry.py
@@ -404,7 +404,16 @@ class ToolRegistry:
             return entry.handler(args, **kwargs)
         except Exception as e:
             logger.exception("Tool %s dispatch error: %s", name, e)
-            return json.dumps({"error": f"Tool execution failed: {type(e).__name__}: {e}"})
+            # Route through the sanitizer so framing tokens / CDATA / fences
+            # in exception strings don't reach the model as structural noise.
+            # See model_tools._sanitize_tool_error for rationale.
+            raw = f"Tool execution failed: {type(e).__name__}: {e}"
+            try:
+                from model_tools import _sanitize_tool_error
+                sanitized = _sanitize_tool_error(raw)
+            except Exception:
+                sanitized = raw  # defensive: never let the sanitizer block error propagation
+            return json.dumps({"error": sanitized})
 
     # ------------------------------------------------------------------
     # Query helpers  (replace redundant dicts in model_tools.py)