security: sanitize tool error strings before injecting into model context (#26823)

Adds _sanitize_tool_error() in model_tools and routes both error paths
through it: registry.dispatch's try/except (the primary path for tool
exceptions) and handle_function_call's outer except (defense in depth).

Stripping targets structural framing tokens that the model itself can
react to even though json.dumps already handles wire-layer escaping:
XML role tags (tool_call, function_call, result, response, output,
input, system, assistant, user), CDATA sections, and markdown code
fences. Caps message body at 2000 chars and wraps with [TOOL_ERROR]
prefix.

Defense-in-depth: a tool exception carrying '<tool_call>...' won't
break message framing (json escapes it), but the model still reads
those tokens and they nudge it toward role-confusion framing.

Ported from ironclaw#1639 (one piece of #3838's three-feature scout).
The truncated-tool-call (#1632) and empty-response-recovery (#1677,
#1720) pieces are skipped because main now implements both far more
thoroughly (run_agent.py L8147/L12209/L13012 for truncation retry +
length rewrite; L4500/L15090+ for empty-response scaffolding stripper,
multi-stage nudge, fallback model activation).
This commit is contained in:
Teknium 2026-05-16 00:57:39 -07:00 committed by GitHub
parent 70b663504f
commit 627f8a5f1d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 191 additions and 2 deletions

View file

@ -21,6 +21,7 @@ Public API (signatures preserved from the original 2,400-line version):
"""
import json
import re
import asyncio
import logging
import threading
@ -485,6 +486,48 @@ _AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task"}
_READ_SEARCH_TOOLS = {"read_file", "search_files"}
# =========================================================================
# Tool error sanitization
# =========================================================================
#
# Tool exceptions can carry arbitrary text into the model's context as the
# `tool` message content. json.dumps() handles quote/backslash escaping so a
# raw injection of `</tool_call>` won't break message framing, but the model
# still *reads* those tokens and they can confuse downstream tool-call
# parsing or, in adversarial cases, nudge it toward role-confusion framing.
#
# This helper strips structural framing tokens (XML role tags, CDATA,
# markdown code fences) and caps the message at a sane upper bound before it
# becomes part of the conversation. It's defense-in-depth — the json layer
# already prevents framing escape — but cheap and worth having.
#
# Ported from ironclaw#1639.
_TOOL_ERROR_ROLE_TAG_RE = re.compile(
r'</?(?:tool_call|function_call|result|response|output|input|system|assistant|user)>',
re.IGNORECASE,
)
_TOOL_ERROR_FENCE_OPEN_RE = re.compile(r'^\s*```(?:json|xml|html|markdown)?\s*', re.MULTILINE)
_TOOL_ERROR_FENCE_CLOSE_RE = re.compile(r'\s*```\s*$', re.MULTILINE)
_TOOL_ERROR_CDATA_RE = re.compile(r'<!\[CDATA\[.*?\]\]>', re.DOTALL)
_TOOL_ERROR_MAX_LEN = 2000
def _sanitize_tool_error(error_msg: str) -> str:
"""Strip structural framing tokens from a tool error before showing it to the model.
See _TOOL_ERROR_ROLE_TAG_RE docstring above for rationale.
"""
if not error_msg:
return "[TOOL_ERROR] "
sanitized = _TOOL_ERROR_ROLE_TAG_RE.sub("", error_msg)
sanitized = _TOOL_ERROR_FENCE_OPEN_RE.sub("", sanitized)
sanitized = _TOOL_ERROR_FENCE_CLOSE_RE.sub("", sanitized)
sanitized = _TOOL_ERROR_CDATA_RE.sub("", sanitized)
if len(sanitized) > _TOOL_ERROR_MAX_LEN:
sanitized = sanitized[:_TOOL_ERROR_MAX_LEN - 3] + "..."
return f"[TOOL_ERROR] {sanitized}"
# =========================================================================
# Tool argument type coercion
# =========================================================================
@ -824,7 +867,7 @@ def handle_function_call(
except Exception as e:
error_msg = f"Error executing {function_name}: {str(e)}"
logger.exception(error_msg)
return json.dumps({"error": error_msg}, ensure_ascii=False)
return json.dumps({"error": _sanitize_tool_error(error_msg)}, ensure_ascii=False)
# =============================================================================

View file

@ -0,0 +1,137 @@
"""Tests for `_sanitize_tool_error` in model_tools.
Ported from ironclaw#1639 — defense-in-depth on tool exception strings before
they enter the model's `tool` message content. Note that `json.dumps()` in
`handle_function_call` already handles quote/backslash escaping at the wire
layer; this helper exists to strip structural framing tokens the model
itself might react to (XML role tags, CDATA, markdown code fences) and to
cap pathological lengths.
"""
from __future__ import annotations
from model_tools import _sanitize_tool_error, _TOOL_ERROR_MAX_LEN
class TestRoleTagStripping:
def test_strips_tool_call_tags(self):
out = _sanitize_tool_error("bad <tool_call>injected</tool_call> happened")
assert "<tool_call>" not in out
assert "</tool_call>" not in out
assert "bad injected happened" in out
def test_strips_function_call_tags(self):
out = _sanitize_tool_error("<function_call>x</function_call>")
assert "<function_call>" not in out
assert "</function_call>" not in out
def test_strips_role_tags(self):
# Each of these should be stripped
for tag in ("system", "assistant", "user", "result", "response", "output", "input"):
raw = f"prefix <{tag}>hi</{tag}> suffix"
out = _sanitize_tool_error(raw)
assert f"<{tag}>" not in out, f"failed to strip <{tag}>"
assert f"</{tag}>" not in out, f"failed to strip </{tag}>"
def test_role_tag_strip_is_case_insensitive(self):
out = _sanitize_tool_error("<TOOL_CALL>x</Tool_Call>")
assert "<" not in out.replace("[TOOL_ERROR]", "") # only the prefix bracket survives
def test_unrelated_xml_kept(self):
# We intentionally only strip the role-like tag whitelist, not all XML
out = _sanitize_tool_error("Error parsing <ParseError>line 5</ParseError>")
assert "<ParseError>" in out
class TestCDATAStripping:
def test_strips_cdata(self):
out = _sanitize_tool_error("error: <![CDATA[malicious]]> here")
assert "<![CDATA[" not in out
assert "]]>" not in out
def test_strips_multiline_cdata(self):
out = _sanitize_tool_error("a\n<![CDATA[line1\nline2]]>\nb")
assert "CDATA" not in out
assert "a" in out and "b" in out
class TestCodeFenceStripping:
def test_strips_leading_fence_with_lang(self):
out = _sanitize_tool_error("```json\n{\"x\": 1}")
assert not out.replace("[TOOL_ERROR] ", "").startswith("```")
def test_strips_trailing_fence(self):
out = _sanitize_tool_error("payload\n```")
assert not out.rstrip().endswith("```")
def test_strips_bare_fence(self):
out = _sanitize_tool_error("```\nstuff")
assert "```" not in out.split("\n")[0]
class TestTruncation:
def test_caps_long_input(self):
long = "A" * (_TOOL_ERROR_MAX_LEN * 2)
out = _sanitize_tool_error(long)
# Total length is prefix + truncated body
body = out[len("[TOOL_ERROR] "):]
assert len(body) == _TOOL_ERROR_MAX_LEN
assert body.endswith("...")
def test_does_not_truncate_short_input(self):
msg = "short error"
out = _sanitize_tool_error(msg)
assert "..." not in out
assert msg in out
class TestEnvelope:
def test_wraps_with_prefix(self):
out = _sanitize_tool_error("oh no")
assert out.startswith("[TOOL_ERROR] ")
def test_empty_input(self):
out = _sanitize_tool_error("")
assert out == "[TOOL_ERROR] "
def test_preserves_normal_error_text(self):
msg = "Error executing read_file: FileNotFoundError: /tmp/missing"
out = _sanitize_tool_error(msg)
assert msg in out
class TestHandleFunctionCallIntegration:
"""Verify handle_function_call routes exception-path errors through the sanitizer.
Note: the "Unknown tool: ..." early-return in tools/registry.py is a
*different* code path from `except Exception` in handle_function_call
that one returns directly without sanitization (and there's nothing to
sanitize in a hardcoded format string anyway). This test exercises the
real exception path by passing args that make a known tool raise.
"""
def test_exception_path_error_is_sanitized(self):
import json
from model_tools import handle_function_call
from tools.registry import registry as _registry
# Force a known tool to raise with a payload containing role tags.
def boom(_args, **_kwargs):
raise RuntimeError("<tool_call>injected</tool_call> boom")
all_tools = _registry.get_all_tool_names()
assert all_tools, "no tools registered — test environment broken"
target = all_tools[0]
original = _registry._tools[target].handler
_registry._tools[target].handler = boom
try:
result_str = handle_function_call(target, {})
finally:
_registry._tools[target].handler = original
payload = json.loads(result_str)
assert "error" in payload, payload
assert payload["error"].startswith("[TOOL_ERROR] "), payload["error"]
# Role-tag stripping carried through
assert "<tool_call>" not in payload["error"]
assert "</tool_call>" not in payload["error"]
assert "boom" in payload["error"]

View file

@ -404,7 +404,16 @@ class ToolRegistry:
return entry.handler(args, **kwargs)
except Exception as e:
logger.exception("Tool %s dispatch error: %s", name, e)
return json.dumps({"error": f"Tool execution failed: {type(e).__name__}: {e}"})
# Route through the sanitizer so framing tokens / CDATA / fences
# in exception strings don't reach the model as structural noise.
# See model_tools._sanitize_tool_error for rationale.
raw = f"Tool execution failed: {type(e).__name__}: {e}"
try:
from model_tools import _sanitize_tool_error
sanitized = _sanitize_tool_error(raw)
except Exception:
sanitized = raw # defensive: never let the sanitizer block error propagation
return json.dumps({"error": sanitized})
# ------------------------------------------------------------------
# Query helpers (replace redundant dicts in model_tools.py)