security: sanitize tool error strings before injecting into model context (#26823)

Adds _sanitize_tool_error() in model_tools and routes both error paths
through it: registry.dispatch's try/except (the primary path for tool
exceptions) and handle_function_call's outer except (defense in depth).

Stripping targets structural framing tokens that the model itself can
react to even though json.dumps already handles wire-layer escaping:
XML role tags (tool_call, function_call, result, response, output,
input, system, assistant, user), CDATA sections, and markdown code
fences. Caps message body at 2000 chars and wraps with [TOOL_ERROR]
prefix.

Defense-in-depth: a tool exception carrying '<tool_call>...' won't
break message framing (json escapes it), but the model still reads
those tokens and they nudge it toward role-confusion framing.

Ported from ironclaw#1639 (one piece of #3838's three-feature scout).
The truncated-tool-call (#1632) and empty-response-recovery (#1677,
#1720) pieces are skipped because main now implements both far more
thoroughly (run_agent.py L8147/L12209/L13012 for truncation retry +
length rewrite; L4500/L15090+ for empty-response scaffolding stripper,
multi-stage nudge, fallback model activation).
This commit is contained in:
Teknium 2026-05-16 00:57:39 -07:00 committed by GitHub
parent 70b663504f
commit 627f8a5f1d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 191 additions and 2 deletions

View file

@ -21,6 +21,7 @@ Public API (signatures preserved from the original 2,400-line version):
"""
import json
import re
import asyncio
import logging
import threading
@ -485,6 +486,48 @@ _AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task"}
# Set of the two tool names "read_file" and "search_files".
# NOTE(review): the code that consumes this set is outside the visible hunk —
# presumably it groups file read/search tools for special handling; confirm.
_READ_SEARCH_TOOLS = {"read_file", "search_files"}
# =========================================================================
# Tool error sanitization
# =========================================================================
#
# Tool exceptions can carry arbitrary text into the model's context as the
# `tool` message content. json.dumps() handles quote/backslash escaping so a
# raw injection of `</tool_call>` won't break message framing, but the model
# still *reads* those tokens and they can confuse downstream tool-call
# parsing or, in adversarial cases, nudge it toward role-confusion framing.
#
# This helper strips structural framing tokens (XML role tags, CDATA,
# markdown code fences) and caps the message at a sane upper bound before it
# becomes part of the conversation. It's defense-in-depth — the json layer
# already prevents framing escape — but cheap and worth having.
#
# Ported from ironclaw#1639.
_TOOL_ERROR_ROLE_TAG_RE = re.compile(
r'</?(?:tool_call|function_call|result|response|output|input|system|assistant|user)>',
re.IGNORECASE,
)
_TOOL_ERROR_FENCE_OPEN_RE = re.compile(r'^\s*```(?:json|xml|html|markdown)?\s*', re.MULTILINE)
_TOOL_ERROR_FENCE_CLOSE_RE = re.compile(r'\s*```\s*$', re.MULTILINE)
_TOOL_ERROR_CDATA_RE = re.compile(r'<!\[CDATA\[.*?\]\]>', re.DOTALL)
_TOOL_ERROR_MAX_LEN = 2000
def _sanitize_tool_error(error_msg: str) -> str:
"""Strip structural framing tokens from a tool error before showing it to the model.
See _TOOL_ERROR_ROLE_TAG_RE docstring above for rationale.
"""
if not error_msg:
return "[TOOL_ERROR] "
sanitized = _TOOL_ERROR_ROLE_TAG_RE.sub("", error_msg)
sanitized = _TOOL_ERROR_FENCE_OPEN_RE.sub("", sanitized)
sanitized = _TOOL_ERROR_FENCE_CLOSE_RE.sub("", sanitized)
sanitized = _TOOL_ERROR_CDATA_RE.sub("", sanitized)
if len(sanitized) > _TOOL_ERROR_MAX_LEN:
sanitized = sanitized[:_TOOL_ERROR_MAX_LEN - 3] + "..."
return f"[TOOL_ERROR] {sanitized}"
# =========================================================================
# Tool argument type coercion
# =========================================================================
@ -824,7 +867,7 @@ def handle_function_call(
except Exception as e:
error_msg = f"Error executing {function_name}: {str(e)}"
logger.exception(error_msg)
return json.dumps({"error": error_msg}, ensure_ascii=False)
return json.dumps({"error": _sanitize_tool_error(error_msg)}, ensure_ascii=False)
# =============================================================================