security: sanitize tool error strings before injecting into model context (#26823)

Adds _sanitize_tool_error() in model_tools and routes both error paths through it: registry.dispatch's try/except (the primary path for tool exceptions) and handle_function_call's outer except (defense in depth). Stripping targets structural framing tokens that the model itself can react to even though json.dumps already handles wire-layer escaping: XML role tags (tool_call, function_call, result, response, output, input, system, assistant, user), CDATA sections, and markdown code fences. Caps message body at 2000 chars and wraps with [TOOL_ERROR] prefix. Defense-in-depth: a tool exception carrying '<tool_call>...' won't break message framing (json escapes it), but the model still reads those tokens and they nudge it toward role-confusion framing. Ported from ironclaw#1639 (one piece of #3838's three-feature scout). The truncated-tool-call (#1632) and empty-response-recovery (#1677, #1720) pieces are skipped because main now implements both far more thoroughly (run_agent.py L8147/L12209/L13012 for truncation retry + length rewrite; L4500/L15090+ for empty-response scaffolding stripper, multi-stage nudge, fallback model activation).
2026-07-15 14:22:43 +00:00 · 2026-05-16 00:57:39 -07:00 · 2026-05-16 00:57:39 -07:00 · 627f8a5f1d
commit 627f8a5f1d
parent 70b663504f
3 changed files with 191 additions and 2 deletions
--- a/model_tools.py
+++ b/model_tools.py
@ -21,6 +21,7 @@ Public API (signatures preserved from the original 2,400-line version):
 """

 import json
+import re
 import asyncio
 import logging
 import threading
@ -485,6 +486,48 @@ _AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task"}
 _READ_SEARCH_TOOLS = {"read_file", "search_files"}


+# =========================================================================
+# Tool error sanitization
+# =========================================================================
+#
+# Tool exceptions can carry arbitrary text into the model's context as the
+# `tool` message content. json.dumps() handles quote/backslash escaping so a
+# raw injection of `</tool_call>` won't break message framing, but the model
+# still *reads* those tokens and they can confuse downstream tool-call
+# parsing or, in adversarial cases, nudge it toward role-confusion framing.
+#
+# This helper strips structural framing tokens (XML role tags, CDATA,
+# markdown code fences) and caps the message at a sane upper bound before it
+# becomes part of the conversation. It's defense-in-depth — the json layer
+# already prevents framing escape — but cheap and worth having.
+#
+# Ported from ironclaw#1639.
+_TOOL_ERROR_ROLE_TAG_RE = re.compile(
+    r'</?(?:tool_call|function_call|result|response|output|input|system|assistant|user)>',
+    re.IGNORECASE,  # bare <tag> / </tag> only: no attributes, no inner whitespace
+)
+_TOOL_ERROR_FENCE_OPEN_RE = re.compile(r'^\s*```(?:json|xml|html|markdown)?\s*', re.MULTILINE)  # fence at line start + optional language tag
+_TOOL_ERROR_FENCE_CLOSE_RE = re.compile(r'\s*```\s*$', re.MULTILINE)  # fence at line end
+_TOOL_ERROR_CDATA_RE = re.compile(r'<!\[CDATA\[.*?\]\]>', re.DOTALL)  # whole section incl. payload; may span lines
+_TOOL_ERROR_MAX_LEN = 2000  # cap for the message body only; the "[TOOL_ERROR] " prefix is not counted
+
+
+def _sanitize_tool_error(error_msg: str) -> str:
+    """Return '[TOOL_ERROR] ' + error_msg minus role tags, code fences and CDATA, capped in length.
+
+    Passes repeat until stable so a deletion cannot splice a new token (e.g. '<tool_<user>call>').
+    """
+    patterns = (_TOOL_ERROR_ROLE_TAG_RE, _TOOL_ERROR_FENCE_OPEN_RE, _TOOL_ERROR_FENCE_CLOSE_RE, _TOOL_ERROR_CDATA_RE)
+    sanitized, previous, truncated = error_msg or "", None, False
+    while sanitized != previous:  # each pass only deletes or cuts text, so a fixed point is reached
+        previous = sanitized
+        for pattern in patterns:
+            sanitized = pattern.sub("", sanitized)
+        if len(sanitized) > _TOOL_ERROR_MAX_LEN:  # cap now so every re-scan works on a bounded string
+            sanitized, truncated = sanitized[:_TOOL_ERROR_MAX_LEN - 3], True
+    return f"[TOOL_ERROR] {sanitized}{'...' if truncated else ''}"
+
+
 # =========================================================================
 # Tool argument type coercion
 # =========================================================================
@ -824,7 +867,7 @@ def handle_function_call(
    except Exception as e:
        error_msg = f"Error executing {function_name}: {str(e)}"
        logger.exception(error_msg)
-        return json.dumps({"error": error_msg}, ensure_ascii=False)
+        return json.dumps({"error": _sanitize_tool_error(error_msg)}, ensure_ascii=False)


 # =============================================================================