mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
security: sanitize tool error strings before injecting into model context (#26823)
Adds _sanitize_tool_error() in model_tools and routes both error paths through it: registry.dispatch's try/except (the primary path for tool exceptions) and handle_function_call's outer except (defense in depth). Stripping targets structural framing tokens that the model itself can react to even though json.dumps already handles wire-layer escaping: XML role tags (tool_call, function_call, result, response, output, input, system, assistant, user), CDATA sections, and markdown code fences. Caps message body at 2000 chars and wraps with [TOOL_ERROR] prefix. Defense-in-depth: a tool exception carrying '<tool_call>...' won't break message framing (json escapes it), but the model still reads those tokens and they nudge it toward role-confusion framing. Ported from ironclaw#1639 (one piece of #3838's three-feature scout). The truncated-tool-call (#1632) and empty-response-recovery (#1677, #1720) pieces are skipped because main now implements both far more thoroughly (run_agent.py L8147/L12209/L13012 for truncation retry + length rewrite; L4500/L15090+ for empty-response scaffolding stripper, multi-stage nudge, fallback model activation).
This commit is contained in:
parent
70b663504f
commit
627f8a5f1d
3 changed files with 191 additions and 2 deletions
|
|
@ -21,6 +21,7 @@ Public API (signatures preserved from the original 2,400-line version):
|
|||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
|
|
@ -485,6 +486,48 @@ _AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task"}
|
|||
_READ_SEARCH_TOOLS = {"read_file", "search_files"}
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tool error sanitization
|
||||
# =========================================================================
|
||||
#
|
||||
# Tool exceptions can carry arbitrary text into the model's context as the
|
||||
# `tool` message content. json.dumps() handles quote/backslash escaping so a
|
||||
# raw injection of `</tool_call>` won't break message framing, but the model
|
||||
# still *reads* those tokens and they can confuse downstream tool-call
|
||||
# parsing or, in adversarial cases, nudge it toward role-confusion framing.
|
||||
#
|
||||
# This helper strips structural framing tokens (XML role tags, CDATA,
|
||||
# markdown code fences) and caps the message at a sane upper bound before it
|
||||
# becomes part of the conversation. It's defense-in-depth — the json layer
|
||||
# already prevents framing escape — but cheap and worth having.
|
||||
#
|
||||
# Ported from ironclaw#1639.
|
||||
_TOOL_ERROR_ROLE_TAG_RE = re.compile(
|
||||
r'</?(?:tool_call|function_call|result|response|output|input|system|assistant|user)>',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_TOOL_ERROR_FENCE_OPEN_RE = re.compile(r'^\s*```(?:json|xml|html|markdown)?\s*', re.MULTILINE)
|
||||
_TOOL_ERROR_FENCE_CLOSE_RE = re.compile(r'\s*```\s*$', re.MULTILINE)
|
||||
_TOOL_ERROR_CDATA_RE = re.compile(r'<!\[CDATA\[.*?\]\]>', re.DOTALL)
|
||||
_TOOL_ERROR_MAX_LEN = 2000
|
||||
|
||||
|
||||
def _sanitize_tool_error(error_msg: str) -> str:
|
||||
"""Strip structural framing tokens from a tool error before showing it to the model.
|
||||
|
||||
See _TOOL_ERROR_ROLE_TAG_RE docstring above for rationale.
|
||||
"""
|
||||
if not error_msg:
|
||||
return "[TOOL_ERROR] "
|
||||
sanitized = _TOOL_ERROR_ROLE_TAG_RE.sub("", error_msg)
|
||||
sanitized = _TOOL_ERROR_FENCE_OPEN_RE.sub("", sanitized)
|
||||
sanitized = _TOOL_ERROR_FENCE_CLOSE_RE.sub("", sanitized)
|
||||
sanitized = _TOOL_ERROR_CDATA_RE.sub("", sanitized)
|
||||
if len(sanitized) > _TOOL_ERROR_MAX_LEN:
|
||||
sanitized = sanitized[:_TOOL_ERROR_MAX_LEN - 3] + "..."
|
||||
return f"[TOOL_ERROR] {sanitized}"
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Tool argument type coercion
|
||||
# =========================================================================
|
||||
|
|
@ -824,7 +867,7 @@ def handle_function_call(
|
|||
except Exception as e:
|
||||
error_msg = f"Error executing {function_name}: {str(e)}"
|
||||
logger.exception(error_msg)
|
||||
return json.dumps({"error": error_msg}, ensure_ascii=False)
|
||||
return json.dumps({"error": _sanitize_tool_error(error_msg)}, ensure_ascii=False)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue