fix(unicode): sanitize surrogate metadata and allow two-pass retry

This commit is contained in:
Hermes Audit 2026-04-10 12:54:57 +00:00 committed by Teknium
parent 71036a7a75
commit 2c99b4e79b
2 changed files with 55 additions and 8 deletions

View file

@ -360,7 +360,8 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
"""Sanitize surrogate characters from all string content in a messages list. """Sanitize surrogate characters from all string content in a messages list.
Walks message dicts in-place. Returns True if any surrogates were found Walks message dicts in-place. Returns True if any surrogates were found
and replaced, False otherwise. and replaced, False otherwise. Covers content/text, name, and tool call
metadata/arguments so retries don't fail on a non-content field.
""" """
found = False found = False
for msg in messages: for msg in messages:
@ -377,6 +378,29 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
if isinstance(text, str) and _SURROGATE_RE.search(text): if isinstance(text, str) and _SURROGATE_RE.search(text):
part["text"] = _SURROGATE_RE.sub('\ufffd', text) part["text"] = _SURROGATE_RE.sub('\ufffd', text)
found = True found = True
name = msg.get("name")
if isinstance(name, str) and _SURROGATE_RE.search(name):
msg["name"] = _SURROGATE_RE.sub('\ufffd', name)
found = True
tool_calls = msg.get("tool_calls")
if isinstance(tool_calls, list):
for tc in tool_calls:
if not isinstance(tc, dict):
continue
tc_id = tc.get("id")
if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id):
tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id)
found = True
fn = tc.get("function")
if isinstance(fn, dict):
fn_name = fn.get("name")
if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name):
fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name)
found = True
fn_args = fn.get("arguments")
if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args):
fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args)
found = True
return found return found
@ -7242,7 +7266,7 @@ class AIAgent:
self._thinking_prefill_retries = 0 self._thinking_prefill_retries = 0
self._last_content_with_tools = None self._last_content_with_tools = None
self._mute_post_response = False self._mute_post_response = False
self._unicode_sanitized = False self._unicode_sanitization_passes = 0
# Pre-turn connection health check: detect and clean up dead TCP # Pre-turn connection health check: detect and clean up dead TCP
# connections left over from provider outages or dropped streams. # connections left over from provider outages or dropped streams.
@ -8233,14 +8257,16 @@ class AIAgent:
# 2. ASCII codec on systems with LANG=C or non-UTF-8 locale # 2. ASCII codec on systems with LANG=C or non-UTF-8 locale
# (e.g. Chromebooks) — any non-ASCII character fails. # (e.g. Chromebooks) — any non-ASCII character fails.
# Detect via the error message mentioning 'ascii' codec. # Detect via the error message mentioning 'ascii' codec.
# We sanitize messages in-place and retry once. # We sanitize messages in-place and may retry twice:
# first to strip surrogates, then once more for pure
# ASCII-only locale sanitization if needed.
# ----------------------------------------------------------- # -----------------------------------------------------------
if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_unicode_sanitized', False): if isinstance(api_error, UnicodeEncodeError) and getattr(self, '_unicode_sanitization_passes', 0) < 2:
self._unicode_sanitized = True
_err_str = str(api_error).lower() _err_str = str(api_error).lower()
_is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
_surrogates_found = _sanitize_messages_surrogates(messages) _surrogates_found = _sanitize_messages_surrogates(messages)
if _surrogates_found: if _surrogates_found:
self._unicode_sanitization_passes += 1
self._vprint( self._vprint(
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...", f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
force=True, force=True,
@ -8251,6 +8277,7 @@ class AIAgent:
# non-ASCII characters at all. Sanitize all # non-ASCII characters at all. Sanitize all
# non-ASCII content from messages and retry. # non-ASCII content from messages and retry.
if _sanitize_messages_non_ascii(messages): if _sanitize_messages_non_ascii(messages):
self._unicode_sanitization_passes += 1
self._vprint( self._vprint(
f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...", f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...",
force=True, force=True,

View file

@ -107,6 +107,26 @@ class TestSurrogateVsAsciiSanitization:
assert "\ud800" not in messages[0]["content"] assert "\ud800" not in messages[0]["content"]
assert "\ufffd" in messages[0]["content"] assert "\ufffd" in messages[0]["content"]
def test_surrogates_in_name_and_tool_calls_are_sanitized(self):
    """Surrogates in non-content fields are replaced with U+FFFD.

    Covers the message-level ``name`` plus each tool call's ``id``,
    ``function.name`` and ``function.arguments``. Exact expected values
    are asserted (rather than only the absence of the surrogate) so the
    test fails if a field is blanked or truncated instead of having the
    offending code point substituted.
    """
    messages = [{
        "role": "assistant",
        "name": "bad\ud800name",
        "content": None,
        "tool_calls": [{
            "id": "call_\ud800",
            "type": "function",
            "function": {
                "name": "read\ud800_file",
                "arguments": '{"path": "bad\ud800.txt"}',
            },
        }],
    }]

    assert _sanitize_messages_surrogates(messages) is True

    msg = messages[0]
    tool_call = msg["tool_calls"][0]
    fn = tool_call["function"]

    # Each lone surrogate becomes exactly one replacement character;
    # everything around it is preserved.
    assert msg["name"] == "bad\ufffdname"
    assert tool_call["id"] == "call_\ufffd"
    assert fn["name"] == "read\ufffd_file"
    assert fn["arguments"] == '{"path": "bad\ufffd.txt"}'

    # Fields without surrogates (or without string content) are untouched.
    assert msg["content"] is None
    assert tool_call["type"] == "function"
def test_ascii_codec_strips_all_non_ascii(self): def test_ascii_codec_strips_all_non_ascii(self):
"""ASCII codec case: all non-ASCII is stripped, not replaced.""" """ASCII codec case: all non-ASCII is stripped, not replaced."""
messages = [{"role": "user", "content": "test ⚕🤖你好 end"}] messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]