mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(unicode): sanitize surrogate metadata and allow two-pass retry
This commit is contained in:
parent
71036a7a75
commit
2c99b4e79b
2 changed files with 55 additions and 8 deletions
43
run_agent.py
43
run_agent.py
|
|
@@ -359,8 +359,9 @@ def _sanitize_surrogates(text: str) -> str:
|
||||||
def _sanitize_messages_surrogates(messages: list) -> bool:
|
def _sanitize_messages_surrogates(messages: list) -> bool:
|
||||||
"""Sanitize surrogate characters from all string content in a messages list.
|
"""Sanitize surrogate characters from all string content in a messages list.
|
||||||
|
|
||||||
Walks message dicts in-place. Returns True if any surrogates were found
|
Walks message dicts in-place. Returns True if any surrogates were found
|
||||||
and replaced, False otherwise.
|
and replaced, False otherwise. Covers content/text, name, and tool call
|
||||||
|
metadata/arguments so retries don't fail on a non-content field.
|
||||||
"""
|
"""
|
||||||
found = False
|
found = False
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
|
|
@@ -377,6 +378,29 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
|
||||||
if isinstance(text, str) and _SURROGATE_RE.search(text):
|
if isinstance(text, str) and _SURROGATE_RE.search(text):
|
||||||
part["text"] = _SURROGATE_RE.sub('\ufffd', text)
|
part["text"] = _SURROGATE_RE.sub('\ufffd', text)
|
||||||
found = True
|
found = True
|
||||||
|
name = msg.get("name")
|
||||||
|
if isinstance(name, str) and _SURROGATE_RE.search(name):
|
||||||
|
msg["name"] = _SURROGATE_RE.sub('\ufffd', name)
|
||||||
|
found = True
|
||||||
|
tool_calls = msg.get("tool_calls")
|
||||||
|
if isinstance(tool_calls, list):
|
||||||
|
for tc in tool_calls:
|
||||||
|
if not isinstance(tc, dict):
|
||||||
|
continue
|
||||||
|
tc_id = tc.get("id")
|
||||||
|
if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id):
|
||||||
|
tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id)
|
||||||
|
found = True
|
||||||
|
fn = tc.get("function")
|
||||||
|
if isinstance(fn, dict):
|
||||||
|
fn_name = fn.get("name")
|
||||||
|
if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name):
|
||||||
|
fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name)
|
||||||
|
found = True
|
||||||
|
fn_args = fn.get("arguments")
|
||||||
|
if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args):
|
||||||
|
fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args)
|
||||||
|
found = True
|
||||||
return found
|
return found
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -7242,7 +7266,7 @@ class AIAgent:
|
||||||
self._thinking_prefill_retries = 0
|
self._thinking_prefill_retries = 0
|
||||||
self._last_content_with_tools = None
|
self._last_content_with_tools = None
|
||||||
self._mute_post_response = False
|
self._mute_post_response = False
|
||||||
self._unicode_sanitized = False
|
self._unicode_sanitization_passes = 0
|
||||||
|
|
||||||
# Pre-turn connection health check: detect and clean up dead TCP
|
# Pre-turn connection health check: detect and clean up dead TCP
|
||||||
# connections left over from provider outages or dropped streams.
|
# connections left over from provider outages or dropped streams.
|
||||||
|
|
@@ -8233,14 +8257,16 @@ class AIAgent:
|
||||||
# 2. ASCII codec on systems with LANG=C or non-UTF-8 locale
|
# 2. ASCII codec on systems with LANG=C or non-UTF-8 locale
|
||||||
# (e.g. Chromebooks) — any non-ASCII character fails.
|
# (e.g. Chromebooks) — any non-ASCII character fails.
|
||||||
# Detect via the error message mentioning 'ascii' codec.
|
# Detect via the error message mentioning 'ascii' codec.
|
||||||
# We sanitize messages in-place and retry once.
|
# We sanitize messages in-place and may retry twice:
|
||||||
|
# first to strip surrogates, then once more for pure
|
||||||
|
# ASCII-only locale sanitization if needed.
|
||||||
# -----------------------------------------------------------
|
# -----------------------------------------------------------
|
||||||
if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_unicode_sanitized', False):
|
if isinstance(api_error, UnicodeEncodeError) and getattr(self, '_unicode_sanitization_passes', 0) < 2:
|
||||||
self._unicode_sanitized = True
|
|
||||||
_err_str = str(api_error).lower()
|
_err_str = str(api_error).lower()
|
||||||
_is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
|
_is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
|
||||||
_surrogates_found = _sanitize_messages_surrogates(messages)
|
_surrogates_found = _sanitize_messages_surrogates(messages)
|
||||||
if _surrogates_found:
|
if _surrogates_found:
|
||||||
|
self._unicode_sanitization_passes += 1
|
||||||
self._vprint(
|
self._vprint(
|
||||||
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
|
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
|
||||||
force=True,
|
force=True,
|
||||||
|
|
@@ -8248,16 +8274,17 @@ class AIAgent:
|
||||||
continue
|
continue
|
||||||
if _is_ascii_codec:
|
if _is_ascii_codec:
|
||||||
# ASCII codec: the system encoding can't handle
|
# ASCII codec: the system encoding can't handle
|
||||||
# non-ASCII characters at all. Sanitize all
|
# non-ASCII characters at all. Sanitize all
|
||||||
# non-ASCII content from messages and retry.
|
# non-ASCII content from messages and retry.
|
||||||
if _sanitize_messages_non_ascii(messages):
|
if _sanitize_messages_non_ascii(messages):
|
||||||
|
self._unicode_sanitization_passes += 1
|
||||||
self._vprint(
|
self._vprint(
|
||||||
f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...",
|
f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...",
|
||||||
force=True,
|
force=True,
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
# Nothing to sanitize in messages — might be in system
|
# Nothing to sanitize in messages — might be in system
|
||||||
# prompt or prefill. Fall through to normal error path.
|
# prompt or prefill. Fall through to normal error path.
|
||||||
|
|
||||||
status_code = getattr(api_error, "status_code", None)
|
status_code = getattr(api_error, "status_code", None)
|
||||||
error_context = self._extract_api_error_context(api_error)
|
error_context = self._extract_api_error_context(api_error)
|
||||||
|
|
|
||||||
|
|
@@ -107,6 +107,26 @@ class TestSurrogateVsAsciiSanitization:
|
||||||
assert "\ud800" not in messages[0]["content"]
|
assert "\ud800" not in messages[0]["content"]
|
||||||
assert "\ufffd" in messages[0]["content"]
|
assert "\ufffd" in messages[0]["content"]
|
||||||
|
|
||||||
|
def test_surrogates_in_name_and_tool_calls_are_sanitized(self):
|
||||||
|
messages = [{
|
||||||
|
"role": "assistant",
|
||||||
|
"name": "bad\ud800name",
|
||||||
|
"content": None,
|
||||||
|
"tool_calls": [{
|
||||||
|
"id": "call_\ud800",
|
||||||
|
"type": "function",
|
||||||
|
"function": {
|
||||||
|
"name": "read\ud800_file",
|
||||||
|
"arguments": '{"path": "bad\ud800.txt"}'
|
||||||
|
}
|
||||||
|
}],
|
||||||
|
}]
|
||||||
|
assert _sanitize_messages_surrogates(messages) is True
|
||||||
|
assert "\ud800" not in messages[0]["name"]
|
||||||
|
assert "\ud800" not in messages[0]["tool_calls"][0]["id"]
|
||||||
|
assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["name"]
|
||||||
|
assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["arguments"]
|
||||||
|
|
||||||
def test_ascii_codec_strips_all_non_ascii(self):
|
def test_ascii_codec_strips_all_non_ascii(self):
|
||||||
"""ASCII codec case: all non-ASCII is stripped, not replaced."""
|
"""ASCII codec case: all non-ASCII is stripped, not replaced."""
|
||||||
messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
|
messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue