mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: handle UnicodeEncodeError with ASCII codec (#6843)
Broaden the UnicodeEncodeError recovery to handle systems with ASCII-only locale (LANG=C, Chromebooks) where ANY non-ASCII character causes encoding failure, not just lone surrogates. Changes: - Add _strip_non_ascii() and _sanitize_messages_non_ascii() helpers that strip all non-ASCII characters from message content, name, and tool_calls - Update the UnicodeEncodeError handler to detect ASCII codec errors and fall back to non-ASCII sanitization after surrogate check fails - Sanitize tool_calls arguments and name fields (not just content) - Fix bare .encode() in cli.py suspend handler to use explicit utf-8 - Add comprehensive test suite (17 tests)
This commit is contained in:
parent
7e28b7b5d5
commit
71036a7a75
3 changed files with 205 additions and 11 deletions
2
cli.py
2
cli.py
|
|
@ -7999,7 +7999,7 @@ class HermesCLI:
|
|||
agent_name = get_active_skin().get_branding("agent_name", "Hermes Agent")
|
||||
msg = f"\n{agent_name} has been suspended. Run `fg` to bring {agent_name} back."
|
||||
def _suspend():
|
||||
os.write(1, msg.encode())
|
||||
os.write(1, msg.encode("utf-8", errors="replace"))
|
||||
os.kill(0, _sig.SIGTSTP)
|
||||
run_in_terminal(_suspend)
|
||||
|
||||
|
|
|
|||
94
run_agent.py
94
run_agent.py
|
|
@ -380,6 +380,65 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
|
|||
return found
|
||||
|
||||
|
||||
def _strip_non_ascii(text: str) -> str:
|
||||
"""Remove non-ASCII characters, replacing with closest ASCII equivalent or removing.
|
||||
|
||||
Used as a last resort when the system encoding is ASCII and can't handle
|
||||
any non-ASCII characters (e.g. LANG=C on Chromebooks).
|
||||
"""
|
||||
return text.encode('ascii', errors='ignore').decode('ascii')
|
||||
|
||||
|
||||
def _sanitize_messages_non_ascii(messages: list) -> bool:
|
||||
"""Strip non-ASCII characters from all string content in a messages list.
|
||||
|
||||
This is a last-resort recovery for systems with ASCII-only encoding
|
||||
(LANG=C, Chromebooks, minimal containers). Returns True if any
|
||||
non-ASCII content was found and sanitized.
|
||||
"""
|
||||
found = False
|
||||
for msg in messages:
|
||||
if not isinstance(msg, dict):
|
||||
continue
|
||||
# Sanitize content (string)
|
||||
content = msg.get("content")
|
||||
if isinstance(content, str):
|
||||
sanitized = _strip_non_ascii(content)
|
||||
if sanitized != content:
|
||||
msg["content"] = sanitized
|
||||
found = True
|
||||
elif isinstance(content, list):
|
||||
for part in content:
|
||||
if isinstance(part, dict):
|
||||
text = part.get("text")
|
||||
if isinstance(text, str):
|
||||
sanitized = _strip_non_ascii(text)
|
||||
if sanitized != text:
|
||||
part["text"] = sanitized
|
||||
found = True
|
||||
# Sanitize name field (can contain non-ASCII in tool results)
|
||||
name = msg.get("name")
|
||||
if isinstance(name, str):
|
||||
sanitized = _strip_non_ascii(name)
|
||||
if sanitized != name:
|
||||
msg["name"] = sanitized
|
||||
found = True
|
||||
# Sanitize tool_calls
|
||||
tool_calls = msg.get("tool_calls")
|
||||
if isinstance(tool_calls, list):
|
||||
for tc in tool_calls:
|
||||
if isinstance(tc, dict):
|
||||
fn = tc.get("function", {})
|
||||
if isinstance(fn, dict):
|
||||
fn_args = fn.get("arguments")
|
||||
if isinstance(fn_args, str):
|
||||
sanitized = _strip_non_ascii(fn_args)
|
||||
if sanitized != fn_args:
|
||||
fn["arguments"] = sanitized
|
||||
found = True
|
||||
return found
|
||||
|
||||
|
||||
def _strip_budget_warnings_from_history(messages: list) -> None:
|
||||
"""Remove budget pressure warnings from tool-result messages in-place.
|
||||
|
||||
|
|
@ -7183,7 +7242,7 @@ class AIAgent:
|
|||
self._thinking_prefill_retries = 0
|
||||
self._last_content_with_tools = None
|
||||
self._mute_post_response = False
|
||||
self._surrogate_sanitized = False
|
||||
self._unicode_sanitized = False
|
||||
|
||||
# Pre-turn connection health check: detect and clean up dead TCP
|
||||
# connections left over from provider outages or dropped streams.
|
||||
|
|
@ -8168,21 +8227,36 @@ class AIAgent:
|
|||
self.thinking_callback("")
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# Surrogate character recovery. UnicodeEncodeError happens
|
||||
# when the messages contain lone surrogates (U+D800..U+DFFF)
|
||||
# that are invalid UTF-8. Common source: clipboard paste
|
||||
# from Google Docs or similar rich-text editors. We sanitize
|
||||
# the entire messages list in-place and retry once.
|
||||
# UnicodeEncodeError recovery. Two common causes:
|
||||
# 1. Lone surrogates (U+D800..U+DFFF) from clipboard paste
|
||||
# (Google Docs, rich-text editors) — sanitize and retry.
|
||||
# 2. ASCII codec on systems with LANG=C or non-UTF-8 locale
|
||||
# (e.g. Chromebooks) — any non-ASCII character fails.
|
||||
# Detect via the error message mentioning 'ascii' codec.
|
||||
# We sanitize messages in-place and retry once.
|
||||
# -----------------------------------------------------------
|
||||
if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_surrogate_sanitized', False):
|
||||
self._surrogate_sanitized = True
|
||||
if _sanitize_messages_surrogates(messages):
|
||||
if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_unicode_sanitized', False):
|
||||
self._unicode_sanitized = True
|
||||
_err_str = str(api_error).lower()
|
||||
_is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
|
||||
_surrogates_found = _sanitize_messages_surrogates(messages)
|
||||
if _surrogates_found:
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
|
||||
force=True,
|
||||
)
|
||||
continue
|
||||
# Surrogates weren't in messages — might be in system
|
||||
if _is_ascii_codec:
|
||||
# ASCII codec: the system encoding can't handle
|
||||
# non-ASCII characters at all. Sanitize all
|
||||
# non-ASCII content from messages and retry.
|
||||
if _sanitize_messages_non_ascii(messages):
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...",
|
||||
force=True,
|
||||
)
|
||||
continue
|
||||
# Nothing to sanitize in messages — might be in system
|
||||
# prompt or prefill. Fall through to normal error path.
|
||||
|
||||
status_code = getattr(api_error, "status_code", None)
|
||||
|
|
|
|||
120
tests/run_agent/test_unicode_ascii_codec.py
Normal file
120
tests/run_agent/test_unicode_ascii_codec.py
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
"""Tests for UnicodeEncodeError recovery with ASCII codec.
|
||||
|
||||
Covers the fix for issue #6843 — systems with ASCII locale (LANG=C)
|
||||
that can't encode non-ASCII characters in API request payloads.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from run_agent import (
|
||||
_strip_non_ascii,
|
||||
_sanitize_messages_non_ascii,
|
||||
_sanitize_messages_surrogates,
|
||||
)
|
||||
|
||||
|
||||
class TestStripNonAscii:
|
||||
"""Tests for _strip_non_ascii helper."""
|
||||
|
||||
def test_ascii_only(self):
|
||||
assert _strip_non_ascii("hello world") == "hello world"
|
||||
|
||||
def test_removes_non_ascii(self):
|
||||
assert _strip_non_ascii("hello ⚕ world") == "hello world"
|
||||
|
||||
def test_removes_emoji(self):
|
||||
assert _strip_non_ascii("test 🤖 done") == "test done"
|
||||
|
||||
def test_chinese_chars(self):
|
||||
assert _strip_non_ascii("你好world") == "world"
|
||||
|
||||
def test_empty_string(self):
|
||||
assert _strip_non_ascii("") == ""
|
||||
|
||||
def test_only_non_ascii(self):
|
||||
assert _strip_non_ascii("⚕🤖") == ""
|
||||
|
||||
|
||||
class TestSanitizeMessagesNonAscii:
|
||||
"""Tests for _sanitize_messages_non_ascii."""
|
||||
|
||||
def test_no_change_ascii_only(self):
|
||||
messages = [{"role": "user", "content": "hello"}]
|
||||
assert _sanitize_messages_non_ascii(messages) is False
|
||||
assert messages[0]["content"] == "hello"
|
||||
|
||||
def test_sanitizes_content_string(self):
|
||||
messages = [{"role": "user", "content": "hello ⚕ world"}]
|
||||
assert _sanitize_messages_non_ascii(messages) is True
|
||||
assert messages[0]["content"] == "hello world"
|
||||
|
||||
def test_sanitizes_content_list(self):
|
||||
messages = [{
|
||||
"role": "user",
|
||||
"content": [{"type": "text", "text": "hello 🤖"}]
|
||||
}]
|
||||
assert _sanitize_messages_non_ascii(messages) is True
|
||||
assert messages[0]["content"][0]["text"] == "hello "
|
||||
|
||||
def test_sanitizes_name_field(self):
|
||||
messages = [{"role": "tool", "name": "⚕tool", "content": "ok"}]
|
||||
assert _sanitize_messages_non_ascii(messages) is True
|
||||
assert messages[0]["name"] == "tool"
|
||||
|
||||
def test_sanitizes_tool_calls(self):
|
||||
messages = [{
|
||||
"role": "assistant",
|
||||
"content": None,
|
||||
"tool_calls": [{
|
||||
"id": "call_1",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "read_file",
|
||||
"arguments": '{"path": "⚕test.txt"}'
|
||||
}
|
||||
}]
|
||||
}]
|
||||
assert _sanitize_messages_non_ascii(messages) is True
|
||||
assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}'
|
||||
|
||||
def test_handles_non_dict_messages(self):
|
||||
messages = ["not a dict", {"role": "user", "content": "hello"}]
|
||||
assert _sanitize_messages_non_ascii(messages) is False
|
||||
|
||||
def test_empty_messages(self):
|
||||
assert _sanitize_messages_non_ascii([]) is False
|
||||
|
||||
def test_multiple_messages(self):
|
||||
messages = [
|
||||
{"role": "system", "content": "⚕ System prompt"},
|
||||
{"role": "user", "content": "Hello 你好"},
|
||||
{"role": "assistant", "content": "Hi there!"},
|
||||
]
|
||||
assert _sanitize_messages_non_ascii(messages) is True
|
||||
assert messages[0]["content"] == " System prompt"
|
||||
assert messages[1]["content"] == "Hello "
|
||||
assert messages[2]["content"] == "Hi there!"
|
||||
|
||||
|
||||
class TestSurrogateVsAsciiSanitization:
|
||||
"""Test that surrogate and ASCII sanitization work independently."""
|
||||
|
||||
def test_surrogates_still_handled(self):
|
||||
"""Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii."""
|
||||
msg_with_surrogate = "test \ud800 end"
|
||||
messages = [{"role": "user", "content": msg_with_surrogate}]
|
||||
assert _sanitize_messages_surrogates(messages) is True
|
||||
assert "\ud800" not in messages[0]["content"]
|
||||
assert "\ufffd" in messages[0]["content"]
|
||||
|
||||
def test_ascii_codec_strips_all_non_ascii(self):
|
||||
"""ASCII codec case: all non-ASCII is stripped, not replaced."""
|
||||
messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
|
||||
assert _sanitize_messages_non_ascii(messages) is True
|
||||
# All non-ASCII chars removed; spaces around them collapse
|
||||
assert messages[0]["content"] == "test end"
|
||||
|
||||
def test_no_surrogates_returns_false(self):
|
||||
"""When no surrogates present, _sanitize_messages_surrogates returns False."""
|
||||
messages = [{"role": "user", "content": "hello ⚕ world"}]
|
||||
assert _sanitize_messages_surrogates(messages) is False
|
||||
Loading…
Add table
Add a link
Reference in a new issue