From 71036a7a759aae7795d6853f84a9aa61d2f4fc4b Mon Sep 17 00:00:00 2001 From: Hermes Audit Date: Thu, 9 Apr 2026 23:21:42 +0000 Subject: [PATCH] fix: handle UnicodeEncodeError with ASCII codec (#6843) Broaden the UnicodeEncodeError recovery to handle systems with ASCII-only locale (LANG=C, Chromebooks) where ANY non-ASCII character causes encoding failure, not just lone surrogates. Changes: - Add _strip_non_ascii() and _sanitize_messages_non_ascii() helpers that strip all non-ASCII characters from message content, name, and tool_calls - Update the UnicodeEncodeError handler to detect ASCII codec errors and fall back to non-ASCII sanitization after surrogate check fails - Sanitize tool_calls arguments and name fields (not just content) - Fix bare .encode() in cli.py suspend handler to use explicit utf-8 - Add comprehensive test suite (17 tests) --- cli.py | 2 +- run_agent.py | 94 +++++++++++++-- tests/run_agent/test_unicode_ascii_codec.py | 120 ++++++++++++++++++++ 3 files changed, 205 insertions(+), 11 deletions(-) create mode 100644 tests/run_agent/test_unicode_ascii_codec.py diff --git a/cli.py b/cli.py index fb0691148..95c2839a1 100644 --- a/cli.py +++ b/cli.py @@ -7999,7 +7999,7 @@ class HermesCLI: agent_name = get_active_skin().get_branding("agent_name", "Hermes Agent") msg = f"\n{agent_name} has been suspended. Run `fg` to bring {agent_name} back." def _suspend(): - os.write(1, msg.encode()) + os.write(1, msg.encode("utf-8", errors="replace")) os.kill(0, _sig.SIGTSTP) run_in_terminal(_suspend) diff --git a/run_agent.py b/run_agent.py index 129eb1679..f69ed6fc2 100644 --- a/run_agent.py +++ b/run_agent.py @@ -380,6 +380,65 @@ def _sanitize_messages_surrogates(messages: list) -> bool: return found +def _strip_non_ascii(text: str) -> str: + """Remove non-ASCII characters, replacing with closest ASCII equivalent or removing. + + Used as a last resort when the system encoding is ASCII and can't handle + any non-ASCII characters (e.g. LANG=C on Chromebooks). 
+ """ + return text.encode('ascii', errors='ignore').decode('ascii') + + +def _sanitize_messages_non_ascii(messages: list) -> bool: + """Strip non-ASCII characters from all string content in a messages list. + + This is a last-resort recovery for systems with ASCII-only encoding + (LANG=C, Chromebooks, minimal containers). Returns True if any + non-ASCII content was found and sanitized. + """ + found = False + for msg in messages: + if not isinstance(msg, dict): + continue + # Sanitize content (string) + content = msg.get("content") + if isinstance(content, str): + sanitized = _strip_non_ascii(content) + if sanitized != content: + msg["content"] = sanitized + found = True + elif isinstance(content, list): + for part in content: + if isinstance(part, dict): + text = part.get("text") + if isinstance(text, str): + sanitized = _strip_non_ascii(text) + if sanitized != text: + part["text"] = sanitized + found = True + # Sanitize name field (can contain non-ASCII in tool results) + name = msg.get("name") + if isinstance(name, str): + sanitized = _strip_non_ascii(name) + if sanitized != name: + msg["name"] = sanitized + found = True + # Sanitize tool_calls + tool_calls = msg.get("tool_calls") + if isinstance(tool_calls, list): + for tc in tool_calls: + if isinstance(tc, dict): + fn = tc.get("function", {}) + if isinstance(fn, dict): + fn_args = fn.get("arguments") + if isinstance(fn_args, str): + sanitized = _strip_non_ascii(fn_args) + if sanitized != fn_args: + fn["arguments"] = sanitized + found = True + return found + + def _strip_budget_warnings_from_history(messages: list) -> None: """Remove budget pressure warnings from tool-result messages in-place. 
@@ -7183,7 +7242,7 @@ class AIAgent: self._thinking_prefill_retries = 0 self._last_content_with_tools = None self._mute_post_response = False - self._surrogate_sanitized = False + self._unicode_sanitized = False # Pre-turn connection health check: detect and clean up dead TCP # connections left over from provider outages or dropped streams. @@ -8168,21 +8227,36 @@ class AIAgent: self.thinking_callback("") # ----------------------------------------------------------- - # Surrogate character recovery. UnicodeEncodeError happens - # when the messages contain lone surrogates (U+D800..U+DFFF) - # that are invalid UTF-8. Common source: clipboard paste - # from Google Docs or similar rich-text editors. We sanitize - # the entire messages list in-place and retry once. + # UnicodeEncodeError recovery. Two common causes: + # 1. Lone surrogates (U+D800..U+DFFF) from clipboard paste + # (Google Docs, rich-text editors) — sanitize and retry. + # 2. ASCII codec on systems with LANG=C or non-UTF-8 locale + # (e.g. Chromebooks) — any non-ASCII character fails. + # Detect via the error message mentioning 'ascii' codec. + # We sanitize messages in-place and retry once. # ----------------------------------------------------------- - if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_surrogate_sanitized', False): - self._surrogate_sanitized = True - if _sanitize_messages_surrogates(messages): + if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_unicode_sanitized', False): + self._unicode_sanitized = True + _err_str = str(api_error).lower() + _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str + _surrogates_found = _sanitize_messages_surrogates(messages) + if _surrogates_found: self._vprint( f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. 
Retrying...", force=True, ) continue - # Surrogates weren't in messages — might be in system + if _is_ascii_codec: + # ASCII codec: the system encoding can't handle + # non-ASCII characters at all. Sanitize all + # non-ASCII content from messages and retry. + if _sanitize_messages_non_ascii(messages): + self._vprint( + f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...", + force=True, + ) + continue + # Nothing to sanitize in messages — might be in system # prompt or prefill. Fall through to normal error path. status_code = getattr(api_error, "status_code", None) diff --git a/tests/run_agent/test_unicode_ascii_codec.py b/tests/run_agent/test_unicode_ascii_codec.py new file mode 100644 index 000000000..d45790053 --- /dev/null +++ b/tests/run_agent/test_unicode_ascii_codec.py @@ -0,0 +1,120 @@ +"""Tests for UnicodeEncodeError recovery with ASCII codec. + +Covers the fix for issue #6843 — systems with ASCII locale (LANG=C) +that can't encode non-ASCII characters in API request payloads. 
"""Tests for UnicodeEncodeError recovery with ASCII codec.

Covers the fix for issue #6843: systems with ASCII locale (LANG=C)
that can't encode non-ASCII characters in API request payloads.

Note on expected values: stripping removes the offending characters only.
Whitespace on either side of a removed character is kept, so a character
that was delimited by spaces leaves two consecutive spaces behind.
"""

import pytest

from run_agent import (
    _strip_non_ascii,
    _sanitize_messages_non_ascii,
    _sanitize_messages_surrogates,
)


class TestStripNonAscii:
    """Tests for _strip_non_ascii helper."""

    def test_ascii_only(self):
        assert _strip_non_ascii("hello world") == "hello world"

    def test_removes_non_ascii(self):
        # The symbol goes away; both neighbouring spaces stay.
        assert _strip_non_ascii("hello ⚕ world") == "hello  world"

    def test_removes_emoji(self):
        assert _strip_non_ascii("test 🤖 done") == "test  done"

    def test_chinese_chars(self):
        assert _strip_non_ascii("你好world") == "world"

    def test_empty_string(self):
        assert _strip_non_ascii("") == ""

    def test_only_non_ascii(self):
        assert _strip_non_ascii("⚕🤖") == ""


class TestSanitizeMessagesNonAscii:
    """Tests for _sanitize_messages_non_ascii."""

    def test_no_change_ascii_only(self):
        messages = [{"role": "user", "content": "hello"}]
        assert _sanitize_messages_non_ascii(messages) is False
        assert messages[0]["content"] == "hello"

    def test_sanitizes_content_string(self):
        messages = [{"role": "user", "content": "hello ⚕ world"}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"] == "hello  world"

    def test_sanitizes_content_list(self):
        messages = [{
            "role": "user",
            "content": [{"type": "text", "text": "hello 🤖"}],
        }]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"][0]["text"] == "hello "

    def test_sanitizes_name_field(self):
        messages = [{"role": "tool", "name": "⚕tool", "content": "ok"}]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["name"] == "tool"

    def test_sanitizes_tool_calls(self):
        messages = [{
            "role": "assistant",
            "content": None,
            "tool_calls": [{
                "id": "call_1",
                "type": "function",
                "function": {
                    "name": "read_file",
                    "arguments": '{"path": "⚕test.txt"}',
                },
            }],
        }]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}'

    def test_handles_non_dict_messages(self):
        messages = ["not a dict", {"role": "user", "content": "hello"}]
        assert _sanitize_messages_non_ascii(messages) is False

    def test_empty_messages(self):
        assert _sanitize_messages_non_ascii([]) is False

    def test_multiple_messages(self):
        messages = [
            {"role": "system", "content": "⚕ System prompt"},
            {"role": "user", "content": "Hello 你好"},
            {"role": "assistant", "content": "Hi there!"},
        ]
        assert _sanitize_messages_non_ascii(messages) is True
        assert messages[0]["content"] == " System prompt"
        assert messages[1]["content"] == "Hello "
        assert messages[2]["content"] == "Hi there!"


class TestSurrogateVsAsciiSanitization:
    """Test that surrogate and ASCII sanitization work independently."""

    def test_surrogates_still_handled(self):
        """Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii."""
        msg_with_surrogate = "test \ud800 end"
        messages = [{"role": "user", "content": msg_with_surrogate}]
        assert _sanitize_messages_surrogates(messages) is True
        assert "\ud800" not in messages[0]["content"]
        assert "\ufffd" in messages[0]["content"]

    def test_ascii_codec_strips_all_non_ascii(self):
        """ASCII codec case: all non-ASCII is stripped, not replaced."""
        messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
        assert _sanitize_messages_non_ascii(messages) is True
        # Non-ASCII chars are removed outright; the space before and the
        # space after the removed run both remain (no whitespace collapsing).
        assert messages[0]["content"] == "test  end"

    def test_no_surrogates_returns_false(self):
        """When no surrogates present, _sanitize_messages_surrogates returns False."""
        messages = [{"role": "user", "content": "hello ⚕ world"}]
        assert _sanitize_messages_surrogates(messages) is False