fix: handle UnicodeEncodeError with ASCII codec (#6843)

Broaden the UnicodeEncodeError recovery to handle systems with an ASCII-only locale (LANG=C, Chromebooks), where ANY non-ASCII character causes an encoding failure, not just lone surrogates.

Changes:
- Add _strip_non_ascii() and _sanitize_messages_non_ascii() helpers that strip all non-ASCII characters from message content, name, and tool_calls
- Update the UnicodeEncodeError handler to detect ASCII codec errors and fall back to non-ASCII sanitization when the surrogate check finds nothing to fix
- Sanitize tool_calls arguments and name fields (not just content)
- Fix bare .encode() in cli.py suspend handler to use explicit utf-8
- Add comprehensive test suite (17 tests)
2026-07-29 18:46:59 +00:00 · 2026-04-09 23:21:42 +00:00 · 2026-04-09 23:21:42 +00:00 · 71036a7a75
commit 71036a7a75
parent 7e28b7b5d5
3 changed files with 205 additions and 11 deletions
--- a/cli.py
+++ b/cli.py
@ -7999,7 +7999,7 @@ class HermesCLI:
            agent_name = get_active_skin().get_branding("agent_name", "Hermes Agent")
            msg = f"\n{agent_name} has been suspended. Run `fg` to bring {agent_name} back."
            def _suspend():
-                os.write(1, msg.encode())
+                os.write(1, msg.encode("utf-8", errors="replace"))
                os.kill(0, _sig.SIGTSTP)
            run_in_terminal(_suspend)

--- a/run_agent.py
+++ b/run_agent.py
@ -380,6 +380,65 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
    return found


+def _strip_non_ascii(text: str) -> str:
+    """Drop every non-ASCII character from *text*; nothing is transliterated.
+
+    Used as a last resort when the system encoding is ASCII and can't handle
+    any non-ASCII characters (e.g. LANG=C on Chromebooks).
+    """
+    return text.encode('ascii', errors='ignore').decode('ascii')
+
+
+def _sanitize_messages_non_ascii(messages: list) -> bool:
+    """Strip non-ASCII characters from message string fields, in place.
+
+    Last-resort recovery for ASCII-only systems (LANG=C, Chromebooks, minimal
+    containers).  Touches "content" (str or list of text parts), "name" and
+    tool-call "arguments".  Returns True if any field was changed.
+    """
+    found = False
+    for msg in messages:
+        if not isinstance(msg, dict):
+            continue
+        # Sanitize content: a plain string, or a list of dict parts with "text"
+        content = msg.get("content")
+        if isinstance(content, str):
+            sanitized = _strip_non_ascii(content)
+            if sanitized != content:
+                msg["content"] = sanitized
+                found = True
+        elif isinstance(content, list):
+            for part in content:
+                if isinstance(part, dict):
+                    text = part.get("text")
+                    if isinstance(text, str):
+                        sanitized = _strip_non_ascii(text)
+                        if sanitized != text:
+                            part["text"] = sanitized
+                            found = True
+        # Sanitize name field (can contain non-ASCII in tool results)
+        name = msg.get("name")
+        if isinstance(name, str):
+            sanitized = _strip_non_ascii(name)
+            if sanitized != name:
+                msg["name"] = sanitized
+                found = True
+        # Sanitize tool_calls: only function "arguments" strings are rewritten
+        tool_calls = msg.get("tool_calls")
+        if isinstance(tool_calls, list):
+            for tc in tool_calls:
+                if isinstance(tc, dict):
+                    fn = tc.get("function", {})
+                    if isinstance(fn, dict):
+                        fn_args = fn.get("arguments")
+                        if isinstance(fn_args, str):
+                            sanitized = _strip_non_ascii(fn_args)
+                            if sanitized != fn_args:
+                                fn["arguments"] = sanitized
+                                found = True
+    return found
+
+
 def _strip_budget_warnings_from_history(messages: list) -> None:
    """Remove budget pressure warnings from tool-result messages in-place.

@ -7183,7 +7242,7 @@ class AIAgent:
        self._thinking_prefill_retries = 0
        self._last_content_with_tools = None
        self._mute_post_response = False
-        self._surrogate_sanitized = False
+        self._unicode_sanitized = False

        # Pre-turn connection health check: detect and clean up dead TCP
        # connections left over from provider outages or dropped streams.
@ -8168,21 +8227,36 @@ class AIAgent:
                        self.thinking_callback("")

                    # -----------------------------------------------------------
-                    # Surrogate character recovery.  UnicodeEncodeError happens
-                    # when the messages contain lone surrogates (U+D800..U+DFFF)
-                    # that are invalid UTF-8.  Common source: clipboard paste
-                    # from Google Docs or similar rich-text editors.  We sanitize
-                    # the entire messages list in-place and retry once.
+                    # UnicodeEncodeError recovery.  Two common causes:
+                    #   1. Lone surrogates (U+D800..U+DFFF) from clipboard paste
+                    #      (Google Docs, rich-text editors) — sanitize and retry.
+                    #   2. ASCII codec on systems with LANG=C or non-UTF-8 locale
+                    #      (e.g. Chromebooks) — any non-ASCII character fails.
+                    #      Detect via the error message mentioning 'ascii' codec.
+                    # We sanitize messages in-place and retry once.
                    # -----------------------------------------------------------
-                    if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_surrogate_sanitized', False):
-                        self._surrogate_sanitized = True
-                        if _sanitize_messages_surrogates(messages):
+                    if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_unicode_sanitized', False):
+                        self._unicode_sanitized = True
+                        _err_str = str(api_error).lower()
+                        _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
+                        _surrogates_found = _sanitize_messages_surrogates(messages)
+                        if _surrogates_found:
                            self._vprint(
                                f"{self.log_prefix}⚠️  Stripped invalid surrogate characters from messages. Retrying...",
                                force=True,
                            )
                            continue
-                        # Surrogates weren't in messages — might be in system
+                        if _is_ascii_codec:
+                            # ASCII codec: the system encoding can't handle
+                            # non-ASCII characters at all.  Sanitize all
+                            # non-ASCII content from messages and retry.
+                            if _sanitize_messages_non_ascii(messages):
+                                self._vprint(
+                                    f"{self.log_prefix}⚠️  System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...",
+                                    force=True,
+                                )
+                                continue
+                        # Nothing to sanitize in messages — might be in system
                        # prompt or prefill.  Fall through to normal error path.

                    status_code = getattr(api_error, "status_code", None)
--- a/tests/run_agent/test_unicode_ascii_codec.py
+++ b/tests/run_agent/test_unicode_ascii_codec.py
@ -0,0 +1,120 @@
+"""Tests for UnicodeEncodeError recovery with ASCII codec.
+
+Covers the fix for issue #6843 — systems with ASCII locale (LANG=C)
+that can't encode non-ASCII characters in API request payloads.
+"""
+
+import pytest
+
+from run_agent import (
+    _strip_non_ascii,
+    _sanitize_messages_non_ascii,
+    _sanitize_messages_surrogates,
+)
+
+
+class TestStripNonAscii:
+    """Tests for _strip_non_ascii: non-ASCII is dropped, ASCII is left as-is."""
+
+    def test_ascii_only(self):
+        assert _strip_non_ascii("hello world") == "hello world"
+
+    def test_removes_non_ascii(self):
+        assert _strip_non_ascii("hello ⚕ world") == "hello  world"
+
+    def test_removes_emoji(self):
+        assert _strip_non_ascii("test 🤖 done") == "test  done"
+
+    def test_chinese_chars(self):
+        assert _strip_non_ascii("你好world") == "world"
+
+    def test_empty_string(self):
+        assert _strip_non_ascii("") == ""
+
+    def test_only_non_ascii(self):
+        assert _strip_non_ascii("⚕🤖") == ""
+
+
+class TestSanitizeMessagesNonAscii:
+    """Tests for _sanitize_messages_non_ascii: in-place edits and bool result."""
+
+    def test_no_change_ascii_only(self):
+        messages = [{"role": "user", "content": "hello"}]
+        assert _sanitize_messages_non_ascii(messages) is False
+        assert messages[0]["content"] == "hello"
+
+    def test_sanitizes_content_string(self):
+        messages = [{"role": "user", "content": "hello ⚕ world"}]
+        assert _sanitize_messages_non_ascii(messages) is True
+        assert messages[0]["content"] == "hello  world"
+
+    def test_sanitizes_content_list(self):
+        messages = [{
+            "role": "user",
+            "content": [{"type": "text", "text": "hello 🤖"}]
+        }]
+        assert _sanitize_messages_non_ascii(messages) is True
+        assert messages[0]["content"][0]["text"] == "hello "
+
+    def test_sanitizes_name_field(self):
+        messages = [{"role": "tool", "name": "⚕tool", "content": "ok"}]
+        assert _sanitize_messages_non_ascii(messages) is True
+        assert messages[0]["name"] == "tool"
+
+    def test_sanitizes_tool_calls(self):
+        messages = [{
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [{
+                "id": "call_1",
+                "type": "function",
+                "function": {
+                    "name": "read_file",
+                    "arguments": '{"path": "⚕test.txt"}'
+                }
+            }]
+        }]
+        assert _sanitize_messages_non_ascii(messages) is True
+        assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}'
+
+    def test_handles_non_dict_messages(self):
+        messages = ["not a dict", {"role": "user", "content": "hello"}]
+        assert _sanitize_messages_non_ascii(messages) is False
+
+    def test_empty_messages(self):
+        assert _sanitize_messages_non_ascii([]) is False
+
+    def test_multiple_messages(self):
+        messages = [
+            {"role": "system", "content": "⚕ System prompt"},
+            {"role": "user", "content": "Hello 你好"},
+            {"role": "assistant", "content": "Hi there!"},
+        ]
+        assert _sanitize_messages_non_ascii(messages) is True
+        assert messages[0]["content"] == " System prompt"
+        assert messages[1]["content"] == "Hello "
+        assert messages[2]["content"] == "Hi there!"
+
+
+class TestSurrogateVsAsciiSanitization:
+    """Test that surrogate and ASCII sanitization work independently."""
+
+    def test_surrogates_still_handled(self):
+        """Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii."""
+        msg_with_surrogate = "test \ud800 end"
+        messages = [{"role": "user", "content": msg_with_surrogate}]
+        assert _sanitize_messages_surrogates(messages) is True
+        assert "\ud800" not in messages[0]["content"]
+        assert "\ufffd" in messages[0]["content"]
+
+    def test_ascii_codec_strips_all_non_ascii(self):
+        """ASCII codec case: all non-ASCII is stripped, not replaced."""
+        messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
+        assert _sanitize_messages_non_ascii(messages) is True
+        # All non-ASCII chars removed; the two surrounding spaces are kept
+        assert messages[0]["content"] == "test  end"
+
+    def test_no_surrogates_returns_false(self):
+        """When no surrogates present, _sanitize_messages_surrogates returns False."""
+        messages = [{"role": "user", "content": "hello ⚕ world"}]
+        assert _sanitize_messages_surrogates(messages) is False