fix: sanitize surrogate characters from clipboard paste to prevent UnicodeEncodeError (#3624)

Pasting text from rich-text editors (Google Docs, Word, etc.) can inject lone surrogate code points (U+D800..U+DFFF), which cannot be encoded as UTF-8. The OpenAI SDK serializes messages with ensure_ascii=False, then encodes the result to UTF-8 for the HTTP body — surrogates crash this step with: UnicodeEncodeError: 'utf-8' codec can't encode character '\udce2'.

Three-layer fix:
1. Primary: sanitize user_message at the top of run_conversation().
2. CLI: sanitize in chat() before appending to conversation_history.
3. Safety net: catch UnicodeEncodeError in the API error handler, sanitize the entire messages list in-place, and retry once.

Also exclude UnicodeEncodeError from is_local_validation_error so it doesn't get classified as non-retryable.

Includes 14 new tests covering the sanitization helpers and the integration with run_conversation().
2026-06-16 09:31:37 +00:00 · 2026-03-28 16:53:14 -07:00 · 2026-03-28 16:53:14 -07:00 · 857a5d7b47
commit 857a5d7b47
parent b029742092
3 changed files with 236 additions and 1 deletions
--- a/cli.py
+++ b/cli.py
@ -5534,6 +5534,13 @@ class HermesCLI:
            except Exception as e:
                logging.debug("@ context reference expansion failed: %s", e)

+        # Sanitize surrogate characters that can arrive via clipboard paste from
+        # rich-text editors (Google Docs, Word, etc.).  Lone surrogates are invalid
+        # UTF-8 and crash JSON serialization in the OpenAI SDK.
+        if isinstance(message, str):
+            from run_agent import _sanitize_surrogates
+            message = _sanitize_surrogates(message)
+
        # Add user message to history
        self.conversation_history.append({"role": "user", "content": message})

--- a/run_agent.py
+++ b/run_agent.py
@ -368,6 +368,48 @@ _BUDGET_WARNING_RE = re.compile(
 )


+# Regex matching any surrogate code point (U+D800..U+DFFF).
+# A Python str stores a valid astral character as one code point, so any
+# surrogate that appears in a str is unpaired ("lone").  Lone surrogates cannot
+# be encoded as UTF-8 and raise UnicodeEncodeError when the OpenAI SDK
+# serialises messages for the request.  Common source: clipboard paste from
+# Google Docs or other rich-text editors on some platforms.
+_SURROGATE_RE = re.compile(r'[\ud800-\udfff]')
+
+
+def _sanitize_surrogates(text: str) -> str:
+    """Return *text* with every surrogate code point swapped for U+FFFD.
+
+    Surrogates cannot be encoded as UTF-8, so a message that contains one
+    fails with ``UnicodeEncodeError`` once the serialized request is encoded.
+    Text without surrogates is handed back unchanged after a single regex
+    scan, which keeps the common case cheap.
+    """
+    if _SURROGATE_RE.search(text) is None:
+        # Nothing to replace: return the caller's string as-is.
+        return text
+    return _SURROGATE_RE.sub('\ufffd', text)
+
+
+def _sanitize_messages_surrogates(messages: list) -> bool:
+    """Replace surrogate code points in every string value of a messages list.
+
+    Walks each message dict in-place and descends into nested dicts and
+    lists, so plain-string ``content``, multimodal ``content`` parts and any
+    other string field (for example tool-call arguments) are all covered.
+    Top-level items that are not dicts are skipped, and dict keys are left
+    untouched.  The structure is assumed to be JSON-like, i.e. free of
+    reference cycles.
+
+    Returns True if any surrogates were found and replaced, False otherwise.
+    """
+    def _clean(value):
+        """Return ``(value, changed)`` after cleaning one nested value."""
+        if isinstance(value, str):
+            if _SURROGATE_RE.search(value):
+                return _SURROGATE_RE.sub('\ufffd', value), True
+            return value, False
+        if isinstance(value, dict):
+            # Snapshot the items so assigning back while looping is safe.
+            children = list(value.items())
+        elif isinstance(value, list):
+            children = list(enumerate(value))
+        else:
+            # Numbers, None, booleans, etc. cannot hold surrogates.
+            return value, False
+        changed = False
+        for key, child in children:
+            cleaned, child_changed = _clean(child)
+            if child_changed:
+                # Containers are mutated in place; only replaced strings
+                # need to be written back into their parent.
+                value[key] = cleaned
+                changed = True
+        return value, changed
+
+    found = False
+    for msg in messages:
+        if not isinstance(msg, dict):
+            continue
+        if _clean(msg)[1]:
+            found = True
+    return found
+
+
 def _strip_budget_warnings_from_history(messages: list) -> None:
    """Remove budget pressure warnings from tool-result messages in-place.

@ -5959,6 +6001,14 @@ class AIAgent:
        # Installed once, transparent when streams are healthy, prevents crash on write.
        _install_safe_stdio()

+        # Sanitize surrogate characters from user input.  Clipboard paste from
+        # rich-text editors (Google Docs, Word, etc.) can inject lone surrogates
+        # that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK.
+        if isinstance(user_message, str):
+            user_message = _sanitize_surrogates(user_message)
+        if isinstance(persist_user_message, str):
+            persist_user_message = _sanitize_surrogates(persist_user_message)
+
        # Store stream callback for _interruptible_api_call to pick up
        self._stream_callback = stream_callback
        self._persist_user_message_idx = None
@ -5975,6 +6025,7 @@ class AIAgent:
        self._codex_incomplete_retries = 0
        self._last_content_with_tools = None
        self._mute_post_response = False
+        self._surrogate_sanitized = False
        # NOTE: _turns_since_memory and _iters_since_skill are NOT reset here.
        # They are initialized in __init__ and must persist across run_conversation
        # calls so that nudge logic accumulates correctly in CLI mode.
@ -6810,6 +6861,24 @@ class AIAgent:
                    if self.thinking_callback:
                        self.thinking_callback("")

+                    # -----------------------------------------------------------
+                    # Surrogate character recovery.  UnicodeEncodeError happens
+                    # when the messages contain lone surrogates (U+D800..U+DFFF)
+                    # that are invalid UTF-8.  Common source: clipboard paste
+                    # from Google Docs or similar rich-text editors.  We sanitize
+                    # the entire messages list in-place and retry once.
+                    # -----------------------------------------------------------
+                    if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_surrogate_sanitized', False):
+                        self._surrogate_sanitized = True
+                        if _sanitize_messages_surrogates(messages):
+                            self._vprint(
+                                f"{self.log_prefix}⚠️  Stripped invalid surrogate characters from messages. Retrying...",
+                                force=True,
+                            )
+                            continue
+                        # Surrogates weren't in messages — might be in system
+                        # prompt or prefill.  Fall through to normal error path.
+
                    status_code = getattr(api_error, "status_code", None)
                    if (
                        self.api_mode == "codex_responses"
@ -7078,8 +7147,13 @@ class AIAgent:
                    # 529 (Anthropic overloaded) is also transient.
                    # Also catch local validation errors (ValueError, TypeError) — these
                    # are programming bugs, not transient failures.
+                    # Exclude UnicodeEncodeError — it's a ValueError subclass but is
+                    # handled separately by the surrogate sanitization path above.
                    _RETRYABLE_STATUS_CODES = {413, 429, 529}
-                    is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
+                    is_local_validation_error = (
+                        isinstance(api_error, (ValueError, TypeError))
+                        and not isinstance(api_error, UnicodeEncodeError)
+                    )
                    # Detect generic 400s from Anthropic OAuth (transient server-side failures).
                    # Real invalid_request_error responses include a descriptive message;
                    # transient ones contain only "Error" or are empty. (ref: issue #1608)
--- a/tests/test_surrogate_sanitization.py
+++ b/tests/test_surrogate_sanitization.py
@ -0,0 +1,154 @@
+"""Tests for surrogate character sanitization in user input.
+
+Surrogates (U+D800..U+DFFF) are invalid in UTF-8 and crash json.dumps()
+inside the OpenAI SDK. They can appear via clipboard paste from rich-text
+editors like Google Docs.
+"""
+import json
+import pytest
+from unittest.mock import MagicMock, patch
+
+from run_agent import (
+    _sanitize_surrogates,
+    _sanitize_messages_surrogates,
+    _SURROGATE_RE,
+)
+
+
+class TestSanitizeSurrogates:
+    """Test the _sanitize_surrogates() helper."""
+
+    def test_normal_text_unchanged(self):
+        """Valid Unicode, including an astral emoji, passes through untouched."""
+        text = "Hello, this is normal text with unicode: café ñ 日本語 🎉"
+        assert _sanitize_surrogates(text) == text
+
+    def test_empty_string(self):
+        """The empty string is returned as the empty string."""
+        assert _sanitize_surrogates("") == ""
+
+    def test_single_surrogate_replaced(self):
+        """One lone surrogate becomes exactly one U+FFFD."""
+        result = _sanitize_surrogates("Hello \udce2 world")
+        assert result == "Hello \ufffd world"
+
+    def test_multiple_surrogates_replaced(self):
+        """Each surrogate is replaced individually; other text is kept."""
+        result = _sanitize_surrogates("a\ud800b\udc00c\udfff")
+        assert result == "a\ufffdb\ufffdc\ufffd"
+
+    def test_all_surrogate_range(self):
+        """Verify the regex catches the full surrogate range and nothing else."""
+        # Every code point in the block must be replaced, not merely
+        # accompanied by a replacement character.
+        for cp in range(0xD800, 0xE000):
+            result = _sanitize_surrogates(f"test{chr(cp)}end")
+            assert result == "test\ufffdend", f"Surrogate U+{cp:04X} not caught"
+        # The neighbours on either side of the block are ordinary characters
+        # and must be left alone.
+        for cp in (0xD7FF, 0xE000):
+            text = f"test{chr(cp)}end"
+            assert _sanitize_surrogates(text) == text, f"U+{cp:04X} wrongly replaced"
+
+    def test_result_is_json_serializable(self):
+        """Sanitized text must survive json.dumps + utf-8 encoding."""
+        dirty = "data \udce2\udcb0 from clipboard"
+        clean = _sanitize_surrogates(dirty)
+        serialized = json.dumps({"content": clean}, ensure_ascii=False)
+        # Must not raise UnicodeEncodeError
+        serialized.encode("utf-8")
+
+    def test_original_surrogates_fail_encoding(self):
+        """Confirm the original bug: surrogates crash utf-8 encoding."""
+        dirty = "data \udce2 from clipboard"
+        # json.dumps itself succeeds with ensure_ascii=False; the failure
+        # only appears when the resulting str is encoded.
+        serialized = json.dumps({"content": dirty}, ensure_ascii=False)
+        with pytest.raises(UnicodeEncodeError):
+            serialized.encode("utf-8")
+
+
+class TestSanitizeMessagesSurrogates:
+    """Test the _sanitize_messages_surrogates() helper for message lists."""
+
+    def test_clean_messages_returns_false(self):
+        """A surrogate-free conversation reports that nothing was replaced."""
+        user_msg = {"role": "user", "content": "all clean"}
+        assistant_msg = {"role": "assistant", "content": "me too"}
+        assert _sanitize_messages_surrogates([user_msg, assistant_msg]) is False
+
+    def test_dirty_string_content_sanitized(self):
+        """Plain-string content is rewritten in place."""
+        dirty = {"role": "user", "content": "text with \udce2 surrogate"}
+        changed = _sanitize_messages_surrogates([dirty])
+        assert changed is True
+        assert "\ufffd" in dirty["content"]
+        assert "\udce2" not in dirty["content"]
+
+    def test_dirty_multimodal_content_sanitized(self):
+        """Text parts inside list-style content are rewritten in place."""
+        text_part = {"type": "text", "text": "multimodal \udce2 content"}
+        image_part = {"type": "image_url", "image_url": {"url": "http://example.com"}}
+        conversation = [{"role": "user", "content": [text_part, image_part]}]
+        changed = _sanitize_messages_surrogates(conversation)
+        assert changed is True
+        assert "\ufffd" in text_part["text"]
+        assert "\udce2" not in text_part["text"]
+
+    def test_mixed_clean_and_dirty(self):
+        """Only the message that holds a surrogate is modified."""
+        first = {"role": "user", "content": "clean text"}
+        second = {"role": "user", "content": "dirty \udce2 text"}
+        third = {"role": "assistant", "content": "clean response"}
+        changed = _sanitize_messages_surrogates([first, second, third])
+        assert changed is True
+        assert first["content"] == "clean text"
+        assert "\ufffd" in second["content"]
+        assert third["content"] == "clean response"
+
+    def test_non_dict_items_skipped(self):
+        """Entries that are not dicts are ignored rather than raising."""
+        conversation = ["not a dict", {"role": "user", "content": "ok"}]
+        assert _sanitize_messages_surrogates(conversation) is False
+
+    def test_tool_messages_sanitized(self):
+        """Tool results could also contain surrogates from file reads etc."""
+        tool_msg = {"role": "tool", "content": "result with \udce2 data", "tool_call_id": "x"}
+        changed = _sanitize_messages_surrogates([tool_msg])
+        assert changed is True
+        assert "\ufffd" in tool_msg["content"]
+
+
+class TestRunConversationSurrogateSanitization:
+    """Integration: verify run_conversation sanitizes user_message."""
+
+    @patch("run_agent.AIAgent._build_system_prompt")
+    @patch("run_agent.AIAgent._interruptible_streaming_api_call")
+    @patch("run_agent.AIAgent._interruptible_api_call")
+    def test_user_message_surrogates_sanitized(self, mock_api, mock_stream, mock_sys):
+        """Surrogates in user_message are stripped before API call."""
+        from run_agent import AIAgent
+
+        mock_sys.return_value = "system prompt"
+
+        # Mock streaming to return a simple response
+        mock_choice = MagicMock()
+        mock_choice.message.content = "response"
+        mock_choice.message.tool_calls = None
+        mock_choice.message.refusal = None
+        mock_choice.finish_reason = "stop"
+        mock_choice.message.reasoning_content = None
+
+        mock_response = MagicMock()
+        mock_response.choices = [mock_choice]
+        mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=5, total_tokens=15)
+        mock_response.model = "test-model"
+        mock_response.id = "test-id"
+
+        # Both call paths return the same canned response so the test does
+        # not depend on which one run_conversation picks.
+        mock_stream.return_value = mock_response
+        mock_api.return_value = mock_response
+
+        agent = AIAgent(model="test/model", quiet_mode=True, skip_memory=True, skip_context_files=True)
+        agent.client = MagicMock()
+
+        # Pass a message with surrogates
+        result = agent.run_conversation(
+            user_message="test \udce2 message",
+            conversation_history=[],
+        )
+
+        # The message stored in history should have surrogates replaced
+        user_contents = [
+            msg["content"]
+            for msg in result.get("messages", [])
+            if msg.get("role") == "user"
+        ]
+        # Without this guard the loop below would pass vacuously whenever the
+        # result carried no user message at all.
+        assert user_contents, "No user message found in result['messages']"
+        for content in user_contents:
+            assert "\udce2" not in content, "Surrogate leaked into stored message"
+            assert "\ufffd" in content, "Replacement char not in stored message"