fix: sanitize surrogate characters from clipboard paste to prevent UnicodeEncodeError (#3624)

Pasting text from rich-text editors (Google Docs, Word, etc.) can inject
lone surrogate characters (U+D800..U+DFFF) that are invalid UTF-8.
The OpenAI SDK serializes messages with ensure_ascii=False, then encodes
to UTF-8 for the HTTP body — surrogates crash this with:
  UnicodeEncodeError: 'utf-8' codec can't encode character '\udce2'

Three-layer fix:
1. Primary: sanitize user_message at the top of run_conversation()
2. CLI: sanitize in chat() before appending to conversation_history
3. Safety net: catch UnicodeEncodeError in the API error handler,
   sanitize the entire messages list in-place, and retry once.
   Also exclude UnicodeEncodeError from is_local_validation_error
   so it doesn't get classified as non-retryable.

Includes 14 new tests covering the sanitization helpers and the
integration with run_conversation().
This commit is contained in:
Teknium 2026-03-28 16:53:14 -07:00 committed by GitHub
parent b029742092
commit 857a5d7b47
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 236 additions and 1 deletions

7
cli.py
View file

@ -5534,6 +5534,13 @@ class HermesCLI:
except Exception as e:
logging.debug("@ context reference expansion failed: %s", e)
# Sanitize surrogate characters that can arrive via clipboard paste from
# rich-text editors (Google Docs, Word, etc.). Lone surrogates cannot be
# encoded as UTF-8, so the OpenAI SDK fails when encoding the request body.
if isinstance(message, str):
from run_agent import _sanitize_surrogates
message = _sanitize_surrogates(message)
# Add user message to history
self.conversation_history.append({"role": "user", "content": message})

View file

@ -368,6 +368,48 @@ _BUDGET_WARNING_RE = re.compile(
)
# Regex to match lone surrogate code points (U+D800..U+DFFF).
# These are invalid in UTF-8 and cause UnicodeEncodeError when the OpenAI SDK
# serialises messages to JSON. Common source: clipboard paste from Google Docs
# or other rich-text editors on some platforms.
_SURROGATE_RE = re.compile(r'[\ud800-\udfff]')
def _sanitize_surrogates(text: str) -> str:
"""Replace lone surrogate code points with U+FFFD (replacement character).
Surrogates are invalid in UTF-8 and will crash ``json.dumps()`` inside the
OpenAI SDK. This is a fast no-op when the text contains no surrogates.
"""
if _SURROGATE_RE.search(text):
return _SURROGATE_RE.sub('\ufffd', text)
return text
def _sanitize_messages_surrogates(messages: list) -> bool:
    """Replace surrogate code points in every string held by *messages*.

    Each dict in *messages* is walked in place, descending through nested
    dicts and lists.  Every string *value* found (plain ``content``,
    multimodal ``text`` parts, ``name``, tool-call ``arguments``, ...) has
    its surrogate code points replaced with U+FFFD.  Dict keys are never
    rewritten.

    Covering all string fields matters because the whole message is
    serialised into the request body: a surrogate left behind in, say, a
    tool call's ``arguments`` would make the retry fail with the same
    ``UnicodeEncodeError``.

    Top-level items that are not dicts are skipped untouched.

    Args:
        messages: List of chat message dicts; mutated in place.

    Returns:
        True if at least one string was changed, False otherwise.
    """

    def _clean(container) -> bool:
        # *container* is a dict or a list; repair its string values in place
        # and recurse into nested containers.
        if isinstance(container, dict):
            entries = list(container.items())
        else:
            entries = list(enumerate(container))
        changed = False
        for key, value in entries:
            if isinstance(value, str):
                if _SURROGATE_RE.search(value):
                    container[key] = _SURROGATE_RE.sub('\ufffd', value)
                    changed = True
            elif isinstance(value, (dict, list)):
                # Call first so a previous hit cannot short-circuit the walk.
                changed = _clean(value) or changed
        return changed

    found = False
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        if _clean(msg):
            found = True
    return found
def _strip_budget_warnings_from_history(messages: list) -> None:
"""Remove budget pressure warnings from tool-result messages in-place.
@ -5959,6 +6001,14 @@ class AIAgent:
# Installed once, transparent when streams are healthy, prevents crash on write.
_install_safe_stdio()
# Sanitize surrogate characters from user input. Clipboard paste from
# rich-text editors (Google Docs, Word, etc.) can inject lone surrogates
# that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK.
if isinstance(user_message, str):
user_message = _sanitize_surrogates(user_message)
if isinstance(persist_user_message, str):
persist_user_message = _sanitize_surrogates(persist_user_message)
# Store stream callback for _interruptible_api_call to pick up
self._stream_callback = stream_callback
self._persist_user_message_idx = None
@ -5975,6 +6025,7 @@ class AIAgent:
self._codex_incomplete_retries = 0
self._last_content_with_tools = None
self._mute_post_response = False
self._surrogate_sanitized = False
# NOTE: _turns_since_memory and _iters_since_skill are NOT reset here.
# They are initialized in __init__ and must persist across run_conversation
# calls so that nudge logic accumulates correctly in CLI mode.
@ -6810,6 +6861,24 @@ class AIAgent:
if self.thinking_callback:
self.thinking_callback("")
# -----------------------------------------------------------
# Surrogate character recovery. UnicodeEncodeError happens
# when the messages contain lone surrogates (U+D800..U+DFFF)
# that are invalid UTF-8. Common source: clipboard paste
# from Google Docs or similar rich-text editors. We sanitize
# the entire messages list in-place and retry once.
# -----------------------------------------------------------
if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_surrogate_sanitized', False):
self._surrogate_sanitized = True
if _sanitize_messages_surrogates(messages):
self._vprint(
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
force=True,
)
continue
# Surrogates weren't in messages — might be in system
# prompt or prefill. Fall through to normal error path.
status_code = getattr(api_error, "status_code", None)
if (
self.api_mode == "codex_responses"
@ -7078,8 +7147,13 @@ class AIAgent:
# 529 (Anthropic overloaded) is also transient.
# Also catch local validation errors (ValueError, TypeError) — these
# are programming bugs, not transient failures.
# Exclude UnicodeEncodeError — it's a ValueError subclass but is
# handled separately by the surrogate sanitization path above.
_RETRYABLE_STATUS_CODES = {413, 429, 529}
is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
is_local_validation_error = (
isinstance(api_error, (ValueError, TypeError))
and not isinstance(api_error, UnicodeEncodeError)
)
# Detect generic 400s from Anthropic OAuth (transient server-side failures).
# Real invalid_request_error responses include a descriptive message;
# transient ones contain only "Error" or are empty. (ref: issue #1608)

View file

@ -0,0 +1,154 @@
"""Tests for surrogate character sanitization in user input.
Lone surrogates (U+D800..U+DFFF) cannot be encoded as UTF-8, so the request
body the OpenAI SDK builds with json.dumps(..., ensure_ascii=False) fails to
encode. They can appear via clipboard paste from rich-text editors like
Google Docs.
"""
import json
import pytest
from unittest.mock import MagicMock, patch
from run_agent import (
_sanitize_surrogates,
_sanitize_messages_surrogates,
_SURROGATE_RE,
)
class TestSanitizeSurrogates:
    """Unit tests for the ``_sanitize_surrogates()`` string helper."""

    def test_normal_text_unchanged(self):
        # Accents, CJK and emoji are ordinary code points and must survive.
        sample = "Hello, this is normal text with unicode: café ñ 日本語 🎉"
        assert sample == _sanitize_surrogates(sample)

    def test_empty_string(self):
        cleaned = _sanitize_surrogates("")
        assert cleaned == ""

    def test_single_surrogate_replaced(self):
        assert _sanitize_surrogates("Hello \udce2 world") == "Hello \ufffd world"

    def test_multiple_surrogates_replaced(self):
        # Each surrogate is replaced on its own, one U+FFFD per code point.
        assert _sanitize_surrogates("a\ud800b\udc00c\udfff") == "a\ufffdb\ufffdc\ufffd"

    def test_all_surrogate_range(self):
        """Sample code points across U+D800..U+DFFF are all replaced."""
        code_points = (
            0xD800, 0xD900, 0xDA00, 0xDB00,
            0xDC00, 0xDD00, 0xDE00, 0xDF00, 0xDFFF,
        )
        for cp in code_points:
            result = _sanitize_surrogates(f"test{chr(cp)}end")
            assert '\ufffd' in result, f"Surrogate U+{cp:04X} not caught"

    def test_result_is_json_serializable(self):
        """Sanitized text goes through json.dumps and UTF-8 encoding."""
        clean = _sanitize_surrogates("data \udce2\udcb0 from clipboard")
        payload = json.dumps({"content": clean}, ensure_ascii=False)
        # Raises UnicodeEncodeError if any surrogate was left behind.
        payload.encode("utf-8")

    def test_original_surrogates_fail_encoding(self):
        """Reproduce the original bug: unsanitized text fails UTF-8 encoding."""
        payload = json.dumps(
            {"content": "data \udce2 from clipboard"},
            ensure_ascii=False,
        )
        with pytest.raises(UnicodeEncodeError):
            payload.encode("utf-8")
class TestSanitizeMessagesSurrogates:
    """Unit tests for ``_sanitize_messages_surrogates()`` on message lists."""

    def test_clean_messages_returns_false(self):
        history = [
            {"role": "user", "content": "all clean"},
            {"role": "assistant", "content": "me too"},
        ]
        changed = _sanitize_messages_surrogates(history)
        assert changed is False

    def test_dirty_string_content_sanitized(self):
        history = [{"role": "user", "content": "text with \udce2 surrogate"}]
        changed = _sanitize_messages_surrogates(history)
        assert changed is True
        content = history[0]["content"]
        assert "\ufffd" in content
        assert "\udce2" not in content

    def test_dirty_multimodal_content_sanitized(self):
        # Multimodal content is a list of parts; only the text part is dirty.
        text_part = {"type": "text", "text": "multimodal \udce2 content"}
        image_part = {"type": "image_url", "image_url": {"url": "http://example.com"}}
        history = [{"role": "user", "content": [text_part, image_part]}]
        changed = _sanitize_messages_surrogates(history)
        assert changed is True
        cleaned_text = history[0]["content"][0]["text"]
        assert "\ufffd" in cleaned_text
        assert "\udce2" not in cleaned_text

    def test_mixed_clean_and_dirty(self):
        history = [
            {"role": "user", "content": "clean text"},
            {"role": "user", "content": "dirty \udce2 text"},
            {"role": "assistant", "content": "clean response"},
        ]
        changed = _sanitize_messages_surrogates(history)
        assert changed is True
        first, second, third = history
        assert first["content"] == "clean text"
        assert "\ufffd" in second["content"]
        assert third["content"] == "clean response"

    def test_non_dict_items_skipped(self):
        # A bare string in the list is ignored, not treated as an error.
        history = ["not a dict", {"role": "user", "content": "ok"}]
        changed = _sanitize_messages_surrogates(history)
        assert changed is False

    def test_tool_messages_sanitized(self):
        """Tool-role messages are cleaned just like user messages."""
        tool_msg = {
            "role": "tool",
            "content": "result with \udce2 data",
            "tool_call_id": "x",
        }
        history = [tool_msg]
        changed = _sanitize_messages_surrogates(history)
        assert changed is True
        assert "\ufffd" in history[0]["content"]
class TestRunConversationSurrogateSanitization:
    """Integration: verify run_conversation sanitizes user_message."""

    @patch("run_agent.AIAgent._build_system_prompt")
    @patch("run_agent.AIAgent._interruptible_streaming_api_call")
    @patch("run_agent.AIAgent._interruptible_api_call")
    def test_user_message_surrogates_sanitized(self, mock_api, mock_stream, mock_sys):
        """Surrogates in user_message are replaced before the message is stored."""
        from run_agent import AIAgent

        mock_sys.return_value = "system prompt"

        # Build one canned completion and return it from both the streaming
        # and the non-streaming API entry points.
        mock_choice = MagicMock()
        mock_choice.message.content = "response"
        mock_choice.message.tool_calls = None
        mock_choice.message.refusal = None
        mock_choice.finish_reason = "stop"
        mock_choice.message.reasoning_content = None

        mock_response = MagicMock()
        mock_response.choices = [mock_choice]
        mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=5, total_tokens=15)
        mock_response.model = "test-model"
        mock_response.id = "test-id"

        mock_stream.return_value = mock_response
        mock_api.return_value = mock_response

        agent = AIAgent(model="test/model", quiet_mode=True, skip_memory=True, skip_context_files=True)
        agent.client = MagicMock()

        # Pass a message with surrogates
        result = agent.run_conversation(
            user_message="test \udce2 message",
            conversation_history=[],
        )

        # The message stored in history should have surrogates replaced.
        user_messages = [
            msg for msg in result.get("messages", [])
            if msg.get("role") == "user"
        ]
        # Guard against a vacuous pass: with no user message in the returned
        # history, the per-message assertions below would never execute.
        assert user_messages, "No user message found in returned messages"
        for msg in user_messages:
            assert "\udce2" not in msg["content"], "Surrogate leaked into stored message"
            assert "\ufffd" in msg["content"], "Replacement char not in stored message"