mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: sanitize surrogate characters from clipboard paste to prevent UnicodeEncodeError (#3624)
Pasting text from rich-text editors (Google Docs, Word, etc.) can inject lone surrogate characters (U+D800..U+DFFF) that are invalid UTF-8. The OpenAI SDK serializes messages with ensure_ascii=False, then encodes to UTF-8 for the HTTP body — surrogates crash this with: UnicodeEncodeError: 'utf-8' codec can't encode character '\udce2' Three-layer fix: 1. Primary: sanitize user_message at the top of run_conversation() 2. CLI: sanitize in chat() before appending to conversation_history 3. Safety net: catch UnicodeEncodeError in the API error handler, sanitize the entire messages list in-place, and retry once. Also exclude UnicodeEncodeError from is_local_validation_error so it doesn't get classified as non-retryable. Includes 14 new tests covering the sanitization helpers and the integration with run_conversation().
This commit is contained in:
parent
b029742092
commit
857a5d7b47
3 changed files with 236 additions and 1 deletions
7
cli.py
7
cli.py
|
|
@ -5534,6 +5534,13 @@ class HermesCLI:
|
|||
except Exception as e:
|
||||
logging.debug("@ context reference expansion failed: %s", e)
|
||||
|
||||
# Sanitize surrogate characters that can arrive via clipboard paste from
|
||||
# rich-text editors (Google Docs, Word, etc.). Lone surrogates are invalid
|
||||
# UTF-8 and crash JSON serialization in the OpenAI SDK.
|
||||
if isinstance(message, str):
|
||||
from run_agent import _sanitize_surrogates
|
||||
message = _sanitize_surrogates(message)
|
||||
|
||||
# Add user message to history
|
||||
self.conversation_history.append({"role": "user", "content": message})
|
||||
|
||||
|
|
|
|||
76
run_agent.py
76
run_agent.py
|
|
@ -368,6 +368,48 @@ _BUDGET_WARNING_RE = re.compile(
|
|||
)
|
||||
|
||||
|
||||
# Regex to match lone surrogate code points (U+D800..U+DFFF).
|
||||
# These are invalid in UTF-8 and cause UnicodeEncodeError when the OpenAI SDK
|
||||
# serialises messages to JSON. Common source: clipboard paste from Google Docs
|
||||
# or other rich-text editors on some platforms.
|
||||
_SURROGATE_RE = re.compile(r'[\ud800-\udfff]')
|
||||
|
||||
|
||||
def _sanitize_surrogates(text: str) -> str:
|
||||
"""Replace lone surrogate code points with U+FFFD (replacement character).
|
||||
|
||||
Surrogates are invalid in UTF-8 and will crash ``json.dumps()`` inside the
|
||||
OpenAI SDK. This is a fast no-op when the text contains no surrogates.
|
||||
"""
|
||||
if _SURROGATE_RE.search(text):
|
||||
return _SURROGATE_RE.sub('\ufffd', text)
|
||||
return text
|
||||
|
||||
|
||||
def _sanitize_messages_surrogates(messages: list) -> bool:
|
||||
"""Sanitize surrogate characters from all string content in a messages list.
|
||||
|
||||
Walks message dicts in-place. Returns True if any surrogates were found
|
||||
and replaced, False otherwise.
|
||||
"""
|
||||
found = False
|
||||
for msg in messages:
|
||||
if not isinstance(msg, dict):
|
||||
continue
|
||||
content = msg.get("content")
|
||||
if isinstance(content, str) and _SURROGATE_RE.search(content):
|
||||
msg["content"] = _SURROGATE_RE.sub('\ufffd', content)
|
||||
found = True
|
||||
elif isinstance(content, list):
|
||||
for part in content:
|
||||
if isinstance(part, dict):
|
||||
text = part.get("text")
|
||||
if isinstance(text, str) and _SURROGATE_RE.search(text):
|
||||
part["text"] = _SURROGATE_RE.sub('\ufffd', text)
|
||||
found = True
|
||||
return found
|
||||
|
||||
|
||||
def _strip_budget_warnings_from_history(messages: list) -> None:
|
||||
"""Remove budget pressure warnings from tool-result messages in-place.
|
||||
|
||||
|
|
@ -5959,6 +6001,14 @@ class AIAgent:
|
|||
# Installed once, transparent when streams are healthy, prevents crash on write.
|
||||
_install_safe_stdio()
|
||||
|
||||
# Sanitize surrogate characters from user input. Clipboard paste from
|
||||
# rich-text editors (Google Docs, Word, etc.) can inject lone surrogates
|
||||
# that are invalid UTF-8 and crash JSON serialization in the OpenAI SDK.
|
||||
if isinstance(user_message, str):
|
||||
user_message = _sanitize_surrogates(user_message)
|
||||
if isinstance(persist_user_message, str):
|
||||
persist_user_message = _sanitize_surrogates(persist_user_message)
|
||||
|
||||
# Store stream callback for _interruptible_api_call to pick up
|
||||
self._stream_callback = stream_callback
|
||||
self._persist_user_message_idx = None
|
||||
|
|
@ -5975,6 +6025,7 @@ class AIAgent:
|
|||
self._codex_incomplete_retries = 0
|
||||
self._last_content_with_tools = None
|
||||
self._mute_post_response = False
|
||||
self._surrogate_sanitized = False
|
||||
# NOTE: _turns_since_memory and _iters_since_skill are NOT reset here.
|
||||
# They are initialized in __init__ and must persist across run_conversation
|
||||
# calls so that nudge logic accumulates correctly in CLI mode.
|
||||
|
|
@ -6810,6 +6861,24 @@ class AIAgent:
|
|||
if self.thinking_callback:
|
||||
self.thinking_callback("")
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# Surrogate character recovery. UnicodeEncodeError happens
|
||||
# when the messages contain lone surrogates (U+D800..U+DFFF)
|
||||
# that are invalid UTF-8. Common source: clipboard paste
|
||||
# from Google Docs or similar rich-text editors. We sanitize
|
||||
# the entire messages list in-place and retry once.
|
||||
# -----------------------------------------------------------
|
||||
if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_surrogate_sanitized', False):
|
||||
self._surrogate_sanitized = True
|
||||
if _sanitize_messages_surrogates(messages):
|
||||
self._vprint(
|
||||
f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...",
|
||||
force=True,
|
||||
)
|
||||
continue
|
||||
# Surrogates weren't in messages — might be in system
|
||||
# prompt or prefill. Fall through to normal error path.
|
||||
|
||||
status_code = getattr(api_error, "status_code", None)
|
||||
if (
|
||||
self.api_mode == "codex_responses"
|
||||
|
|
@ -7078,8 +7147,13 @@ class AIAgent:
|
|||
# 529 (Anthropic overloaded) is also transient.
|
||||
# Also catch local validation errors (ValueError, TypeError) — these
|
||||
# are programming bugs, not transient failures.
|
||||
# Exclude UnicodeEncodeError — it's a ValueError subclass but is
|
||||
# handled separately by the surrogate sanitization path above.
|
||||
_RETRYABLE_STATUS_CODES = {413, 429, 529}
|
||||
is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
|
||||
is_local_validation_error = (
|
||||
isinstance(api_error, (ValueError, TypeError))
|
||||
and not isinstance(api_error, UnicodeEncodeError)
|
||||
)
|
||||
# Detect generic 400s from Anthropic OAuth (transient server-side failures).
|
||||
# Real invalid_request_error responses include a descriptive message;
|
||||
# transient ones contain only "Error" or are empty. (ref: issue #1608)
|
||||
|
|
|
|||
154
tests/test_surrogate_sanitization.py
Normal file
154
tests/test_surrogate_sanitization.py
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
"""Tests for surrogate character sanitization in user input.
|
||||
|
||||
Surrogates (U+D800..U+DFFF) are invalid in UTF-8 and crash json.dumps()
|
||||
inside the OpenAI SDK. They can appear via clipboard paste from rich-text
|
||||
editors like Google Docs.
|
||||
"""
|
||||
import json
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from run_agent import (
|
||||
_sanitize_surrogates,
|
||||
_sanitize_messages_surrogates,
|
||||
_SURROGATE_RE,
|
||||
)
|
||||
|
||||
|
||||
class TestSanitizeSurrogates:
|
||||
"""Test the _sanitize_surrogates() helper."""
|
||||
|
||||
def test_normal_text_unchanged(self):
|
||||
text = "Hello, this is normal text with unicode: café ñ 日本語 🎉"
|
||||
assert _sanitize_surrogates(text) == text
|
||||
|
||||
def test_empty_string(self):
|
||||
assert _sanitize_surrogates("") == ""
|
||||
|
||||
def test_single_surrogate_replaced(self):
|
||||
result = _sanitize_surrogates("Hello \udce2 world")
|
||||
assert result == "Hello \ufffd world"
|
||||
|
||||
def test_multiple_surrogates_replaced(self):
|
||||
result = _sanitize_surrogates("a\ud800b\udc00c\udfff")
|
||||
assert result == "a\ufffdb\ufffdc\ufffd"
|
||||
|
||||
def test_all_surrogate_range(self):
|
||||
"""Verify the regex catches the full surrogate range."""
|
||||
for cp in [0xD800, 0xD900, 0xDA00, 0xDB00, 0xDC00, 0xDD00, 0xDE00, 0xDF00, 0xDFFF]:
|
||||
text = f"test{chr(cp)}end"
|
||||
result = _sanitize_surrogates(text)
|
||||
assert '\ufffd' in result, f"Surrogate U+{cp:04X} not caught"
|
||||
|
||||
def test_result_is_json_serializable(self):
|
||||
"""Sanitized text must survive json.dumps + utf-8 encoding."""
|
||||
dirty = "data \udce2\udcb0 from clipboard"
|
||||
clean = _sanitize_surrogates(dirty)
|
||||
serialized = json.dumps({"content": clean}, ensure_ascii=False)
|
||||
# Must not raise UnicodeEncodeError
|
||||
serialized.encode("utf-8")
|
||||
|
||||
def test_original_surrogates_fail_encoding(self):
|
||||
"""Confirm the original bug: surrogates crash utf-8 encoding."""
|
||||
dirty = "data \udce2 from clipboard"
|
||||
serialized = json.dumps({"content": dirty}, ensure_ascii=False)
|
||||
with pytest.raises(UnicodeEncodeError):
|
||||
serialized.encode("utf-8")
|
||||
|
||||
|
||||
class TestSanitizeMessagesSurrogates:
|
||||
"""Test the _sanitize_messages_surrogates() helper for message lists."""
|
||||
|
||||
def test_clean_messages_returns_false(self):
|
||||
msgs = [
|
||||
{"role": "user", "content": "all clean"},
|
||||
{"role": "assistant", "content": "me too"},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is False
|
||||
|
||||
def test_dirty_string_content_sanitized(self):
|
||||
msgs = [
|
||||
{"role": "user", "content": "text with \udce2 surrogate"},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert "\ufffd" in msgs[0]["content"]
|
||||
assert "\udce2" not in msgs[0]["content"]
|
||||
|
||||
def test_dirty_multimodal_content_sanitized(self):
|
||||
msgs = [
|
||||
{"role": "user", "content": [
|
||||
{"type": "text", "text": "multimodal \udce2 content"},
|
||||
{"type": "image_url", "image_url": {"url": "http://example.com"}},
|
||||
]},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert "\ufffd" in msgs[0]["content"][0]["text"]
|
||||
assert "\udce2" not in msgs[0]["content"][0]["text"]
|
||||
|
||||
def test_mixed_clean_and_dirty(self):
|
||||
msgs = [
|
||||
{"role": "user", "content": "clean text"},
|
||||
{"role": "user", "content": "dirty \udce2 text"},
|
||||
{"role": "assistant", "content": "clean response"},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert msgs[0]["content"] == "clean text"
|
||||
assert "\ufffd" in msgs[1]["content"]
|
||||
assert msgs[2]["content"] == "clean response"
|
||||
|
||||
def test_non_dict_items_skipped(self):
|
||||
msgs = ["not a dict", {"role": "user", "content": "ok"}]
|
||||
assert _sanitize_messages_surrogates(msgs) is False
|
||||
|
||||
def test_tool_messages_sanitized(self):
|
||||
"""Tool results could also contain surrogates from file reads etc."""
|
||||
msgs = [
|
||||
{"role": "tool", "content": "result with \udce2 data", "tool_call_id": "x"},
|
||||
]
|
||||
assert _sanitize_messages_surrogates(msgs) is True
|
||||
assert "\ufffd" in msgs[0]["content"]
|
||||
|
||||
|
||||
class TestRunConversationSurrogateSanitization:
|
||||
"""Integration: verify run_conversation sanitizes user_message."""
|
||||
|
||||
@patch("run_agent.AIAgent._build_system_prompt")
|
||||
@patch("run_agent.AIAgent._interruptible_streaming_api_call")
|
||||
@patch("run_agent.AIAgent._interruptible_api_call")
|
||||
def test_user_message_surrogates_sanitized(self, mock_api, mock_stream, mock_sys):
|
||||
"""Surrogates in user_message are stripped before API call."""
|
||||
from run_agent import AIAgent
|
||||
|
||||
mock_sys.return_value = "system prompt"
|
||||
|
||||
# Mock streaming to return a simple response
|
||||
mock_choice = MagicMock()
|
||||
mock_choice.message.content = "response"
|
||||
mock_choice.message.tool_calls = None
|
||||
mock_choice.message.refusal = None
|
||||
mock_choice.finish_reason = "stop"
|
||||
mock_choice.message.reasoning_content = None
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.choices = [mock_choice]
|
||||
mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=5, total_tokens=15)
|
||||
mock_response.model = "test-model"
|
||||
mock_response.id = "test-id"
|
||||
|
||||
mock_stream.return_value = mock_response
|
||||
mock_api.return_value = mock_response
|
||||
|
||||
agent = AIAgent(model="test/model", quiet_mode=True, skip_memory=True, skip_context_files=True)
|
||||
agent.client = MagicMock()
|
||||
|
||||
# Pass a message with surrogates
|
||||
result = agent.run_conversation(
|
||||
user_message="test \udce2 message",
|
||||
conversation_history=[],
|
||||
)
|
||||
|
||||
# The message stored in history should have surrogates replaced
|
||||
for msg in result.get("messages", []):
|
||||
if msg.get("role") == "user":
|
||||
assert "\udce2" not in msg["content"], "Surrogate leaked into stored message"
|
||||
assert "\ufffd" in msg["content"], "Replacement char not in stored message"
|
||||
Loading…
Add table
Add a link
Reference in a new issue