mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Byte-level reasoning models (xiaomi/mimo-v2-pro, kimi, glm) can emit lone surrogates in reasoning output. The proactive sanitizer walked content/ name/tool_calls but not extra fields like reasoning or the nested reasoning_details array. Surrogates in those fields survived the proactive pass, crashed json.dumps() in the OpenAI SDK, and the recovery block's _sanitize_messages_surrogates(messages) call also didn't check those fields — so 'found' was False, no retry happened, and after 3 attempts the user saw: API call failed after 3 retries. 'utf-8' codec can't encode characters in position N-M: surrogates not allowed Changes: - _sanitize_messages_surrogates: walk any extra string fields (reasoning, reasoning_content, etc.) and recurse into nested dict/list values (reasoning_details). Mirrors _sanitize_messages_non_ascii coverage added in PR #10537. - _sanitize_structure_surrogates: new recursive walker, mirror of _sanitize_structure_non_ascii but for surrogate recovery. - UnicodeEncodeError recovery block: also sanitize api_messages, api_kwargs, and prefill_messages (not just the canonical messages list — the API-copy carries reasoning_content transformed from reasoning and that's what the SDK actually serializes). Always retry on detected surrogate errors, not only when we found something to strip — gate on error type per PR #10537's pattern. Tests: extended tests/cli/test_surrogate_sanitization.py with coverage for reasoning, reasoning_content, reasoning_details (flat and deeply nested), structure walker, and an integration case that reproduces the exact api_messages shape that was crashing.
336 lines
13 KiB
Python
336 lines
13 KiB
Python
"""Tests for surrogate character sanitization in user input.
|
|
|
|
Surrogates (U+D800..U+DFFF) are invalid in UTF-8 and crash json.dumps()
|
|
inside the OpenAI SDK. They can appear via clipboard paste from rich-text
|
|
editors like Google Docs, OR from byte-level reasoning models (xiaomi/mimo,
|
|
kimi, glm) emitting lone halves in reasoning output.
|
|
"""
|
|
import json
|
|
import pytest
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from run_agent import (
|
|
_sanitize_surrogates,
|
|
_sanitize_messages_surrogates,
|
|
_sanitize_structure_surrogates,
|
|
_SURROGATE_RE,
|
|
)
|
|
|
|
|
|
class TestSanitizeSurrogates:
|
|
"""Test the _sanitize_surrogates() helper."""
|
|
|
|
def test_normal_text_unchanged(self):
|
|
text = "Hello, this is normal text with unicode: café ñ 日本語 🎉"
|
|
assert _sanitize_surrogates(text) == text
|
|
|
|
def test_empty_string(self):
|
|
assert _sanitize_surrogates("") == ""
|
|
|
|
def test_single_surrogate_replaced(self):
|
|
result = _sanitize_surrogates("Hello \udce2 world")
|
|
assert result == "Hello \ufffd world"
|
|
|
|
def test_multiple_surrogates_replaced(self):
|
|
result = _sanitize_surrogates("a\ud800b\udc00c\udfff")
|
|
assert result == "a\ufffdb\ufffdc\ufffd"
|
|
|
|
def test_all_surrogate_range(self):
|
|
"""Verify the regex catches the full surrogate range."""
|
|
for cp in [0xD800, 0xD900, 0xDA00, 0xDB00, 0xDC00, 0xDD00, 0xDE00, 0xDF00, 0xDFFF]:
|
|
text = f"test{chr(cp)}end"
|
|
result = _sanitize_surrogates(text)
|
|
assert '\ufffd' in result, f"Surrogate U+{cp:04X} not caught"
|
|
|
|
def test_result_is_json_serializable(self):
|
|
"""Sanitized text must survive json.dumps + utf-8 encoding."""
|
|
dirty = "data \udce2\udcb0 from clipboard"
|
|
clean = _sanitize_surrogates(dirty)
|
|
serialized = json.dumps({"content": clean}, ensure_ascii=False)
|
|
# Must not raise UnicodeEncodeError
|
|
serialized.encode("utf-8")
|
|
|
|
def test_original_surrogates_fail_encoding(self):
|
|
"""Confirm the original bug: surrogates crash utf-8 encoding."""
|
|
dirty = "data \udce2 from clipboard"
|
|
serialized = json.dumps({"content": dirty}, ensure_ascii=False)
|
|
with pytest.raises(UnicodeEncodeError):
|
|
serialized.encode("utf-8")
|
|
|
|
|
|
class TestSanitizeMessagesSurrogates:
|
|
"""Test the _sanitize_messages_surrogates() helper for message lists."""
|
|
|
|
def test_clean_messages_returns_false(self):
|
|
msgs = [
|
|
{"role": "user", "content": "all clean"},
|
|
{"role": "assistant", "content": "me too"},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is False
|
|
|
|
def test_dirty_string_content_sanitized(self):
|
|
msgs = [
|
|
{"role": "user", "content": "text with \udce2 surrogate"},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert "\ufffd" in msgs[0]["content"]
|
|
assert "\udce2" not in msgs[0]["content"]
|
|
|
|
def test_dirty_multimodal_content_sanitized(self):
|
|
msgs = [
|
|
{"role": "user", "content": [
|
|
{"type": "text", "text": "multimodal \udce2 content"},
|
|
{"type": "image_url", "image_url": {"url": "http://example.com"}},
|
|
]},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert "\ufffd" in msgs[0]["content"][0]["text"]
|
|
assert "\udce2" not in msgs[0]["content"][0]["text"]
|
|
|
|
def test_mixed_clean_and_dirty(self):
|
|
msgs = [
|
|
{"role": "user", "content": "clean text"},
|
|
{"role": "user", "content": "dirty \udce2 text"},
|
|
{"role": "assistant", "content": "clean response"},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert msgs[0]["content"] == "clean text"
|
|
assert "\ufffd" in msgs[1]["content"]
|
|
assert msgs[2]["content"] == "clean response"
|
|
|
|
def test_non_dict_items_skipped(self):
|
|
msgs = ["not a dict", {"role": "user", "content": "ok"}]
|
|
assert _sanitize_messages_surrogates(msgs) is False
|
|
|
|
def test_tool_messages_sanitized(self):
|
|
"""Tool results could also contain surrogates from file reads etc."""
|
|
msgs = [
|
|
{"role": "tool", "content": "result with \udce2 data", "tool_call_id": "x"},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert "\ufffd" in msgs[0]["content"]
|
|
|
|
|
|
class TestReasoningFieldSurrogates:
|
|
"""Surrogates in reasoning fields (byte-level reasoning models).
|
|
|
|
xiaomi/mimo, kimi, glm and similar byte-level tokenizers can emit lone
|
|
surrogates in reasoning output. These fields are carried through to the
|
|
API as `reasoning_content` on assistant messages, and must be sanitized
|
|
or json.dumps() crashes with 'utf-8' codec can't encode surrogates.
|
|
"""
|
|
|
|
def test_reasoning_field_sanitized(self):
|
|
msgs = [
|
|
{"role": "assistant", "content": "ok", "reasoning": "thought \udce2 here"},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert "\udce2" not in msgs[0]["reasoning"]
|
|
assert "\ufffd" in msgs[0]["reasoning"]
|
|
|
|
def test_reasoning_content_field_sanitized(self):
|
|
"""api_messages carry `reasoning_content` built from `reasoning`."""
|
|
msgs = [
|
|
{"role": "assistant", "content": "ok", "reasoning_content": "thought \udce2 here"},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert "\udce2" not in msgs[0]["reasoning_content"]
|
|
assert "\ufffd" in msgs[0]["reasoning_content"]
|
|
|
|
def test_reasoning_details_nested_sanitized(self):
|
|
"""reasoning_details is a list of dicts with nested string fields."""
|
|
msgs = [
|
|
{
|
|
"role": "assistant",
|
|
"content": "ok",
|
|
"reasoning_details": [
|
|
{"type": "reasoning.summary", "summary": "summary \udce2 text"},
|
|
{"type": "reasoning.text", "text": "chain \udc00 of thought"},
|
|
],
|
|
},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert "\udce2" not in msgs[0]["reasoning_details"][0]["summary"]
|
|
assert "\ufffd" in msgs[0]["reasoning_details"][0]["summary"]
|
|
assert "\udc00" not in msgs[0]["reasoning_details"][1]["text"]
|
|
assert "\ufffd" in msgs[0]["reasoning_details"][1]["text"]
|
|
|
|
def test_deeply_nested_reasoning_sanitized(self):
|
|
"""Nested dicts / lists inside extra fields are recursed into."""
|
|
msgs = [
|
|
{
|
|
"role": "assistant",
|
|
"content": "ok",
|
|
"reasoning_details": [
|
|
{
|
|
"type": "reasoning.encrypted",
|
|
"content": {
|
|
"encrypted_content": "opaque",
|
|
"text_parts": ["part1", "part2 \udce2 part"],
|
|
},
|
|
},
|
|
],
|
|
},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is True
|
|
assert (
|
|
msgs[0]["reasoning_details"][0]["content"]["text_parts"][1]
|
|
== "part2 \ufffd part"
|
|
)
|
|
|
|
def test_reasoning_end_to_end_json_serialization(self):
|
|
"""After sanitization, the full message dict must serialize clean."""
|
|
msgs = [
|
|
{
|
|
"role": "assistant",
|
|
"content": "answer",
|
|
"reasoning_content": "reasoning with \udce2 surrogate",
|
|
"reasoning_details": [
|
|
{"summary": "nested \udcb0 surrogate"},
|
|
],
|
|
},
|
|
]
|
|
_sanitize_messages_surrogates(msgs)
|
|
# Must round-trip through json + utf-8 encoding without error
|
|
payload = json.dumps(msgs, ensure_ascii=False).encode("utf-8")
|
|
assert b"\\" not in payload[:0] # sanity — just ensure we got bytes
|
|
assert len(payload) > 0
|
|
|
|
def test_no_surrogates_returns_false(self):
|
|
"""Clean reasoning fields don't trigger a modification."""
|
|
msgs = [
|
|
{
|
|
"role": "assistant",
|
|
"content": "ok",
|
|
"reasoning": "clean thought",
|
|
"reasoning_content": "also clean",
|
|
"reasoning_details": [{"summary": "clean summary"}],
|
|
},
|
|
]
|
|
assert _sanitize_messages_surrogates(msgs) is False
|
|
|
|
|
|
class TestSanitizeStructureSurrogates:
|
|
"""Test the _sanitize_structure_surrogates() helper for nested payloads."""
|
|
|
|
def test_empty_payload(self):
|
|
assert _sanitize_structure_surrogates({}) is False
|
|
assert _sanitize_structure_surrogates([]) is False
|
|
|
|
def test_flat_dict(self):
|
|
payload = {"a": "clean", "b": "dirty \udce2 text"}
|
|
assert _sanitize_structure_surrogates(payload) is True
|
|
assert payload["a"] == "clean"
|
|
assert "\ufffd" in payload["b"]
|
|
|
|
def test_flat_list(self):
|
|
payload = ["clean", "dirty \udce2"]
|
|
assert _sanitize_structure_surrogates(payload) is True
|
|
assert payload[0] == "clean"
|
|
assert "\ufffd" in payload[1]
|
|
|
|
def test_nested_dict_in_list(self):
|
|
payload = [{"x": "dirty \udce2"}, {"x": "clean"}]
|
|
assert _sanitize_structure_surrogates(payload) is True
|
|
assert "\ufffd" in payload[0]["x"]
|
|
assert payload[1]["x"] == "clean"
|
|
|
|
def test_deeply_nested(self):
|
|
payload = {
|
|
"level1": {
|
|
"level2": [
|
|
{"level3": "deep \udce2 surrogate"},
|
|
],
|
|
},
|
|
}
|
|
assert _sanitize_structure_surrogates(payload) is True
|
|
assert "\ufffd" in payload["level1"]["level2"][0]["level3"]
|
|
|
|
def test_clean_payload_returns_false(self):
|
|
payload = {"a": "clean", "b": [{"c": "also clean"}]}
|
|
assert _sanitize_structure_surrogates(payload) is False
|
|
|
|
def test_non_string_values_ignored(self):
|
|
payload = {"int": 42, "list": [1, 2, 3], "dict": {"none": None}, "bool": True}
|
|
assert _sanitize_structure_surrogates(payload) is False
|
|
# Non-string values survive unchanged
|
|
assert payload["int"] == 42
|
|
assert payload["list"] == [1, 2, 3]
|
|
|
|
|
|
class TestApiMessagesSurrogateRecovery:
|
|
"""Integration: verify the recovery block sanitizes api_messages.
|
|
|
|
The bug this guards against: a surrogate in `reasoning_content` on
|
|
api_messages (transformed from `reasoning` during build) crashes the
|
|
OpenAI SDK's json.dumps(), and the recovery block previously only
|
|
sanitized the canonical `messages` list — not `api_messages` — so the
|
|
next retry would send the same broken payload and fail 3 times.
|
|
"""
|
|
|
|
def test_api_messages_reasoning_content_sanitized(self):
|
|
"""The extended sanitizer catches reasoning_content in api_messages."""
|
|
api_messages = [
|
|
{"role": "system", "content": "sys"},
|
|
{
|
|
"role": "assistant",
|
|
"content": "response",
|
|
"reasoning_content": "thought \udce2 trail",
|
|
"tool_calls": [
|
|
{
|
|
"id": "call_1",
|
|
"function": {"name": "tool", "arguments": "{}"},
|
|
}
|
|
],
|
|
},
|
|
{"role": "tool", "content": "result", "tool_call_id": "call_1"},
|
|
]
|
|
assert _sanitize_messages_surrogates(api_messages) is True
|
|
assert "\udce2" not in api_messages[1]["reasoning_content"]
|
|
# Full payload must now serialize clean
|
|
json.dumps(api_messages, ensure_ascii=False).encode("utf-8")
|
|
|
|
|
|
class TestRunConversationSurrogateSanitization:
|
|
"""Integration: verify run_conversation sanitizes user_message."""
|
|
|
|
@patch("run_agent.AIAgent._build_system_prompt")
|
|
@patch("run_agent.AIAgent._interruptible_streaming_api_call")
|
|
@patch("run_agent.AIAgent._interruptible_api_call")
|
|
def test_user_message_surrogates_sanitized(self, mock_api, mock_stream, mock_sys):
|
|
"""Surrogates in user_message are stripped before API call."""
|
|
from run_agent import AIAgent
|
|
|
|
mock_sys.return_value = "system prompt"
|
|
|
|
# Mock streaming to return a simple response
|
|
mock_choice = MagicMock()
|
|
mock_choice.message.content = "response"
|
|
mock_choice.message.tool_calls = None
|
|
mock_choice.message.refusal = None
|
|
mock_choice.finish_reason = "stop"
|
|
mock_choice.message.reasoning_content = None
|
|
|
|
mock_response = MagicMock()
|
|
mock_response.choices = [mock_choice]
|
|
mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=5, total_tokens=15)
|
|
mock_response.model = "test-model"
|
|
mock_response.id = "test-id"
|
|
|
|
mock_stream.return_value = mock_response
|
|
mock_api.return_value = mock_response
|
|
|
|
agent = AIAgent(model="test/model", api_key="test-key", base_url="http://localhost:1234/v1", quiet_mode=True, skip_memory=True, skip_context_files=True)
|
|
agent.client = MagicMock()
|
|
|
|
# Pass a message with surrogates
|
|
result = agent.run_conversation(
|
|
user_message="test \udce2 message",
|
|
conversation_history=[],
|
|
)
|
|
|
|
# The message stored in history should have surrogates replaced
|
|
for msg in result.get("messages", []):
|
|
if msg.get("role") == "user":
|
|
assert "\udce2" not in msg["content"], "Surrogate leaked into stored message"
|
|
assert "\ufffd" in msg["content"], "Replacement char not in stored message"
|