hermes-agent/tests/run_agent/test_unicode_ascii_codec.py
Teknium 93b6f45224 fix: always retry on ASCII codec UnicodeEncodeError — don't gate on per-component sanitization
The recovery block previously only retried (continue) when one of the
per-component sanitization checks (messages, tools, system prompt,
headers, credentials) found and stripped non-ASCII content.  When the
non-ASCII lived only in api_messages' reasoning_content field (which
is built from messages['reasoning'] and not checked by the original
_sanitize_messages_non_ascii), all checks returned False and the
recovery fell through to the normal error path — burning a retry
attempt despite _force_ascii_payload being set.

Now the recovery always continues (retries) when _is_ascii_codec is
detected.  The _force_ascii_payload flag guarantees the next iteration
runs _sanitize_structure_non_ascii(api_kwargs) on the full API payload,
catching any remaining non-ASCII regardless of where it lives.

Also adds test for the 'reasoning' field on canonical messages.

Fixes #6843
2026-04-15 15:03:28 -07:00

372 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for UnicodeEncodeError recovery with ASCII codec.
Covers the fix for issue #6843 — systems with ASCII locale (LANG=C)
that can't encode non-ASCII characters in API request payloads.
"""
import pytest
from run_agent import (
_strip_non_ascii,
_sanitize_messages_non_ascii,
_sanitize_structure_non_ascii,
_sanitize_tools_non_ascii,
_sanitize_messages_surrogates,
)
class TestStripNonAscii:
"""Tests for _strip_non_ascii helper."""
def test_ascii_only(self):
assert _strip_non_ascii("hello world") == "hello world"
def test_removes_non_ascii(self):
assert _strip_non_ascii("hello ⚕ world") == "hello world"
def test_removes_emoji(self):
assert _strip_non_ascii("test 🤖 done") == "test done"
def test_chinese_chars(self):
assert _strip_non_ascii("你好world") == "world"
def test_empty_string(self):
assert _strip_non_ascii("") == ""
def test_only_non_ascii(self):
assert _strip_non_ascii("⚕🤖") == ""
class TestSanitizeMessagesNonAscii:
"""Tests for _sanitize_messages_non_ascii."""
def test_no_change_ascii_only(self):
messages = [{"role": "user", "content": "hello"}]
assert _sanitize_messages_non_ascii(messages) is False
assert messages[0]["content"] == "hello"
def test_sanitizes_content_string(self):
messages = [{"role": "user", "content": "hello ⚕ world"}]
assert _sanitize_messages_non_ascii(messages) is True
assert messages[0]["content"] == "hello world"
def test_sanitizes_content_list(self):
messages = [{
"role": "user",
"content": [{"type": "text", "text": "hello 🤖"}]
}]
assert _sanitize_messages_non_ascii(messages) is True
assert messages[0]["content"][0]["text"] == "hello "
def test_sanitizes_name_field(self):
messages = [{"role": "tool", "name": "⚕tool", "content": "ok"}]
assert _sanitize_messages_non_ascii(messages) is True
assert messages[0]["name"] == "tool"
def test_sanitizes_tool_calls(self):
messages = [{
"role": "assistant",
"content": None,
"tool_calls": [{
"id": "call_1",
"type": "function",
"function": {
"name": "read_file",
"arguments": '{"path": "⚕test.txt"}'
}
}]
}]
assert _sanitize_messages_non_ascii(messages) is True
assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}'
def test_handles_non_dict_messages(self):
messages = ["not a dict", {"role": "user", "content": "hello"}]
assert _sanitize_messages_non_ascii(messages) is False
def test_empty_messages(self):
assert _sanitize_messages_non_ascii([]) is False
def test_multiple_messages(self):
messages = [
{"role": "system", "content": "⚕ System prompt"},
{"role": "user", "content": "Hello 你好"},
{"role": "assistant", "content": "Hi there!"},
]
assert _sanitize_messages_non_ascii(messages) is True
assert messages[0]["content"] == " System prompt"
assert messages[1]["content"] == "Hello "
assert messages[2]["content"] == "Hi there!"
class TestSurrogateVsAsciiSanitization:
"""Test that surrogate and ASCII sanitization work independently."""
def test_surrogates_still_handled(self):
"""Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii."""
msg_with_surrogate = "test \ud800 end"
messages = [{"role": "user", "content": msg_with_surrogate}]
assert _sanitize_messages_surrogates(messages) is True
assert "\ud800" not in messages[0]["content"]
assert "\ufffd" in messages[0]["content"]
def test_surrogates_in_name_and_tool_calls_are_sanitized(self):
messages = [{
"role": "assistant",
"name": "bad\ud800name",
"content": None,
"tool_calls": [{
"id": "call_\ud800",
"type": "function",
"function": {
"name": "read\ud800_file",
"arguments": '{"path": "bad\ud800.txt"}'
}
}],
}]
assert _sanitize_messages_surrogates(messages) is True
assert "\ud800" not in messages[0]["name"]
assert "\ud800" not in messages[0]["tool_calls"][0]["id"]
assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["name"]
assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["arguments"]
def test_ascii_codec_strips_all_non_ascii(self):
"""ASCII codec case: all non-ASCII is stripped, not replaced."""
messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]
assert _sanitize_messages_non_ascii(messages) is True
# All non-ASCII chars removed; spaces around them collapse
assert messages[0]["content"] == "test end"
def test_no_surrogates_returns_false(self):
"""When no surrogates present, _sanitize_messages_surrogates returns False."""
messages = [{"role": "user", "content": "hello ⚕ world"}]
assert _sanitize_messages_surrogates(messages) is False
class TestApiKeyNonAsciiSanitization:
"""Tests for API key sanitization in the UnicodeEncodeError recovery.
Covers the root cause of issue #6843: a non-ASCII character (ʋ U+028B)
in the API key causes httpx to fail when encoding the Authorization
header as ASCII. The recovery block must strip non-ASCII from the key.
"""
def test_strip_non_ascii_from_api_key(self):
"""_strip_non_ascii removes ʋ from an API key string."""
key = "sk-proj-abc" + "ʋ" + "def"
assert _strip_non_ascii(key) == "sk-proj-abcdef"
def test_api_key_at_position_153(self):
"""Reproduce the exact error: ʋ at position 153 in 'Bearer <key>'."""
key = "sk-proj-" + "a" * 138 + "ʋ" + "bcd"
auth_value = f"Bearer {key}"
# This is what httpx does — and it fails:
with pytest.raises(UnicodeEncodeError) as exc_info:
auth_value.encode("ascii")
assert exc_info.value.start == 153
# After sanitization, it should work:
sanitized_key = _strip_non_ascii(key)
sanitized_auth = f"Bearer {sanitized_key}"
sanitized_auth.encode("ascii") # should not raise
class TestSanitizeToolsNonAscii:
"""Tests for _sanitize_tools_non_ascii."""
def test_sanitizes_tool_description_and_parameter_descriptions(self):
tools = [
{
"type": "function",
"function": {
"name": "read_file",
"description": "Print structured output │ with emoji 🤖",
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "File path │ with unicode",
}
},
},
},
}
]
assert _sanitize_tools_non_ascii(tools) is True
assert tools[0]["function"]["description"] == "Print structured output with emoji "
assert tools[0]["function"]["parameters"]["properties"]["path"]["description"] == "File path with unicode"
def test_no_change_for_ascii_only_tools(self):
tools = [
{
"type": "function",
"function": {
"name": "read_file",
"description": "Read file content",
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "File path",
}
},
},
},
}
]
assert _sanitize_tools_non_ascii(tools) is False
class TestSanitizeStructureNonAscii:
def test_sanitizes_nested_dict_structure(self):
payload = {
"default_headers": {
"X-Title": "Hermes │ Agent",
"User-Agent": "Hermes/1.0 🤖",
}
}
assert _sanitize_structure_non_ascii(payload) is True
assert payload["default_headers"]["X-Title"] == "Hermes Agent"
assert payload["default_headers"]["User-Agent"] == "Hermes/1.0 "
class TestApiKeyClientSync:
"""Verify that ASCII recovery updates the live OpenAI client's api_key.
The OpenAI SDK stores its own copy of api_key which auth_headers reads
dynamically. If only self.api_key is updated but self.client.api_key
is not, the next request still sends the corrupted key in the
Authorization header.
"""
def test_client_api_key_updated_on_sanitize(self):
"""Simulate the recovery path and verify client.api_key is synced."""
from unittest.mock import MagicMock
from run_agent import AIAgent
agent = AIAgent.__new__(AIAgent)
bad_key = "sk-proj-abc\u028bdef" # ʋ lookalike at position 11
agent.api_key = bad_key
agent._client_kwargs = {"api_key": bad_key}
agent.quiet_mode = True
# Mock client with its own api_key attribute (like the real OpenAI client)
mock_client = MagicMock()
mock_client.api_key = bad_key
agent.client = mock_client
# --- replicate the recovery logic from run_agent.py ---
_raw_key = agent.api_key
_clean_key = _strip_non_ascii(_raw_key)
assert _clean_key != _raw_key, "test precondition: key should have non-ASCII"
agent.api_key = _clean_key
agent._client_kwargs["api_key"] = _clean_key
if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
agent.client.api_key = _clean_key
# All three locations should now hold the clean key
assert agent.api_key == "sk-proj-abcdef"
assert agent._client_kwargs["api_key"] == "sk-proj-abcdef"
assert agent.client.api_key == "sk-proj-abcdef"
# The bad char should be gone from all of them
assert "\u028b" not in agent.api_key
assert "\u028b" not in agent._client_kwargs["api_key"]
assert "\u028b" not in agent.client.api_key
def test_client_none_does_not_crash(self):
"""Recovery should not crash when client is None (pre-init)."""
from run_agent import AIAgent
agent = AIAgent.__new__(AIAgent)
bad_key = "sk-proj-\u028b"
agent.api_key = bad_key
agent._client_kwargs = {"api_key": bad_key}
agent.client = None
_clean_key = _strip_non_ascii(bad_key)
agent.api_key = _clean_key
agent._client_kwargs["api_key"] = _clean_key
if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"):
agent.client.api_key = _clean_key
assert agent.api_key == "sk-proj-"
assert agent.client is None # should not have been touched
class TestApiMessagesAndApiKwargsSanitized:
"""Regression tests for #6843 follow-up: api_messages and api_kwargs must
be sanitized alongside messages during ASCII-codec recovery.
The original fix only sanitized the canonical `messages` list.
api_messages is a separate API-copy built before the retry loop; it may
carry extra fields (reasoning_content, extra_body) with non-ASCII chars
that are not present in `messages`. Without sanitizing api_messages and
api_kwargs, the retry still raises UnicodeEncodeError even after the
'System encoding is ASCII — stripped...' log line appears.
"""
def test_api_messages_with_reasoning_content_is_sanitized(self):
"""api_messages may contain reasoning_content not in messages."""
api_messages = [
{"role": "system", "content": "You are helpful."},
{"role": "user", "content": "hi"},
{
"role": "assistant",
"content": "Sure!",
# reasoning_content is injected by the API-copy builder and
# is NOT present in the canonical messages list
"reasoning_content": "Let me think \xab step by step \xbb",
},
]
found = _sanitize_messages_non_ascii(api_messages)
assert found is True
assert "\xab" not in api_messages[2]["reasoning_content"]
assert "\xbb" not in api_messages[2]["reasoning_content"]
def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
"""api_kwargs may contain non-ASCII in extra_body or other fields."""
api_kwargs = {
"model": "glm-5.1",
"messages": [{"role": "user", "content": "ok"}],
"extra_body": {
"system": "Think carefully \u2192 answer",
},
}
found = _sanitize_structure_non_ascii(api_kwargs)
assert found is True
assert "\u2192" not in api_kwargs["extra_body"]["system"]
def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
"""Even when canonical messages are clean, api_messages may be dirty."""
messages = [{"role": "user", "content": "hello"}]
api_messages = [
{"role": "user", "content": "hello"},
{
"role": "assistant",
"content": "ok",
"reasoning_content": "step \xab done",
},
]
# messages sanitize returns False (nothing to clean)
assert _sanitize_messages_non_ascii(messages) is False
# api_messages sanitize must catch the dirty reasoning_content
assert _sanitize_messages_non_ascii(api_messages) is True
assert "\xab" not in api_messages[1]["reasoning_content"]
def test_reasoning_field_in_canonical_messages_is_sanitized(self):
"""The canonical messages list stores reasoning as 'reasoning', not
'reasoning_content'. The extra-fields loop must catch it."""
messages = [
{"role": "user", "content": "hello"},
{
"role": "assistant",
"content": "ok",
"reasoning": "Let me think \xab carefully \xbb",
},
]
assert _sanitize_messages_non_ascii(messages) is True
assert "\xab" not in messages[1]["reasoning"]
assert "\xbb" not in messages[1]["reasoning"]