"""Tests for UnicodeEncodeError recovery with ASCII codec. Covers the fix for issue #6843 β€” systems with ASCII locale (LANG=C) that can't encode non-ASCII characters in API request payloads. """ import pytest from run_agent import ( _strip_non_ascii, _sanitize_messages_non_ascii, _sanitize_structure_non_ascii, _sanitize_tools_non_ascii, _sanitize_messages_surrogates, ) class TestStripNonAscii: """Tests for _strip_non_ascii helper.""" def test_ascii_only(self): assert _strip_non_ascii("hello world") == "hello world" def test_removes_non_ascii(self): assert _strip_non_ascii("hello βš• world") == "hello world" def test_removes_emoji(self): assert _strip_non_ascii("test πŸ€– done") == "test done" def test_chinese_chars(self): assert _strip_non_ascii("δ½ ε₯½world") == "world" def test_empty_string(self): assert _strip_non_ascii("") == "" def test_only_non_ascii(self): assert _strip_non_ascii("βš•πŸ€–") == "" class TestSanitizeMessagesNonAscii: """Tests for _sanitize_messages_non_ascii.""" def test_no_change_ascii_only(self): messages = [{"role": "user", "content": "hello"}] assert _sanitize_messages_non_ascii(messages) is False assert messages[0]["content"] == "hello" def test_sanitizes_content_string(self): messages = [{"role": "user", "content": "hello βš• world"}] assert _sanitize_messages_non_ascii(messages) is True assert messages[0]["content"] == "hello world" def test_sanitizes_content_list(self): messages = [{ "role": "user", "content": [{"type": "text", "text": "hello πŸ€–"}] }] assert _sanitize_messages_non_ascii(messages) is True assert messages[0]["content"][0]["text"] == "hello " def test_sanitizes_name_field(self): messages = [{"role": "tool", "name": "βš•tool", "content": "ok"}] assert _sanitize_messages_non_ascii(messages) is True assert messages[0]["name"] == "tool" def test_sanitizes_tool_calls(self): messages = [{ "role": "assistant", "content": None, "tool_calls": [{ "id": "call_1", "type": "function", "function": { "name": "read_file", "arguments": '{"path": "βš•test.txt"}' } }] }] assert _sanitize_messages_non_ascii(messages) is True assert messages[0]["tool_calls"][0]["function"]["arguments"] == '{"path": "test.txt"}' def test_handles_non_dict_messages(self): messages = ["not a dict", {"role": "user", "content": "hello"}] assert _sanitize_messages_non_ascii(messages) is False def test_empty_messages(self): assert _sanitize_messages_non_ascii([]) is False def test_multiple_messages(self): messages = [ {"role": "system", "content": "βš• System prompt"}, {"role": "user", "content": "Hello δ½ ε₯½"}, {"role": "assistant", "content": "Hi there!"}, ] assert _sanitize_messages_non_ascii(messages) is True assert messages[0]["content"] == " System prompt" assert messages[1]["content"] == "Hello " assert messages[2]["content"] == "Hi there!" class TestSurrogateVsAsciiSanitization: """Test that surrogate and ASCII sanitization work independently.""" def test_surrogates_still_handled(self): """Surrogates are caught by _sanitize_messages_surrogates, not _non_ascii.""" msg_with_surrogate = "test \ud800 end" messages = [{"role": "user", "content": msg_with_surrogate}] assert _sanitize_messages_surrogates(messages) is True assert "\ud800" not in messages[0]["content"] assert "\ufffd" in messages[0]["content"] def test_surrogates_in_name_and_tool_calls_are_sanitized(self): messages = [{ "role": "assistant", "name": "bad\ud800name", "content": None, "tool_calls": [{ "id": "call_\ud800", "type": "function", "function": { "name": "read\ud800_file", "arguments": '{"path": "bad\ud800.txt"}' } }], }] assert _sanitize_messages_surrogates(messages) is True assert "\ud800" not in messages[0]["name"] assert "\ud800" not in messages[0]["tool_calls"][0]["id"] assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["name"] assert "\ud800" not in messages[0]["tool_calls"][0]["function"]["arguments"] def test_ascii_codec_strips_all_non_ascii(self): """ASCII codec case: all non-ASCII is stripped, not replaced.""" messages = [{"role": "user", "content": "test βš•πŸ€–δ½ ε₯½ end"}] assert _sanitize_messages_non_ascii(messages) is True # All non-ASCII chars removed; spaces around them collapse assert messages[0]["content"] == "test end" def test_no_surrogates_returns_false(self): """When no surrogates present, _sanitize_messages_surrogates returns False.""" messages = [{"role": "user", "content": "hello βš• world"}] assert _sanitize_messages_surrogates(messages) is False class TestApiKeyNonAsciiSanitization: """Tests for API key sanitization in the UnicodeEncodeError recovery. Covers the root cause of issue #6843: a non-ASCII character (Κ‹ U+028B) in the API key causes httpx to fail when encoding the Authorization header as ASCII. The recovery block must strip non-ASCII from the key. """ def test_strip_non_ascii_from_api_key(self): """_strip_non_ascii removes Κ‹ from an API key string.""" key = "sk-proj-abc" + "Κ‹" + "def" assert _strip_non_ascii(key) == "sk-proj-abcdef" def test_api_key_at_position_153(self): """Reproduce the exact error: Κ‹ at position 153 in 'Bearer '.""" key = "sk-proj-" + "a" * 138 + "Κ‹" + "bcd" auth_value = f"Bearer {key}" # This is what httpx does β€” and it fails: with pytest.raises(UnicodeEncodeError) as exc_info: auth_value.encode("ascii") assert exc_info.value.start == 153 # After sanitization, it should work: sanitized_key = _strip_non_ascii(key) sanitized_auth = f"Bearer {sanitized_key}" sanitized_auth.encode("ascii") # should not raise class TestSanitizeToolsNonAscii: """Tests for _sanitize_tools_non_ascii.""" def test_sanitizes_tool_description_and_parameter_descriptions(self): tools = [ { "type": "function", "function": { "name": "read_file", "description": "Print structured output β”‚ with emoji πŸ€–", "parameters": { "type": "object", "properties": { "path": { "type": "string", "description": "File path β”‚ with unicode", } }, }, }, } ] assert _sanitize_tools_non_ascii(tools) is True assert tools[0]["function"]["description"] == "Print structured output with emoji " assert tools[0]["function"]["parameters"]["properties"]["path"]["description"] == "File path with unicode" def test_no_change_for_ascii_only_tools(self): tools = [ { "type": "function", "function": { "name": "read_file", "description": "Read file content", "parameters": { "type": "object", "properties": { "path": { "type": "string", "description": "File path", } }, }, }, } ] assert _sanitize_tools_non_ascii(tools) is False class TestSanitizeStructureNonAscii: def test_sanitizes_nested_dict_structure(self): payload = { "default_headers": { "X-Title": "Hermes β”‚ Agent", "User-Agent": "Hermes/1.0 πŸ€–", } } assert _sanitize_structure_non_ascii(payload) is True assert payload["default_headers"]["X-Title"] == "Hermes Agent" assert payload["default_headers"]["User-Agent"] == "Hermes/1.0 " class TestApiKeyClientSync: """Verify that ASCII recovery updates the live OpenAI client's api_key. The OpenAI SDK stores its own copy of api_key which auth_headers reads dynamically. If only self.api_key is updated but self.client.api_key is not, the next request still sends the corrupted key in the Authorization header. """ def test_client_api_key_updated_on_sanitize(self): """Simulate the recovery path and verify client.api_key is synced.""" from unittest.mock import MagicMock from run_agent import AIAgent agent = AIAgent.__new__(AIAgent) bad_key = "sk-proj-abc\u028bdef" # Κ‹ lookalike at position 11 agent.api_key = bad_key agent._client_kwargs = {"api_key": bad_key} agent.quiet_mode = True # Mock client with its own api_key attribute (like the real OpenAI client) mock_client = MagicMock() mock_client.api_key = bad_key agent.client = mock_client # --- replicate the recovery logic from run_agent.py --- _raw_key = agent.api_key _clean_key = _strip_non_ascii(_raw_key) assert _clean_key != _raw_key, "test precondition: key should have non-ASCII" agent.api_key = _clean_key agent._client_kwargs["api_key"] = _clean_key if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"): agent.client.api_key = _clean_key # All three locations should now hold the clean key assert agent.api_key == "sk-proj-abcdef" assert agent._client_kwargs["api_key"] == "sk-proj-abcdef" assert agent.client.api_key == "sk-proj-abcdef" # The bad char should be gone from all of them assert "\u028b" not in agent.api_key assert "\u028b" not in agent._client_kwargs["api_key"] assert "\u028b" not in agent.client.api_key def test_client_none_does_not_crash(self): """Recovery should not crash when client is None (pre-init).""" from run_agent import AIAgent agent = AIAgent.__new__(AIAgent) bad_key = "sk-proj-\u028b" agent.api_key = bad_key agent._client_kwargs = {"api_key": bad_key} agent.client = None _clean_key = _strip_non_ascii(bad_key) agent.api_key = _clean_key agent._client_kwargs["api_key"] = _clean_key if getattr(agent, "client", None) is not None and hasattr(agent.client, "api_key"): agent.client.api_key = _clean_key assert agent.api_key == "sk-proj-" assert agent.client is None # should not have been touched class TestApiMessagesAndApiKwargsSanitized: """Regression tests for #6843 follow-up: api_messages and api_kwargs must be sanitized alongside messages during ASCII-codec recovery. The original fix only sanitized the canonical `messages` list. api_messages is a separate API-copy built before the retry loop; it may carry extra fields (reasoning_content, extra_body) with non-ASCII chars that are not present in `messages`. Without sanitizing api_messages and api_kwargs, the retry still raises UnicodeEncodeError even after the 'System encoding is ASCII β€” stripped...' log line appears. """ def test_api_messages_with_reasoning_content_is_sanitized(self): """api_messages may contain reasoning_content not in messages.""" api_messages = [ {"role": "system", "content": "You are helpful."}, {"role": "user", "content": "hi"}, { "role": "assistant", "content": "Sure!", # reasoning_content is injected by the API-copy builder and # is NOT present in the canonical messages list "reasoning_content": "Let me think \xab step by step \xbb", }, ] found = _sanitize_messages_non_ascii(api_messages) assert found is True assert "\xab" not in api_messages[2]["reasoning_content"] assert "\xbb" not in api_messages[2]["reasoning_content"] def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self): """api_kwargs may contain non-ASCII in extra_body or other fields.""" api_kwargs = { "model": "glm-5.1", "messages": [{"role": "user", "content": "ok"}], "extra_body": { "system": "Think carefully \u2192 answer", }, } found = _sanitize_structure_non_ascii(api_kwargs) assert found is True assert "\u2192" not in api_kwargs["extra_body"]["system"] def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self): """Even when canonical messages are clean, api_messages may be dirty.""" messages = [{"role": "user", "content": "hello"}] api_messages = [ {"role": "user", "content": "hello"}, { "role": "assistant", "content": "ok", "reasoning_content": "step \xab done", }, ] # messages sanitize returns False (nothing to clean) assert _sanitize_messages_non_ascii(messages) is False # api_messages sanitize must catch the dirty reasoning_content assert _sanitize_messages_non_ascii(api_messages) is True assert "\xab" not in api_messages[1]["reasoning_content"] def test_reasoning_field_in_canonical_messages_is_sanitized(self): """The canonical messages list stores reasoning as 'reasoning', not 'reasoning_content'. The extra-fields loop must catch it.""" messages = [ {"role": "user", "content": "hello"}, { "role": "assistant", "content": "ok", "reasoning": "Let me think \xab carefully \xbb", }, ] assert _sanitize_messages_non_ascii(messages) is True assert "\xab" not in messages[1]["reasoning"] assert "\xbb" not in messages[1]["reasoning"]