diff --git a/run_agent.py b/run_agent.py
index 244fea6b2..a181b11a4 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -457,6 +457,15 @@ def _sanitize_messages_non_ascii(messages: list) -> bool:
                             if sanitized != fn_args:
                                 fn["arguments"] = sanitized
                                 found = True
+        # Sanitize any additional top-level string fields (e.g. reasoning_content)
+        for key, value in msg.items():
+            if key in {"content", "name", "tool_calls", "role"}:
+                continue
+            if isinstance(value, str):
+                sanitized = _strip_non_ascii(value)
+                if sanitized != value:
+                    msg[key] = sanitized
+                    found = True
     return found
 
 
@@ -9107,7 +9116,19 @@ class AIAgent:
                             # ASCII codec: the system encoding can't handle
                             # non-ASCII characters at all. Sanitize all
                             # non-ASCII content from messages/tool schemas and retry.
+                            # Sanitize both the canonical `messages` list and
+                            # `api_messages` (the API-copy built before the retry
+                            # loop, which may contain extra fields like
+                            # reasoning_content that are not in `messages`).
                             _messages_sanitized = _sanitize_messages_non_ascii(messages)
+                            if isinstance(api_messages, list):
+                                _sanitize_messages_non_ascii(api_messages)
+                            # Also sanitize the last api_kwargs if already built,
+                            # so a leftover non-ASCII value in a transformed field
+                            # (e.g. extra_body, reasoning_content) doesn't survive
+                            # into the next attempt via _build_api_kwargs cache paths.
+                            if isinstance(api_kwargs, dict):
+                                _sanitize_structure_non_ascii(api_kwargs)
                             _prefill_sanitized = False
                             if isinstance(getattr(self, "prefill_messages", None), list):
                                 _prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages)
diff --git a/tests/run_agent/test_unicode_ascii_codec.py b/tests/run_agent/test_unicode_ascii_codec.py
index a8a52c34a..714429b30 100644
--- a/tests/run_agent/test_unicode_ascii_codec.py
+++ b/tests/run_agent/test_unicode_ascii_codec.py
@@ -294,3 +294,64 @@ class TestApiKeyClientSync:
 
         assert agent.api_key == "sk-proj-"
         assert agent.client is None  # should not have been touched
+
+
+class TestApiMessagesAndApiKwargsSanitized:
+    """Regression tests for #6843 follow-up: api_messages and api_kwargs must
+    be sanitized alongside messages during ASCII-codec recovery.
+
+    The original fix only sanitized the canonical `messages` list.
+    api_messages is a separate API-copy built before the retry loop; it may
+    carry extra fields (reasoning_content, extra_body) with non-ASCII chars
+    that are not present in `messages`.  Without sanitizing api_messages and
+    api_kwargs, the retry still raises UnicodeEncodeError even after the
+    'System encoding is ASCII — stripped...' log line appears.
+    """
+
+    def test_api_messages_with_reasoning_content_is_sanitized(self):
+        """api_messages may contain reasoning_content not in messages."""
+        api_messages = [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "hi"},
+            {
+                "role": "assistant",
+                "content": "Sure!",
+                # reasoning_content is injected by the API-copy builder and
+                # is NOT present in the canonical messages list
+                "reasoning_content": "Let me think \xab step by step \xbb",
+            },
+        ]
+        found = _sanitize_messages_non_ascii(api_messages)
+        assert found is True
+        assert "\xab" not in api_messages[2]["reasoning_content"]
+        assert "\xbb" not in api_messages[2]["reasoning_content"]
+
+    def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
+        """api_kwargs may contain non-ASCII in extra_body or other fields."""
+        api_kwargs = {
+            "model": "glm-5.1",
+            "messages": [{"role": "user", "content": "ok"}],
+            "extra_body": {
+                "system": "Think carefully \u2192 answer",
+            },
+        }
+        found = _sanitize_structure_non_ascii(api_kwargs)
+        assert found is True
+        assert "\u2192" not in api_kwargs["extra_body"]["system"]
+
+    def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
+        """Even when canonical messages are clean, api_messages may be dirty."""
+        messages = [{"role": "user", "content": "hello"}]
+        api_messages = [
+            {"role": "user", "content": "hello"},
+            {
+                "role": "assistant",
+                "content": "ok",
+                "reasoning_content": "step \xab done",
+            },
+        ]
+        # messages sanitize returns False (nothing to clean)
+        assert _sanitize_messages_non_ascii(messages) is False
+        # api_messages sanitize must catch the dirty reasoning_content
+        assert _sanitize_messages_non_ascii(api_messages) is True
+        assert "\xab" not in api_messages[1]["reasoning_content"]