fix: sanitize api_messages and extra string fields during ASCII-codec recovery (#6843)

The ASCII-locale recovery path in run_agent.py sanitized the canonical
'messages' list but left 'api_messages' untouched. api_messages is a
separate API-copy built before the retry loop and may carry extra fields
(reasoning_content, extra_body entries) that are not present in
'messages'. This caused the retry to still raise UnicodeEncodeError even
after the 'System encoding is ASCII — stripped...' log line appeared.

Two changes:
- _sanitize_messages_non_ascii now walks all extra top-level string fields
  in each message dict (any key not in {content, name, tool_calls, role})
  so reasoning_content and future extras are cleaned in both 'messages'
  and 'api_messages'.
- The ASCII-codec recovery block now also calls _sanitize_messages_non_ascii
  on api_messages and _sanitize_structure_non_ascii on api_kwargs, so no
  non-ASCII text survives into the next retry attempt.

Adds regression tests covering:
- reasoning_content with non-ASCII in api_messages
- extra_body with non-ASCII in api_kwargs
- canonical messages clean but api_messages dirty

Fixes #6843
This commit is contained in:
MestreY0d4-Uninter 2026-04-14 17:14:52 +00:00 committed by Teknium
parent d4eba82a37
commit efd1ddc6e1
2 changed files with 85 additions and 3 deletions

View file

@ -457,6 +457,15 @@ def _sanitize_messages_non_ascii(messages: list) -> bool:
if sanitized != fn_args: if sanitized != fn_args:
fn["arguments"] = sanitized fn["arguments"] = sanitized
found = True found = True
# Sanitize any additional top-level string fields (e.g. reasoning_content)
for key, value in msg.items():
if key in {"content", "name", "tool_calls", "role"}:
continue
if isinstance(value, str):
sanitized = _strip_non_ascii(value)
if sanitized != value:
msg[key] = sanitized
found = True
return found return found
@ -9107,7 +9116,19 @@ class AIAgent:
# ASCII codec: the system encoding can't handle # ASCII codec: the system encoding can't handle
# non-ASCII characters at all. Sanitize all # non-ASCII characters at all. Sanitize all
# non-ASCII content from messages/tool schemas and retry. # non-ASCII content from messages/tool schemas and retry.
# Sanitize both the canonical `messages` list and
# `api_messages` (the API-copy built before the retry
# loop, which may contain extra fields like
# reasoning_content that are not in `messages`).
_messages_sanitized = _sanitize_messages_non_ascii(messages) _messages_sanitized = _sanitize_messages_non_ascii(messages)
if isinstance(api_messages, list):
_sanitize_messages_non_ascii(api_messages)
# Also sanitize the last api_kwargs if already built,
# so a leftover non-ASCII value in a transformed field
# (e.g. extra_body, reasoning_content) doesn't survive
# into the next attempt via _build_api_kwargs cache paths.
if isinstance(api_kwargs, dict):
_sanitize_structure_non_ascii(api_kwargs)
_prefill_sanitized = False _prefill_sanitized = False
if isinstance(getattr(self, "prefill_messages", None), list): if isinstance(getattr(self, "prefill_messages", None), list):
_prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages) _prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages)

View file

@ -268,9 +268,9 @@ class TestApiKeyClientSync:
agent.client.api_key = _clean_key agent.client.api_key = _clean_key
# All three locations should now hold the clean key # All three locations should now hold the clean key
assert agent.api_key == "sk-proj-abcdef" assert agent.api_key == "***"
assert agent._client_kwargs["api_key"] == "sk-proj-abcdef" assert agent._client_kwargs["api_key"] == "***"
assert agent.client.api_key == "sk-proj-abcdef" assert agent.client.api_key == "***"
# The bad char should be gone from all of them # The bad char should be gone from all of them
assert "\u028b" not in agent.api_key assert "\u028b" not in agent.api_key
assert "\u028b" not in agent._client_kwargs["api_key"] assert "\u028b" not in agent._client_kwargs["api_key"]
@ -294,3 +294,64 @@ class TestApiKeyClientSync:
assert agent.api_key == "sk-proj-" assert agent.api_key == "sk-proj-"
assert agent.client is None # should not have been touched assert agent.client is None # should not have been touched
class TestApiMessagesAndApiKwargsSanitized:
    """Regression coverage for the #6843 follow-up.

    During ASCII-codec recovery the API-side copies (`api_messages` and
    `api_kwargs`) have to be cleaned together with the canonical `messages`
    list. The API copy can hold fields such as reasoning_content or
    extra_body that `messages` does not have; the tests below feed such
    payloads to the sanitizers and check that the non-ASCII characters are
    gone afterwards.
    """

    def test_api_messages_with_reasoning_content_is_sanitized(self):
        """Non-ASCII text in a message's reasoning_content is stripped."""
        assistant_turn = {
            "role": "assistant",
            "content": "Sure!",
            # Extra field carried by the API copy only; the canonical
            # messages list has no reasoning_content entry.
            "reasoning_content": "Let me think \xab step by step \xbb",
        }
        payload = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "hi"},
            assistant_turn,
        ]

        changed = _sanitize_messages_non_ascii(payload)

        assert changed is True
        # Read the field back through the list, exactly as a caller would.
        assert "\xab" not in payload[2]["reasoning_content"]
        assert "\xbb" not in payload[2]["reasoning_content"]

    def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
        """Non-ASCII text nested under extra_body in api_kwargs is stripped."""
        request_kwargs = {
            "model": "glm-5.1",
            "messages": [{"role": "user", "content": "ok"}],
            "extra_body": {
                "system": "Think carefully \u2192 answer",
            },
        }

        changed = _sanitize_structure_non_ascii(request_kwargs)

        assert changed is True
        assert "\u2192" not in request_kwargs["extra_body"]["system"]

    def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
        """A clean canonical list does not imply a clean API copy."""
        canonical = [{"role": "user", "content": "hello"}]
        api_copy = [
            {"role": "user", "content": "hello"},
            {
                "role": "assistant",
                "content": "ok",
                "reasoning_content": "step \xab done",
            },
        ]

        # Nothing to strip from the canonical list.
        canonical_changed = _sanitize_messages_non_ascii(canonical)
        assert canonical_changed is False

        # The API copy carries the dirty reasoning_content and must be fixed.
        api_copy_changed = _sanitize_messages_non_ascii(api_copy)
        assert api_copy_changed is True
        assert "\xab" not in api_copy[1]["reasoning_content"]