fix: sanitize api_messages and extra string fields during ASCII-codec recovery (#6843)

The ASCII-locale recovery path in run_agent.py sanitized the canonical 'messages' list but left 'api_messages' untouched. api_messages is a separate API-copy built before the retry loop and may carry extra fields (reasoning_content, extra_body entries) that are not present in 'messages'. As a result, the retry could still raise UnicodeEncodeError even after the 'System encoding is ASCII — stripped...' log line appeared.

Two changes:
- _sanitize_messages_non_ascii now walks all extra top-level string fields in each message dict (any key not in {content, name, tool_calls, role}), so reasoning_content and future extras are cleaned in both 'messages' and 'api_messages'.
- The ASCII-codec recovery block now also sanitizes api_messages and api_kwargs, so no non-ASCII text survives into the next retry attempt.

Adds regression tests covering:
- reasoning_content with non-ASCII in api_messages
- extra_body with non-ASCII in api_kwargs
- canonical messages clean but api_messages dirty

Fixes #6843
2026-04-25 00:51:20 +00:00 · 2026-04-14 17:14:52 +00:00 · 2026-04-14 17:14:52 +00:00 · efd1ddc6e1
commit efd1ddc6e1
parent d4eba82a37
2 changed files with 85 additions and 3 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -457,6 +457,15 @@ def _sanitize_messages_non_ascii(messages: list) -> bool:
                            if sanitized != fn_args:
                                fn["arguments"] = sanitized
                                found = True
        # Sanitize any additional top-level string fields (e.g. reasoning_content)
        for key, value in msg.items():
            if key in {"content", "name", "tool_calls", "role"}:
                continue
            if isinstance(value, str):
                sanitized = _strip_non_ascii(value)
                if sanitized != value:
                    msg[key] = sanitized
                    found = True
    return found
@ -9107,7 +9116,19 @@ class AIAgent:
                            # ASCII codec: the system encoding can't handle
                            # non-ASCII characters at all. Sanitize all
                            # non-ASCII content from messages/tool schemas and retry.
                            # Sanitize both the canonical `messages` list and
                            # `api_messages` (the API-copy built before the retry
                            # loop, which may contain extra fields like
                            # reasoning_content that are not in `messages`).
                            _messages_sanitized = _sanitize_messages_non_ascii(messages)
                            if isinstance(api_messages, list):
                                _sanitize_messages_non_ascii(api_messages)
                            # Also sanitize the last api_kwargs if already built,
                            # so a leftover non-ASCII value in a transformed field
                            # (e.g. extra_body, reasoning_content) doesn't survive
                            # into the next attempt via _build_api_kwargs cache paths.
                            if isinstance(api_kwargs, dict):
                                _sanitize_structure_non_ascii(api_kwargs)
                            _prefill_sanitized = False
                            if isinstance(getattr(self, "prefill_messages", None), list):
                                _prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages)
--- a/tests/run_agent/test_unicode_ascii_codec.py
+++ b/tests/run_agent/test_unicode_ascii_codec.py
@ -268,9 +268,9 @@ class TestApiKeyClientSync:
            agent.client.api_key = _clean_key
        # All three locations should now hold the clean key
-        assert agent.api_key == "sk-proj-abcdef"
+        assert agent.api_key == "***"
-        assert agent._client_kwargs["api_key"] == "sk-proj-abcdef"
+        assert agent._client_kwargs["api_key"] == "***"
-        assert agent.client.api_key == "sk-proj-abcdef"
+        assert agent.client.api_key == "***"
        # The bad char should be gone from all of them
        assert "\u028b" not in agent.api_key
        assert "\u028b" not in agent._client_kwargs["api_key"]
@ -294,3 +294,64 @@ class TestApiKeyClientSync:
        assert agent.api_key == "sk-proj-"
        assert agent.client is None  # should not have been touched
 class TestApiMessagesAndApiKwargsSanitized:
    """Guard the #6843 follow-up: ASCII-codec recovery must clean every payload.
    Sanitizing only the canonical `messages` list is not enough.  The
    API-copy (`api_messages`) and the built `api_kwargs` can hold strings
    such as reasoning_content or extra_body values that never appear in
    `messages`; any non-ASCII left in them lets the retry raise
    UnicodeEncodeError again, even after the
    'System encoding is ASCII — stripped...' log line was emitted.
    """
    def test_api_messages_with_reasoning_content_is_sanitized(self):
        """A non-ASCII reasoning_content on an API-copy message is stripped."""
        assistant_turn = {
            "role": "assistant",
            "content": "Sure!",
            # This field exists only on the API-copy; the canonical
            # messages list does not carry it.
            "reasoning_content": "Let me think \xab step by step \xbb",
        }
        api_copy = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "hi"},
            assistant_turn,
        ]
        changed = _sanitize_messages_non_ascii(api_copy)
        assert changed is True
        # Read back through the list so the check covers what a caller sees.
        cleaned_reasoning = api_copy[2]["reasoning_content"]
        for guillemet in ("\xab", "\xbb"):
            assert guillemet not in cleaned_reasoning
    def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
        """Strings nested inside api_kwargs (here extra_body) are stripped."""
        extra_body = {"system": "Think carefully \u2192 answer"}
        request_kwargs = {
            "model": "glm-5.1",
            "messages": [{"role": "user", "content": "ok"}],
            "extra_body": extra_body,
        }
        changed = _sanitize_structure_non_ascii(request_kwargs)
        assert changed is True
        assert "\u2192" not in request_kwargs["extra_body"]["system"]
    def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
        """A clean canonical list does not imply a clean API-copy."""
        canonical = [{"role": "user", "content": "hello"}]
        dirty_assistant = {
            "role": "assistant",
            "content": "ok",
            "reasoning_content": "step \xab done",
        }
        api_copy = [{"role": "user", "content": "hello"}, dirty_assistant]
        # Nothing to strip from the canonical list, so it reports no change.
        canonical_changed = _sanitize_messages_non_ascii(canonical)
        assert canonical_changed is False
        # The API-copy must still be detected as dirty and cleaned.
        api_copy_changed = _sanitize_messages_non_ascii(api_copy)
        assert api_copy_changed is True
        assert "\xab" not in api_copy[1]["reasoning_content"]