From 93b6f4522479a7c92ef8dc6a75d71c8c83b7e7f1 Mon Sep 17 00:00:00 2001
From: Teknium <teknium1@gmail.com>
Date: Wed, 15 Apr 2026 14:56:55 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20always=20retry=20on=20ASCII=20codec=20Un?=
 =?UTF-8?q?icodeEncodeError=20=E2=80=94=20don't=20gate=20on=20per-componen?=
 =?UTF-8?q?t=20sanitization?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The recovery block previously retried (continue) only when one of the
per-component sanitization checks (messages, prefill, tools, system
prompt, headers, credentials) found and stripped non-ASCII content.
When the non-ASCII content lived only in api_messages'
reasoning_content field (which is built from messages['reasoning'] and
was not checked by the original _sanitize_messages_non_ascii), every
check returned False and the recovery fell through to the normal error
path — burning a retry attempt even though _force_ascii_payload had
been set.

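Now the recovery always continues (retries) when _is_ascii_codec is
detected.  The _force_ascii_payload flag guarantees the next iteration
runs _sanitize_structure_non_ascii(api_kwargs) on the full API payload,
catching any remaining non-ASCII regardless of where it lives.
_unicode_sanitization_passes is now incremented on every such recovery
pass, not only when a per-component check stripped something, so the
unconditional retry stays bounded by _unicode_sanitization_passes < 2.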

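Also adds a test for the 'reasoning' field on canonical messages, and
updates the TestApiKeyClientSync assertions to expect the clean key
'sk-proj-abcdef' instead of '***'.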

Fixes #6843
---
 run_agent.py                                | 24 +++++++++++++++------
 tests/run_agent/test_unicode_ascii_codec.py | 21 +++++++++++++++---
 2 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/run_agent.py b/run_agent.py
index a181b11a4..b01107814 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -9186,22 +9186,34 @@ class AIAgent:
                                         force=True,
                                     )
 
-                            if (
+                            # Always retry on ASCII codec detection —
+                            # _force_ascii_payload guarantees the full
+                            # api_kwargs payload is sanitized on the
+                            # next iteration (line ~8475).  Even when
+                            # per-component checks above find nothing
+                            # (e.g. non-ASCII only in api_messages'
+                            # reasoning_content), the flag catches it.
+                            # Bounded by _unicode_sanitization_passes < 2.
+                            self._unicode_sanitization_passes += 1
+                            _any_sanitized = (
                                 _messages_sanitized
                                 or _prefill_sanitized
                                 or _tools_sanitized
                                 or _system_sanitized
                                 or _headers_sanitized
                                 or _credential_sanitized
-                            ):
-                                self._unicode_sanitization_passes += 1
+                            )
+                            if _any_sanitized:
                                 self._vprint(
                                     f"{self.log_prefix}⚠️  System encoding is ASCII — stripped non-ASCII characters from request payload. Retrying...",
                                     force=True,
                                 )
-                                continue
-                        # Nothing to sanitize in any payload component.
-                        # Fall through to normal error path.
+                            else:
+                                self._vprint(
+                                    f"{self.log_prefix}⚠️  System encoding is ASCII — enabling full-payload sanitization for retry...",
+                                    force=True,
+                                )
+                            continue
 
                     status_code = getattr(api_error, "status_code", None)
                     error_context = self._extract_api_error_context(api_error)
diff --git a/tests/run_agent/test_unicode_ascii_codec.py b/tests/run_agent/test_unicode_ascii_codec.py
index 714429b30..04b5e4043 100644
--- a/tests/run_agent/test_unicode_ascii_codec.py
+++ b/tests/run_agent/test_unicode_ascii_codec.py
@@ -268,9 +268,9 @@ class TestApiKeyClientSync:
             agent.client.api_key = _clean_key
 
         # All three locations should now hold the clean key
-        assert agent.api_key == "***"
-        assert agent._client_kwargs["api_key"] == "***"
-        assert agent.client.api_key == "***"
+        assert agent.api_key == "sk-proj-abcdef"
+        assert agent._client_kwargs["api_key"] == "sk-proj-abcdef"
+        assert agent.client.api_key == "sk-proj-abcdef"
         # The bad char should be gone from all of them
         assert "\u028b" not in agent.api_key
         assert "\u028b" not in agent._client_kwargs["api_key"]
@@ -355,3 +355,18 @@ class TestApiMessagesAndApiKwargsSanitized:
         # api_messages sanitize must catch the dirty reasoning_content
         assert _sanitize_messages_non_ascii(api_messages) is True
         assert "\xab" not in api_messages[1]["reasoning_content"]
+
+    def test_reasoning_field_in_canonical_messages_is_sanitized(self):
+        """The canonical messages list stores reasoning as 'reasoning', not
+        'reasoning_content'.  The extra-fields loop must catch it."""
+        messages = [
+            {"role": "user", "content": "hello"},
+            {
+                "role": "assistant",
+                "content": "ok",
+                "reasoning": "Let me think \xab carefully \xbb",
+            },
+        ]
+        assert _sanitize_messages_non_ascii(messages) is True
+        assert "\xab" not in messages[1]["reasoning"]
+        assert "\xbb" not in messages[1]["reasoning"]