fix(unicode): sanitize surrogate metadata and allow two-pass retry

2026-04-25 00:51:20 +00:00 · 2026-04-10 12:54:57 +00:00 · 2026-04-10 12:54:57 +00:00 · 2c99b4e79b
commit 2c99b4e79b
parent 71036a7a75
2 changed files with 55 additions and 8 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -359,8 +359,9 @@ def _sanitize_surrogates(text: str) -> str:
 def _sanitize_messages_surrogates(messages: list) -> bool:
    """Sanitize surrogate characters from all string content in a messages list.
-    Walks message dicts in-place.  Returns True if any surrogates were found
+    Walks message dicts in-place. Returns True if any surrogates were found
-    and replaced, False otherwise.
+    and replaced, False otherwise. Covers content/text, name, and tool call
    metadata/arguments so retries don't fail on a non-content field.
    """
    found = False
    for msg in messages:
@ -377,6 +378,29 @@ def _sanitize_messages_surrogates(messages: list) -> bool:
                    if isinstance(text, str) and _SURROGATE_RE.search(text):
                        part["text"] = _SURROGATE_RE.sub('\ufffd', text)
                        found = True
        name = msg.get("name")
        if isinstance(name, str) and _SURROGATE_RE.search(name):
            msg["name"] = _SURROGATE_RE.sub('\ufffd', name)
            found = True
        tool_calls = msg.get("tool_calls")
        if isinstance(tool_calls, list):
            for tc in tool_calls:
                if not isinstance(tc, dict):
                    continue
                tc_id = tc.get("id")
                if isinstance(tc_id, str) and _SURROGATE_RE.search(tc_id):
                    tc["id"] = _SURROGATE_RE.sub('\ufffd', tc_id)
                    found = True
                fn = tc.get("function")
                if isinstance(fn, dict):
                    fn_name = fn.get("name")
                    if isinstance(fn_name, str) and _SURROGATE_RE.search(fn_name):
                        fn["name"] = _SURROGATE_RE.sub('\ufffd', fn_name)
                        found = True
                    fn_args = fn.get("arguments")
                    if isinstance(fn_args, str) and _SURROGATE_RE.search(fn_args):
                        fn["arguments"] = _SURROGATE_RE.sub('\ufffd', fn_args)
                        found = True
    return found
@ -7242,7 +7266,7 @@ class AIAgent:
        self._thinking_prefill_retries = 0
        self._last_content_with_tools = None
        self._mute_post_response = False
-        self._unicode_sanitized = False
+        self._unicode_sanitization_passes = 0
        # Pre-turn connection health check: detect and clean up dead TCP
        # connections left over from provider outages or dropped streams.
@ -8233,14 +8257,16 @@ class AIAgent:
                    #   2. ASCII codec on systems with LANG=C or non-UTF-8 locale
                    #      (e.g. Chromebooks) — any non-ASCII character fails.
                    #      Detect via the error message mentioning 'ascii' codec.
-                    # We sanitize messages in-place and retry once.
+                    # We sanitize messages in-place and may retry twice:
                    # first to strip surrogates, then once more for pure
                    # ASCII-only locale sanitization if needed.
                    # -----------------------------------------------------------
-                    if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_unicode_sanitized', False):
+                    if isinstance(api_error, UnicodeEncodeError) and getattr(self, '_unicode_sanitization_passes', 0) < 2:
                        self._unicode_sanitized = True
                        _err_str = str(api_error).lower()
                        _is_ascii_codec = "'ascii'" in _err_str or "ascii" in _err_str
                        _surrogates_found = _sanitize_messages_surrogates(messages)
                        if _surrogates_found:
                            self._unicode_sanitization_passes += 1
                            self._vprint(
                                f"{self.log_prefix}⚠️  Stripped invalid surrogate characters from messages. Retrying...",
                                force=True,
@ -8248,16 +8274,17 @@ class AIAgent:
                            continue
                        if _is_ascii_codec:
                            # ASCII codec: the system encoding can't handle
-                            # non-ASCII characters at all.  Sanitize all
+                            # non-ASCII characters at all. Sanitize all
                            # non-ASCII content from messages and retry.
                            if _sanitize_messages_non_ascii(messages):
                                self._unicode_sanitization_passes += 1
                                self._vprint(
                                    f"{self.log_prefix}⚠️  System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...",
                                    force=True,
                                )
                                continue
                        # Nothing to sanitize in messages — might be in system
-                        # prompt or prefill.  Fall through to normal error path.
+                        # prompt or prefill. Fall through to normal error path.
                    status_code = getattr(api_error, "status_code", None)
                    error_context = self._extract_api_error_context(api_error)
--- a/tests/run_agent/test_unicode_ascii_codec.py
+++ b/tests/run_agent/test_unicode_ascii_codec.py
@ -107,6 +107,26 @@ class TestSurrogateVsAsciiSanitization:
        assert "\ud800" not in messages[0]["content"]
        assert "\ufffd" in messages[0]["content"]
    def test_surrogates_in_name_and_tool_calls_are_sanitized(self):
        """Surrogates in ``name`` and tool-call metadata are replaced with U+FFFD."""
        messages = [{
            "role": "assistant",
            "name": "bad\ud800name",
            "content": None,
            "tool_calls": [{
                "id": "call_\ud800",
                "type": "function",
                "function": {
                    "name": "read\ud800_file",
                    "arguments": '{"path": "bad\ud800.txt"}'
                }
            }],
        }]
        assert _sanitize_messages_surrogates(messages) is True
        message = messages[0]
        tool_call = message["tool_calls"][0]
        function = tool_call["function"]
        # Pin the exact sanitized values instead of only checking that the
        # surrogate is gone: an implementation that emptied or truncated a
        # field would also satisfy a bare "not in" assertion.
        assert message["name"] == "bad\ufffdname"
        assert tool_call["id"] == "call_\ufffd"
        assert function["name"] == "read\ufffd_file"
        assert function["arguments"] == '{"path": "bad\ufffd.txt"}'
        # Fields without surrogates must be left exactly as they were.
        assert message["role"] == "assistant"
        assert message["content"] is None
        assert tool_call["type"] == "function"
    def test_ascii_codec_strips_all_non_ascii(self):
        """ASCII codec case: all non-ASCII is stripped, not replaced."""
        messages = [{"role": "user", "content": "test ⚕🤖你好 end"}]