fix: extend ASCII-locale UnicodeEncodeError recovery to full request payload

The existing ASCII codec handler only sanitized conversation messages,
leaving tool schemas, system prompts, ephemeral prompts, prefill messages,
and HTTP headers as unhandled sources of non-ASCII content. On systems
with LANG=C or another non-UTF-8 locale, Unicode symbols in tool
descriptions (e.g. arrows and em-dashes from prompt_builder) and in
system prompt content raised a UnicodeEncodeError that fell through to
the error path.

Changes:
- Add _sanitize_structure_non_ascii() generic recursive walker for
  nested dict/list payloads
- Add _sanitize_tools_non_ascii() thin wrapper for tool schemas
- Add _force_ascii_payload flag: once ASCII locale is detected, all
  subsequent API calls get proactively sanitized (prevents recurring
  failures from new tool results bringing fresh Unicode each turn)
- Extend the ASCII codec error handler to sanitize: prefill_messages,
  tool schemas (self.tools), system prompt, ephemeral system prompt,
  and default HTTP headers
- Update stale comment that acknowledged the gap

Cherry-picked from PR #8834 (credential pool changes dropped as
separate concern).
This commit is contained in:
kimsr96 2026-04-13 05:15:48 -07:00 committed by Teknium
parent 28a9c43f81
commit b909a9efef
2 changed files with 145 additions and 5 deletions

View file

@ -9,6 +9,8 @@ import pytest
from run_agent import (
_strip_non_ascii,
_sanitize_messages_non_ascii,
_sanitize_structure_non_ascii,
_sanitize_tools_non_ascii,
_sanitize_messages_surrogates,
)
@ -138,3 +140,66 @@ class TestSurrogateVsAsciiSanitization:
"""When no surrogates present, _sanitize_messages_surrogates returns False."""
messages = [{"role": "user", "content": "hello ⚕ world"}]
assert _sanitize_messages_surrogates(messages) is False
class TestSanitizeToolsNonAscii:
    """Exercise _sanitize_tools_non_ascii on function-style tool schemas."""

    @staticmethod
    def _build_tools(description, path_description):
        """Return a one-element tool list describing a ``read_file`` function.

        Args:
            description: Text placed in the function-level ``description``.
            path_description: Text placed in the ``path`` parameter's
                ``description``.
        """
        path_schema = {
            "type": "string",
            "description": path_description,
        }
        parameters = {
            "type": "object",
            "properties": {"path": path_schema},
        }
        function = {
            "name": "read_file",
            "description": description,
            "parameters": parameters,
        }
        return [{"type": "function", "function": function}]

    def test_sanitizes_tool_description_and_parameter_descriptions(self):
        """Non-ASCII symbols are removed at both nesting levels of the schema."""
        tools = self._build_tools(
            "Print structured output │ with emoji 🤖",
            "File path │ with unicode",
        )

        changed = _sanitize_tools_non_ascii(tools)
        assert changed is True

        # NOTE(review): the expected strings keep a single space where the
        # symbol was removed -- confirm against the sanitizer's whitespace
        # handling.
        function = tools[0]["function"]
        assert function["description"] == "Print structured output with emoji "
        path_schema = function["parameters"]["properties"]["path"]
        assert path_schema["description"] == "File path with unicode"

    def test_no_change_for_ascii_only_tools(self):
        """A schema that is already pure ASCII is reported as unchanged."""
        tools = self._build_tools("Read file content", "File path")

        changed = _sanitize_tools_non_ascii(tools)
        assert changed is False
class TestSanitizeStructureNonAscii:
    """Exercise _sanitize_structure_non_ascii on a nested dict payload."""

    def test_sanitizes_nested_dict_structure(self):
        """String values one level down are sanitized and a change is reported."""
        headers = {
            "X-Title": "Hermes │ Agent",
            "User-Agent": "Hermes/1.0 🤖",
        }
        payload = {"default_headers": headers}

        changed = _sanitize_structure_non_ascii(payload)
        assert changed is True

        # Re-read through the payload so the check reflects whatever the
        # call left under the "default_headers" key.
        sanitized = payload["default_headers"]
        assert sanitized["X-Title"] == "Hermes Agent"
        assert sanitized["User-Agent"] == "Hermes/1.0 "