mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: extend ASCII-locale UnicodeEncodeError recovery to full request payload
The existing ASCII codec handler only sanitized conversation messages, leaving tool schemas, system prompts, ephemeral prompts, prefill messages, and HTTP headers as unhandled sources of non-ASCII content. On systems with LANG=C or another non-UTF-8 locale, Unicode symbols in tool descriptions (e.g. arrows, em-dashes from prompt_builder) and system prompt content would cause a UnicodeEncodeError that fell through to the error path.

Changes:
- Add _sanitize_structure_non_ascii(), a generic recursive walker for nested dict/list payloads
- Add _sanitize_tools_non_ascii(), a thin wrapper for tool schemas
- Add the _force_ascii_payload flag: once an ASCII locale is detected, all subsequent API calls get proactively sanitized (prevents recurring failures from new tool results bringing fresh Unicode each turn)
- Extend the ASCII codec error handler to sanitize: prefill_messages, tool schemas (self.tools), system prompt, ephemeral system prompt, and default HTTP headers
- Update the stale comment that acknowledged the gap

Cherry-picked from PR #8834 (credential pool changes dropped as a separate concern).
This commit is contained in:
parent
28a9c43f81
commit
b909a9efef
2 changed files with 145 additions and 5 deletions
85
run_agent.py
85
run_agent.py
|
|
@ -460,6 +460,40 @@ def _sanitize_messages_non_ascii(messages: list) -> bool:
|
||||||
return found
|
return found
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_tools_non_ascii(tools: list) -> bool:
    """Remove non-ASCII characters from tool schemas, modifying them in place.

    Thin wrapper that hands the tool list to
    ``_sanitize_structure_non_ascii`` and passes its result through.

    Args:
        tools: List of tool schema payloads to clean.

    Returns:
        Whatever ``_sanitize_structure_non_ascii`` reports for ``tools``.
    """
    changed = _sanitize_structure_non_ascii(tools)
    return changed
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_structure_non_ascii(payload: Any) -> bool:
    """Remove non-ASCII characters from every string value in a nested payload.

    Dicts and lists are traversed recursively and modified in place: each
    string found as a dict value or list item is replaced by the result of
    ``_strip_non_ascii`` when that result differs. Dict keys and values of
    any other type are left untouched, and a payload that is neither a dict
    nor a list is not modified at all.

    Args:
        payload: The (possibly nested) dict/list structure to clean.

    Returns:
        True if at least one string was replaced, otherwise False.
    """

    def _clean(container) -> bool:
        # Dicts and lists share the same logic once we have (slot, item)
        # pairs, where ``slot`` is whatever indexes back into the container.
        if isinstance(container, dict):
            entries = container.items()
        elif isinstance(container, list):
            entries = enumerate(container)
        else:
            return False

        changed = False
        for slot, item in entries:
            if isinstance(item, str):
                stripped = _strip_non_ascii(item)
                if stripped != item:
                    # Overwriting an existing slot does not resize the
                    # container, so this is safe while iterating.
                    container[slot] = stripped
                    changed = True
            elif isinstance(item, (dict, list)):
                if _clean(item):
                    changed = True
        return changed

    return _clean(payload)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -737,6 +771,7 @@ class AIAgent:
|
||||||
self.service_tier = service_tier
|
self.service_tier = service_tier
|
||||||
self.request_overrides = dict(request_overrides or {})
|
self.request_overrides = dict(request_overrides or {})
|
||||||
self.prefill_messages = prefill_messages or [] # Prefilled conversation turns
|
self.prefill_messages = prefill_messages or [] # Prefilled conversation turns
|
||||||
|
self._force_ascii_payload = False
|
||||||
|
|
||||||
# Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
|
# Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
|
||||||
# Reduces input costs by ~75% on multi-turn conversations by caching the
|
# Reduces input costs by ~75% on multi-turn conversations by caching the
|
||||||
|
|
@ -8189,6 +8224,8 @@ class AIAgent:
|
||||||
try:
|
try:
|
||||||
self._reset_stream_delivery_tracking()
|
self._reset_stream_delivery_tracking()
|
||||||
api_kwargs = self._build_api_kwargs(api_messages)
|
api_kwargs = self._build_api_kwargs(api_messages)
|
||||||
|
if self._force_ascii_payload:
|
||||||
|
_sanitize_structure_non_ascii(api_kwargs)
|
||||||
if self.api_mode == "codex_responses":
|
if self.api_mode == "codex_responses":
|
||||||
api_kwargs = self._preflight_codex_api_kwargs(api_kwargs, allow_stream=False)
|
api_kwargs = self._preflight_codex_api_kwargs(api_kwargs, allow_stream=False)
|
||||||
|
|
||||||
|
|
@ -8819,18 +8856,56 @@ class AIAgent:
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
if _is_ascii_codec:
|
if _is_ascii_codec:
|
||||||
|
self._force_ascii_payload = True
|
||||||
# ASCII codec: the system encoding can't handle
|
# ASCII codec: the system encoding can't handle
|
||||||
# non-ASCII characters at all. Sanitize all
|
# non-ASCII characters at all. Sanitize all
|
||||||
# non-ASCII content from messages and retry.
|
# non-ASCII content from messages/tool schemas and retry.
|
||||||
if _sanitize_messages_non_ascii(messages):
|
_messages_sanitized = _sanitize_messages_non_ascii(messages)
|
||||||
|
_prefill_sanitized = False
|
||||||
|
if isinstance(getattr(self, "prefill_messages", None), list):
|
||||||
|
_prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages)
|
||||||
|
|
||||||
|
_tools_sanitized = False
|
||||||
|
if isinstance(getattr(self, "tools", None), list):
|
||||||
|
_tools_sanitized = _sanitize_tools_non_ascii(self.tools)
|
||||||
|
|
||||||
|
_system_sanitized = False
|
||||||
|
if isinstance(active_system_prompt, str):
|
||||||
|
_sanitized_system = _strip_non_ascii(active_system_prompt)
|
||||||
|
if _sanitized_system != active_system_prompt:
|
||||||
|
active_system_prompt = _sanitized_system
|
||||||
|
self._cached_system_prompt = _sanitized_system
|
||||||
|
_system_sanitized = True
|
||||||
|
if isinstance(getattr(self, "ephemeral_system_prompt", None), str):
|
||||||
|
_sanitized_ephemeral = _strip_non_ascii(self.ephemeral_system_prompt)
|
||||||
|
if _sanitized_ephemeral != self.ephemeral_system_prompt:
|
||||||
|
self.ephemeral_system_prompt = _sanitized_ephemeral
|
||||||
|
_system_sanitized = True
|
||||||
|
|
||||||
|
_headers_sanitized = False
|
||||||
|
_default_headers = (
|
||||||
|
self._client_kwargs.get("default_headers")
|
||||||
|
if isinstance(getattr(self, "_client_kwargs", None), dict)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
if isinstance(_default_headers, dict):
|
||||||
|
_headers_sanitized = _sanitize_structure_non_ascii(_default_headers)
|
||||||
|
|
||||||
|
if (
|
||||||
|
_messages_sanitized
|
||||||
|
or _prefill_sanitized
|
||||||
|
or _tools_sanitized
|
||||||
|
or _system_sanitized
|
||||||
|
or _headers_sanitized
|
||||||
|
):
|
||||||
self._unicode_sanitization_passes += 1
|
self._unicode_sanitization_passes += 1
|
||||||
self._vprint(
|
self._vprint(
|
||||||
f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from messages. Retrying...",
|
f"{self.log_prefix}⚠️ System encoding is ASCII — stripped non-ASCII characters from request payload. Retrying...",
|
||||||
force=True,
|
force=True,
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
# Nothing to sanitize in messages — might be in system
|
# Nothing to sanitize in any payload component.
|
||||||
# prompt or prefill. Fall through to normal error path.
|
# Fall through to normal error path.
|
||||||
|
|
||||||
status_code = getattr(api_error, "status_code", None)
|
status_code = getattr(api_error, "status_code", None)
|
||||||
error_context = self._extract_api_error_context(api_error)
|
error_context = self._extract_api_error_context(api_error)
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,8 @@ import pytest
|
||||||
from run_agent import (
|
from run_agent import (
|
||||||
_strip_non_ascii,
|
_strip_non_ascii,
|
||||||
_sanitize_messages_non_ascii,
|
_sanitize_messages_non_ascii,
|
||||||
|
_sanitize_structure_non_ascii,
|
||||||
|
_sanitize_tools_non_ascii,
|
||||||
_sanitize_messages_surrogates,
|
_sanitize_messages_surrogates,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -138,3 +140,66 @@ class TestSurrogateVsAsciiSanitization:
|
||||||
"""When no surrogates present, _sanitize_messages_surrogates returns False."""
|
"""When no surrogates present, _sanitize_messages_surrogates returns False."""
|
||||||
messages = [{"role": "user", "content": "hello ⚕ world"}]
|
messages = [{"role": "user", "content": "hello ⚕ world"}]
|
||||||
assert _sanitize_messages_surrogates(messages) is False
|
assert _sanitize_messages_surrogates(messages) is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestSanitizeToolsNonAscii:
    """Checks for _sanitize_tools_non_ascii on function-tool schema dicts."""

    @staticmethod
    def _read_file_tools(description, path_description):
        """Build a one-element tool list using the given description strings."""
        path_schema = {
            "type": "string",
            "description": path_description,
        }
        function_schema = {
            "name": "read_file",
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {"path": path_schema},
            },
        }
        return [{"type": "function", "function": function_schema}]

    def test_sanitizes_tool_description_and_parameter_descriptions(self):
        tools = self._read_file_tools(
            "Print structured output │ with emoji 🤖",
            "File path │ with unicode",
        )

        # The return value is checked first, then the in-place mutations.
        assert _sanitize_tools_non_ascii(tools) is True

        # NOTE(review): these expected strings are copied from a rendered
        # diff that may have collapsed repeated spaces - confirm against
        # the repository before relying on the exact whitespace.
        cleaned = tools[0]["function"]
        assert cleaned["description"] == "Print structured output with emoji "
        assert cleaned["parameters"]["properties"]["path"]["description"] == "File path with unicode"

    def test_no_change_for_ascii_only_tools(self):
        tools = self._read_file_tools("Read file content", "File path")

        assert _sanitize_tools_non_ascii(tools) is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestSanitizeStructureNonAscii:
    """Checks for _sanitize_structure_non_ascii on nested dict payloads."""

    def test_sanitizes_nested_dict_structure(self):
        headers = {
            "X-Title": "Hermes │ Agent",
            "User-Agent": "Hermes/1.0 🤖",
        }
        payload = {"default_headers": headers}

        # The walker must report a change and rewrite the nested values.
        assert _sanitize_structure_non_ascii(payload) is True

        # NOTE(review): these expected strings are copied from a rendered
        # diff that may have collapsed repeated spaces - confirm against
        # the repository before relying on the exact whitespace.
        cleaned = payload["default_headers"]
        assert cleaned["X-Title"] == "Hermes Agent"
        assert cleaned["User-Agent"] == "Hermes/1.0 "
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue