mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: sanitize api_messages and extra string fields during ASCII-codec recovery (#6843)
The ASCII-locale recovery path in run_agent.py sanitized the canonical
'messages' list but left 'api_messages' untouched. api_messages is a
separate API-copy built before the retry loop and may carry extra fields
(reasoning_content, extra_body entries) that are not present in
'messages'. This caused the retry to still raise UnicodeEncodeError even
after the 'System encoding is ASCII — stripped...' log line appeared.
Two changes:
- _sanitize_messages_non_ascii now walks all extra top-level string fields
in each message dict (any key not in {content, name, tool_calls, role})
so reasoning_content and future extras are cleaned in both 'messages'
and 'api_messages'.
- The ASCII-codec recovery block now also calls _sanitize_messages_non_ascii
on api_messages and _sanitize_structure_non_ascii on api_kwargs, so no
non-ASCII text survives into the next retry attempt.
Adds regression tests covering:
- reasoning_content with non-ASCII in api_messages
- extra_body with non-ASCII in api_kwargs
- canonical messages clean but api_messages dirty
Fixes #6843
This commit is contained in:
parent
d4eba82a37
commit
efd1ddc6e1
2 changed files with 85 additions and 3 deletions
21
run_agent.py
21
run_agent.py
|
|
@ -457,6 +457,15 @@ def _sanitize_messages_non_ascii(messages: list) -> bool:
|
||||||
if sanitized != fn_args:
|
if sanitized != fn_args:
|
||||||
fn["arguments"] = sanitized
|
fn["arguments"] = sanitized
|
||||||
found = True
|
found = True
|
||||||
|
# Sanitize any additional top-level string fields (e.g. reasoning_content)
|
||||||
|
for key, value in msg.items():
|
||||||
|
if key in {"content", "name", "tool_calls", "role"}:
|
||||||
|
continue
|
||||||
|
if isinstance(value, str):
|
||||||
|
sanitized = _strip_non_ascii(value)
|
||||||
|
if sanitized != value:
|
||||||
|
msg[key] = sanitized
|
||||||
|
found = True
|
||||||
return found
|
return found
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -9107,7 +9116,19 @@ class AIAgent:
|
||||||
# ASCII codec: the system encoding can't handle
|
# ASCII codec: the system encoding can't handle
|
||||||
# non-ASCII characters at all. Sanitize all
|
# non-ASCII characters at all. Sanitize all
|
||||||
# non-ASCII content from messages/tool schemas and retry.
|
# non-ASCII content from messages/tool schemas and retry.
|
||||||
|
# Sanitize both the canonical `messages` list and
|
||||||
|
# `api_messages` (the API-copy built before the retry
|
||||||
|
# loop, which may contain extra fields like
|
||||||
|
# reasoning_content that are not in `messages`).
|
||||||
_messages_sanitized = _sanitize_messages_non_ascii(messages)
|
_messages_sanitized = _sanitize_messages_non_ascii(messages)
|
||||||
|
if isinstance(api_messages, list):
|
||||||
|
_sanitize_messages_non_ascii(api_messages)
|
||||||
|
# Also sanitize the last api_kwargs if already built,
|
||||||
|
# so a leftover non-ASCII value in a transformed field
|
||||||
|
# (e.g. extra_body, reasoning_content) doesn't survive
|
||||||
|
# into the next attempt via _build_api_kwargs cache paths.
|
||||||
|
if isinstance(api_kwargs, dict):
|
||||||
|
_sanitize_structure_non_ascii(api_kwargs)
|
||||||
_prefill_sanitized = False
|
_prefill_sanitized = False
|
||||||
if isinstance(getattr(self, "prefill_messages", None), list):
|
if isinstance(getattr(self, "prefill_messages", None), list):
|
||||||
_prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages)
|
_prefill_sanitized = _sanitize_messages_non_ascii(self.prefill_messages)
|
||||||
|
|
|
||||||
|
|
@ -268,9 +268,9 @@ class TestApiKeyClientSync:
|
||||||
agent.client.api_key = _clean_key
|
agent.client.api_key = _clean_key
|
||||||
|
|
||||||
# All three locations should now hold the clean key
|
# All three locations should now hold the clean key
|
||||||
assert agent.api_key == "sk-proj-abcdef"
|
assert agent.api_key == "***"
|
||||||
assert agent._client_kwargs["api_key"] == "sk-proj-abcdef"
|
assert agent._client_kwargs["api_key"] == "***"
|
||||||
assert agent.client.api_key == "sk-proj-abcdef"
|
assert agent.client.api_key == "***"
|
||||||
# The bad char should be gone from all of them
|
# The bad char should be gone from all of them
|
||||||
assert "\u028b" not in agent.api_key
|
assert "\u028b" not in agent.api_key
|
||||||
assert "\u028b" not in agent._client_kwargs["api_key"]
|
assert "\u028b" not in agent._client_kwargs["api_key"]
|
||||||
|
|
@ -294,3 +294,64 @@ class TestApiKeyClientSync:
|
||||||
|
|
||||||
assert agent.api_key == "sk-proj-"
|
assert agent.api_key == "sk-proj-"
|
||||||
assert agent.client is None # should not have been touched
|
assert agent.client is None # should not have been touched
|
||||||
|
|
||||||
|
|
||||||
|
class TestApiMessagesAndApiKwargsSanitized:
    """Regression coverage for the #6843 follow-up.

    ASCII-codec recovery has to clean the API-side payloads as well as the
    canonical ``messages`` list.  ``api_messages`` is a separate copy that can
    hold fields such as ``reasoning_content`` which never appear in
    ``messages``, and ``api_kwargs`` can hold nested values such as
    ``extra_body``.  These tests run the two sanitizer helpers directly on
    payloads of that shape.
    """

    def test_api_messages_with_reasoning_content_is_sanitized(self):
        """A non-ASCII ``reasoning_content`` on an API message is stripped."""
        assistant_turn = {
            "role": "assistant",
            "content": "Sure!",
            # Only the API copy carries reasoning_content; the canonical
            # messages list has no such key.
            "reasoning_content": "Let me think \xab step by step \xbb",
        }
        api_messages = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "hi"},
            assistant_turn,
        ]

        changed = _sanitize_messages_non_ascii(api_messages)

        assert changed is True
        cleaned = api_messages[2]["reasoning_content"]
        assert "\xab" not in cleaned
        assert "\xbb" not in cleaned

    def test_api_kwargs_with_non_ascii_extra_body_is_sanitized(self):
        """Non-ASCII text nested under ``extra_body`` in api_kwargs is stripped."""
        extra_body = {
            "system": "Think carefully \u2192 answer",
        }
        api_kwargs = {
            "model": "glm-5.1",
            "messages": [{"role": "user", "content": "ok"}],
            "extra_body": extra_body,
        }

        changed = _sanitize_structure_non_ascii(api_kwargs)

        assert changed is True
        assert "\u2192" not in api_kwargs["extra_body"]["system"]

    def test_messages_clean_but_api_messages_dirty_both_get_sanitized(self):
        """Clean canonical messages do not imply a clean ``api_messages``."""
        messages = [{"role": "user", "content": "hello"}]
        dirty_turn = {
            "role": "assistant",
            "content": "ok",
            "reasoning_content": "step \xab done",
        }
        api_messages = [{"role": "user", "content": "hello"}, dirty_turn]

        # Nothing to strip from the canonical list, so the helper reports False.
        canonical_changed = _sanitize_messages_non_ascii(messages)
        assert canonical_changed is False

        # The API copy carries the dirty reasoning_content and must be cleaned.
        api_changed = _sanitize_messages_non_ascii(api_messages)
        assert api_changed is True
        assert "\xab" not in api_messages[1]["reasoning_content"]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue