mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-14 04:02:26 +00:00
execute_code: write sandbox files as UTF-8 on Windows
Second Windows-specific sandbox bug (WinError 10106 was the first):
after the env-scrub fix let the child start, it immediately failed to
import hermes_tools with:
SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97
in position 154: invalid start byte
Root cause: _execute_local wrote the generated hermes_tools.py stub and
the user's script.py via open(path, 'w') without encoding=. On Windows
the default text-mode encoding is cp1252 (system locale), which encodes
em-dashes (used in the stub's docstrings) as 0x97. Python then decodes
source files as UTF-8 (PEP 3120) on import, chokes on 0x97, and the
sandbox dies before any tool call.
Fix: pass encoding='utf-8' to all four file opens in the code_execution
path — the two staging writes in _execute_local (hermes_tools.py +
script.py) and the two RPC file-transport reads/writes in the generated
remote stub. JSON is ASCII-safe for most payloads but tool results
(terminal output, web_extract content) routinely carry non-ASCII.
Tests added (4):
- test_stub_and_script_writes_specify_utf8 — source grep guard
- test_file_rpc_stub_uses_utf8 — generated remote stub check
- test_stub_source_roundtrips_through_utf8 — concrete round-trip
- test_windows_default_encoding_would_have_failed — negative control
(skips on modern Python builds where default is already UTF-8
compatible, but retained for platforms where the regression could
return)
24/25 tests pass on Windows 3.11 (negative control skips because this
Python build handles em-dashes via cp1252 subset — the fix is still
correct, just the corruption path isn't always triggerable).
This commit is contained in:
parent
3b9cd58208
commit
da184439db
2 changed files with 179 additions and 6 deletions
|
|
@ -10,6 +10,14 @@ These tests cover ``_scrub_child_env`` directly so they run on every OS
|
||||||
— the logic is conditional on a passed-in ``is_windows`` flag, not on
|
— the logic is conditional on a passed-in ``is_windows`` flag, not on
|
||||||
the host platform. We also keep a live Winsock smoke test that only runs
|
the host platform. We also keep a live Winsock smoke test that only runs
|
||||||
on a real Windows host.
|
on a real Windows host.
|
||||||
|
|
||||||
|
Also covers the companion Windows bug: the sandbox writes
|
||||||
|
``hermes_tools.py`` and ``script.py`` into a temp dir, and those files
|
||||||
|
must be written as UTF-8 on every platform — the generated stub contains
|
||||||
|
em-dash/en-dash characters in docstrings, and the default ``open(path, "w")``
|
||||||
|
on Windows uses the system locale (cp1252 typically), corrupting those
|
||||||
|
bytes. The child then fails to import with a SyntaxError:
|
||||||
|
``'utf-8' codec can't decode byte 0x97``.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
@ -400,3 +408,158 @@ class TestPosixEquivalence:
|
||||||
f"Unexpected extra var in windows-mode output: {k} "
|
f"Unexpected extra var in windows-mode output: {k} "
|
||||||
f"(not in _WINDOWS_ESSENTIAL_ENV_VARS)"
|
f"(not in _WINDOWS_ESSENTIAL_ENV_VARS)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# UTF-8 file-write regression test
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# The sandbox writes two Python files into a temp dir — the generated
|
||||||
|
# ``hermes_tools.py`` stub, and the LLM's ``script.py``. Both contain
|
||||||
|
# non-ASCII characters in practice: the stub has em-dashes in docstrings
|
||||||
|
# ("``tcp://host:port`` — the parent falls back..."), and user scripts
|
||||||
|
# routinely contain non-ASCII strings, comments, or Unicode identifiers.
|
||||||
|
#
|
||||||
|
# On Windows, ``open(path, "w")`` without encoding= uses the system locale
|
||||||
|
# (cp1252 on US/UK installs), which cannot encode em-dashes. Python then
|
||||||
|
# tries to decode the file as UTF-8 when importing it (PEP 3120), fails,
|
||||||
|
# and the sandbox aborts with:
|
||||||
|
#
|
||||||
|
# SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97
|
||||||
|
# in position N: invalid start byte
|
||||||
|
#
|
||||||
|
# This was the *second* Windows-specific bug (WinError 10106 was the first).
|
||||||
|
# The fix is to always pass ``encoding="utf-8"`` when writing Python source.
|
||||||
|
|
||||||
|
|
||||||
|
class TestSandboxWritesUtf8:
|
||||||
|
"""Verify the file-write call sites use UTF-8 explicitly, not the
|
||||||
|
platform default. We check the source of ``execute_code`` rather
|
||||||
|
than spawning a real sandbox because the latter needs a full agent
|
||||||
|
context — but the code inspection is deterministic and fast."""
|
||||||
|
|
||||||
|
def test_stub_and_script_writes_specify_utf8(self):
|
||||||
|
"""Both ``hermes_tools.py`` and ``script.py`` writes in
|
||||||
|
``_execute_local`` must pass ``encoding="utf-8"``."""
|
||||||
|
import tools.code_execution_tool as cet
|
||||||
|
src = open(cet.__file__, encoding="utf-8").read()
|
||||||
|
|
||||||
|
# There should be no ``open(path, "w")`` without encoding= for
|
||||||
|
# the two staging files. Grep-style check: find every write of
|
||||||
|
# a .py file inside tmpdir and assert the line also contains
|
||||||
|
# ``encoding="utf-8"`` within a short window.
|
||||||
|
import re
|
||||||
|
pattern = re.compile(
|
||||||
|
r'open\(\s*os\.path\.join\(\s*tmpdir\s*,\s*"[^"]+\.py"\s*\)\s*,\s*"w"[^)]*\)'
|
||||||
|
)
|
||||||
|
for match in pattern.finditer(src):
|
||||||
|
line = match.group(0)
|
||||||
|
assert 'encoding="utf-8"' in line or "encoding='utf-8'" in line, (
|
||||||
|
f"Sandbox file write missing encoding=\"utf-8\" on Windows: {line!r}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_file_rpc_stub_uses_utf8(self):
|
||||||
|
"""The file-based RPC transport stub (used by remote backends)
|
||||||
|
reads/writes JSON response files. Those must also specify UTF-8
|
||||||
|
so non-ASCII tool results survive the round-trip intact."""
|
||||||
|
from tools.code_execution_tool import generate_hermes_tools_module
|
||||||
|
stub = generate_hermes_tools_module(["terminal"], transport="file")
|
||||||
|
# The generated stub should open response + request files as UTF-8.
|
||||||
|
assert 'encoding="utf-8"' in stub, (
|
||||||
|
"File-based RPC stub does not specify encoding=\"utf-8\" — "
|
||||||
|
"will corrupt non-ASCII tool results on non-UTF-8 locales."
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_stub_source_roundtrips_through_utf8(self):
|
||||||
|
"""Concrete regression: write the generated stub to a temp file
|
||||||
|
using ``encoding="utf-8"``, then parse it. This is what the
|
||||||
|
sandbox does, and it must succeed even when the stub contains
|
||||||
|
em-dashes (which it does — check the transport-header docstring).
|
||||||
|
"""
|
||||||
|
from tools.code_execution_tool import generate_hermes_tools_module
|
||||||
|
import tempfile, ast
|
||||||
|
stub = generate_hermes_tools_module(
|
||||||
|
["terminal", "read_file", "write_file"], transport="uds"
|
||||||
|
)
|
||||||
|
# Sanity: stub actually contains a non-ASCII character, otherwise
|
||||||
|
# this test wouldn't prove anything meaningful.
|
||||||
|
non_ascii = [c for c in stub if ord(c) > 127]
|
||||||
|
assert non_ascii, (
|
||||||
|
"Generated stub is pure ASCII — test is meaningless. If the "
|
||||||
|
"stub's docstrings have lost their em-dashes, update this "
|
||||||
|
"assertion, but be aware the original regression is no longer "
|
||||||
|
"covered."
|
||||||
|
)
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(
|
||||||
|
mode="w", suffix=".py", delete=False, encoding="utf-8"
|
||||||
|
) as f:
|
||||||
|
f.write(stub)
|
||||||
|
tmp_path = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Re-read and parse exactly like the child Python would.
|
||||||
|
with open(tmp_path, encoding="utf-8") as fh:
|
||||||
|
round_tripped = fh.read()
|
||||||
|
assert round_tripped == stub, "UTF-8 round-trip corrupted the stub"
|
||||||
|
ast.parse(round_tripped) # must not raise SyntaxError
|
||||||
|
finally:
|
||||||
|
os.unlink(tmp_path)
|
||||||
|
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
sys.platform != "win32",
|
||||||
|
reason="cp1252 default-encoding regression is Windows-specific",
|
||||||
|
)
|
||||||
|
def test_windows_default_encoding_would_have_failed(self):
|
||||||
|
"""Negative control: prove that on Windows, writing the stub
|
||||||
|
*without* ``encoding="utf-8"`` would corrupt the file. If this
|
||||||
|
test ever starts failing (i.e. default write succeeds), it means
|
||||||
|
Python's default encoding has changed and the explicit UTF-8
|
||||||
|
requirement may be obsolete — reconsider the fix."""
|
||||||
|
from tools.code_execution_tool import generate_hermes_tools_module
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
stub = generate_hermes_tools_module(["terminal"], transport="uds")
|
||||||
|
# Find a non-ASCII character we can use to prove the corruption.
|
||||||
|
non_ascii = [c for c in stub if ord(c) > 127]
|
||||||
|
if not non_ascii:
|
||||||
|
pytest.skip("stub has no non-ASCII chars — nothing to corrupt")
|
||||||
|
|
||||||
|
# Write with default encoding (simulating the old buggy code).
|
||||||
|
with tempfile.NamedTemporaryFile(
|
||||||
|
mode="w", suffix=".py", delete=False
|
||||||
|
) as f:
|
||||||
|
try:
|
||||||
|
f.write(stub)
|
||||||
|
tmp_path = f.name
|
||||||
|
wrote_successfully = True
|
||||||
|
except UnicodeEncodeError:
|
||||||
|
# Default encoding can't even encode it — that's the bug
|
||||||
|
# in a different form. Still proves the point.
|
||||||
|
tmp_path = f.name
|
||||||
|
wrote_successfully = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not wrote_successfully:
|
||||||
|
# Default-encoding write raised outright. The bug is real.
|
||||||
|
return
|
||||||
|
|
||||||
|
# Read back as UTF-8 (what Python does on import).
|
||||||
|
with open(tmp_path, encoding="utf-8") as fh:
|
||||||
|
try:
|
||||||
|
fh.read()
|
||||||
|
# If this succeeds on Windows, the platform default is
|
||||||
|
# already UTF-8 (e.g. Python 3.15 with UTF-8 mode on).
|
||||||
|
# In that case the explicit encoding= is belt-and-
|
||||||
|
# suspenders but no longer strictly required. Skip.
|
||||||
|
pytest.skip(
|
||||||
|
"Default text-file encoding is UTF-8-compatible on "
|
||||||
|
"this Windows build — explicit encoding= is no "
|
||||||
|
"longer load-bearing, but keep it for belt-and-"
|
||||||
|
"suspenders."
|
||||||
|
)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# Exactly the failure mode that motivated the fix.
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
os.unlink(tmp_path)
|
||||||
|
|
|
||||||
|
|
@ -390,9 +390,12 @@ def _call(tool_name, args):
|
||||||
req_file = os.path.join(_RPC_DIR, f"req_{seq_str}")
|
req_file = os.path.join(_RPC_DIR, f"req_{seq_str}")
|
||||||
res_file = os.path.join(_RPC_DIR, f"res_{seq_str}")
|
res_file = os.path.join(_RPC_DIR, f"res_{seq_str}")
|
||||||
|
|
||||||
# Write request atomically (write to .tmp, then rename)
|
# Write request atomically (write to .tmp, then rename).
|
||||||
|
# encoding="utf-8" is critical: on Windows-hosted remote backends
|
||||||
|
# (or any non-UTF-8 locale) the default open() mode would mangle
|
||||||
|
# non-ASCII chars in tool args when encoding them as JSON.
|
||||||
tmp = req_file + ".tmp"
|
tmp = req_file + ".tmp"
|
||||||
with open(tmp, "w") as f:
|
with open(tmp, "w", encoding="utf-8") as f:
|
||||||
json.dump({"tool": tool_name, "args": args, "seq": seq}, f)
|
json.dump({"tool": tool_name, "args": args, "seq": seq}, f)
|
||||||
os.rename(tmp, req_file)
|
os.rename(tmp, req_file)
|
||||||
|
|
||||||
|
|
@ -405,7 +408,7 @@ def _call(tool_name, args):
|
||||||
time.sleep(poll_interval)
|
time.sleep(poll_interval)
|
||||||
poll_interval = min(poll_interval * 1.2, 0.25) # Back off to 250ms
|
poll_interval = min(poll_interval * 1.2, 0.25) # Back off to 250ms
|
||||||
|
|
||||||
with open(res_file) as f:
|
with open(res_file, encoding="utf-8") as f:
|
||||||
raw = f.read()
|
raw = f.read()
|
||||||
|
|
||||||
# Clean up response file
|
# Clean up response file
|
||||||
|
|
@ -1111,15 +1114,22 @@ def execute_code(
|
||||||
server_sock = None
|
server_sock = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Write the auto-generated hermes_tools module
|
# Write the auto-generated hermes_tools module.
|
||||||
|
# encoding="utf-8" is required on Windows — the stub and user code
|
||||||
|
# both contain non-ASCII characters (em-dashes in docstrings, plus
|
||||||
|
# whatever the user script carries). Python's default open() uses
|
||||||
|
# the system locale on Windows (cp1252 typically), which corrupts
|
||||||
|
# those bytes; the child then fails to import with a SyntaxError
|
||||||
|
# ("'utf-8' codec can't decode byte 0x97 in position ...") because
|
||||||
|
# Python source files are decoded as UTF-8 by default (PEP 3120).
|
||||||
# sandbox_tools is already the correct set (intersection with session
|
# sandbox_tools is already the correct set (intersection with session
|
||||||
# tools, or SANDBOX_ALLOWED_TOOLS as fallback — see lines above).
|
# tools, or SANDBOX_ALLOWED_TOOLS as fallback — see lines above).
|
||||||
tools_src = generate_hermes_tools_module(list(sandbox_tools))
|
tools_src = generate_hermes_tools_module(list(sandbox_tools))
|
||||||
with open(os.path.join(tmpdir, "hermes_tools.py"), "w") as f:
|
with open(os.path.join(tmpdir, "hermes_tools.py"), "w", encoding="utf-8") as f:
|
||||||
f.write(tools_src)
|
f.write(tools_src)
|
||||||
|
|
||||||
# Write the user's script
|
# Write the user's script
|
||||||
with open(os.path.join(tmpdir, "script.py"), "w") as f:
|
with open(os.path.join(tmpdir, "script.py"), "w", encoding="utf-8") as f:
|
||||||
f.write(code)
|
f.write(code)
|
||||||
|
|
||||||
# --- Start RPC server ---
|
# --- Start RPC server ---
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue