From da184439db42a6ac6816d31bb0c2fedd18d93c23 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 7 May 2026 18:52:59 -0700 Subject: [PATCH] execute_code: write sandbox files as UTF-8 on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second Windows-specific sandbox bug (WinError 10106 was the first): once the env-scrub fix let the child start, it immediately failed to import hermes_tools with: SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97 in position 154: invalid start byte Root cause: _execute_local wrote the generated hermes_tools.py stub and the user's script.py via open(path, 'w') without encoding=. On Windows the default text-mode encoding is cp1252 (system locale), which encodes em-dashes (used in the stub's docstrings) as 0x97. Python then decodes source files as UTF-8 (PEP 3120) on import, chokes on 0x97, and the sandbox dies before any tool call. Fix: pass encoding='utf-8' to all four file opens in the code_execution path — the two staging writes in _execute_local (hermes_tools.py + script.py) and the two RPC file-transport reads/writes in the generated remote stub. JSON is ASCII-safe for most payloads but tool results (terminal output, web_extract content) routinely carry non-ASCII. Tests added (4): - test_stub_and_script_writes_specify_utf8 — source grep guard - test_file_rpc_stub_uses_utf8 — generated remote stub check - test_stub_source_roundtrips_through_utf8 — concrete round-trip - test_windows_default_encoding_would_have_failed — negative control (skips on modern Python builds where default is already UTF-8 compatible, but retained for platforms where the regression could return) 24/25 tests pass with Python 3.11 on Windows (the negative control skips because this build's default text encoding round-trips the stub as valid UTF-8 — the fix is still correct, just the corruption path isn't always triggerable). 
--- .../tools/test_code_execution_windows_env.py | 163 ++++++++++++++++++ tools/code_execution_tool.py | 22 ++- 2 files changed, 179 insertions(+), 6 deletions(-) diff --git a/tests/tools/test_code_execution_windows_env.py b/tests/tools/test_code_execution_windows_env.py index 185f662fdd..aca7a31da1 100644 --- a/tests/tools/test_code_execution_windows_env.py +++ b/tests/tools/test_code_execution_windows_env.py @@ -10,6 +10,14 @@ These tests cover ``_scrub_child_env`` directly so they run on every OS — the logic is conditional on a passed-in ``is_windows`` flag, not on the host platform. We also keep a live Winsock smoke test that only runs on a real Windows host. + +Also covers the companion Windows bug: the sandbox writes +``hermes_tools.py`` and ``script.py`` into a temp dir, and those files +must be written as UTF-8 on every platform — the generated stub contains +em-dash/en-dash characters in docstrings, and the default ``open(path, "w")`` +on Windows uses the system locale (cp1252 typically), corrupting those +bytes. The child then fails to import with a SyntaxError: +``'utf-8' codec can't decode byte 0x97``. """ import os @@ -400,3 +408,158 @@ class TestPosixEquivalence: f"Unexpected extra var in windows-mode output: {k} " f"(not in _WINDOWS_ESSENTIAL_ENV_VARS)" ) + + +# --------------------------------------------------------------------------- +# UTF-8 file-write regression test +# --------------------------------------------------------------------------- +# +# The sandbox writes two Python files into a temp dir — the generated +# ``hermes_tools.py`` stub, and the LLM's ``script.py``. Both contain +# non-ASCII characters in practice: the stub has em-dashes in docstrings +# ("``tcp://host:port`` — the parent falls back..."), and user scripts +# routinely contain non-ASCII strings, comments, or Unicode identifiers. +# +# On Windows, ``open(path, "w")`` without encoding= uses the system locale +# (cp1252 on US/UK installs), which cannot encode em-dashes. 
Python then +# tries to decode the file as UTF-8 when importing it (PEP 3120), fails, +# and the sandbox aborts with: +# +# SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97 +# in position N: invalid start byte +# +# This was the *second* Windows-specific bug (WinError 10106 was the first). +# The fix is to always pass ``encoding="utf-8"`` when writing Python source. + + +class TestSandboxWritesUtf8: + """Verify the file-write call sites use UTF-8 explicitly, not the + platform default. We check the source of ``execute_code`` rather + than spawning a real sandbox because the latter needs a full agent + context — but the code inspection is deterministic and fast.""" + + def test_stub_and_script_writes_specify_utf8(self): + """Both ``hermes_tools.py`` and ``script.py`` writes in + ``_execute_local`` must pass ``encoding="utf-8"``.""" + import tools.code_execution_tool as cet + src = open(cet.__file__, encoding="utf-8").read() + + # There should be no ``open(path, "w")`` without encoding= for + # the two staging files. Grep-style check: find every write of + # a .py file inside tmpdir and assert the line also contains + # ``encoding="utf-8"`` within a short window. + import re + pattern = re.compile( + r'open\(\s*os\.path\.join\(\s*tmpdir\s*,\s*"[^"]+\.py"\s*\)\s*,\s*"w"[^)]*\)' + ) + for match in pattern.finditer(src): + line = match.group(0) + assert 'encoding="utf-8"' in line or "encoding='utf-8'" in line, ( + f"Sandbox file write missing encoding=\"utf-8\" on Windows: {line!r}" + ) + + def test_file_rpc_stub_uses_utf8(self): + """The file-based RPC transport stub (used by remote backends) + reads/writes JSON response files. Those must also specify UTF-8 + so non-ASCII tool results survive the round-trip intact.""" + from tools.code_execution_tool import generate_hermes_tools_module + stub = generate_hermes_tools_module(["terminal"], transport="file") + # The generated stub should open response + request files as UTF-8. 
+ assert 'encoding="utf-8"' in stub, ( + "File-based RPC stub does not specify encoding=\"utf-8\" — " + "will corrupt non-ASCII tool results on non-UTF-8 locales." + ) + + def test_stub_source_roundtrips_through_utf8(self): + """Concrete regression: write the generated stub to a temp file + using ``encoding="utf-8"``, then parse it. This is what the + sandbox does, and it must succeed even when the stub contains + em-dashes (which it does — check the transport-header docstring). + """ + from tools.code_execution_tool import generate_hermes_tools_module + import tempfile, ast + stub = generate_hermes_tools_module( + ["terminal", "read_file", "write_file"], transport="uds" + ) + # Sanity: stub actually contains a non-ASCII character, otherwise + # this test wouldn't prove anything meaningful. + non_ascii = [c for c in stub if ord(c) > 127] + assert non_ascii, ( + "Generated stub is pure ASCII — test is meaningless. If the " + "stub's docstrings have lost their em-dashes, update this " + "assertion, but be aware the original regression is no longer " + "covered." + ) + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", delete=False, encoding="utf-8" + ) as f: + f.write(stub) + tmp_path = f.name + + try: + # Re-read and parse exactly like the child Python would. + with open(tmp_path, encoding="utf-8") as fh: + round_tripped = fh.read() + assert round_tripped == stub, "UTF-8 round-trip corrupted the stub" + ast.parse(round_tripped) # must not raise SyntaxError + finally: + os.unlink(tmp_path) + + @pytest.mark.skipif( + sys.platform != "win32", + reason="cp1252 default-encoding regression is Windows-specific", + ) + def test_windows_default_encoding_would_have_failed(self): + """Negative control: prove that on Windows, writing the stub + *without* ``encoding="utf-8"`` would corrupt the file. If this + test ever starts failing (i.e. 
default write succeeds), it means + Python's default encoding has changed and the explicit UTF-8 + requirement may be obsolete — reconsider the fix.""" + from tools.code_execution_tool import generate_hermes_tools_module + import tempfile + + stub = generate_hermes_tools_module(["terminal"], transport="uds") + # Find a non-ASCII character we can use to prove the corruption. + non_ascii = [c for c in stub if ord(c) > 127] + if not non_ascii: + pytest.skip("stub has no non-ASCII chars — nothing to corrupt") + + # Write with default encoding (simulating the old buggy code). + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", delete=False + ) as f: + try: + f.write(stub) + tmp_path = f.name + wrote_successfully = True + except UnicodeEncodeError: + # Default encoding can't even encode it — that's the bug + # in a different form. Still proves the point. + tmp_path = f.name + wrote_successfully = False + + try: + if not wrote_successfully: + # Default-encoding write raised outright. The bug is real. + return + + # Read back as UTF-8 (what Python does on import). + with open(tmp_path, encoding="utf-8") as fh: + try: + fh.read() + # If this succeeds on Windows, the platform default is + # already UTF-8 (e.g. Python 3.15 with UTF-8 mode on). + # In that case the explicit encoding= is belt-and- + # suspenders but no longer strictly required. Skip. + pytest.skip( + "Default text-file encoding is UTF-8-compatible on " + "this Windows build — explicit encoding= is no " + "longer load-bearing, but keep it for belt-and-" + "suspenders." + ) + except UnicodeDecodeError: + # Exactly the failure mode that motivated the fix. 
+ pass + finally: + os.unlink(tmp_path) diff --git a/tools/code_execution_tool.py b/tools/code_execution_tool.py index e115f5dc1f..0ddbd73e42 100644 --- a/tools/code_execution_tool.py +++ b/tools/code_execution_tool.py @@ -390,9 +390,12 @@ def _call(tool_name, args): req_file = os.path.join(_RPC_DIR, f"req_{seq_str}") res_file = os.path.join(_RPC_DIR, f"res_{seq_str}") - # Write request atomically (write to .tmp, then rename) + # Write request atomically (write to .tmp, then rename). + # encoding="utf-8" is critical: on Windows-hosted remote backends + # (or any non-UTF-8 locale) the default open() mode would mangle + # non-ASCII chars in tool args when encoding them as JSON. tmp = req_file + ".tmp" - with open(tmp, "w") as f: + with open(tmp, "w", encoding="utf-8") as f: json.dump({"tool": tool_name, "args": args, "seq": seq}, f) os.rename(tmp, req_file) @@ -405,7 +408,7 @@ def _call(tool_name, args): time.sleep(poll_interval) poll_interval = min(poll_interval * 1.2, 0.25) # Back off to 250ms - with open(res_file) as f: + with open(res_file, encoding="utf-8") as f: raw = f.read() # Clean up response file @@ -1111,15 +1114,22 @@ def execute_code( server_sock = None try: - # Write the auto-generated hermes_tools module + # Write the auto-generated hermes_tools module. + # encoding="utf-8" is required on Windows — the stub and user code + # both contain non-ASCII characters (em-dashes in docstrings, plus + # whatever the user script carries). Python's default open() uses + # the system locale on Windows (cp1252 typically), which corrupts + # those bytes; the child then fails to import with a SyntaxError + # ("'utf-8' codec can't decode byte 0x97 in position ...") because + # Python source files are decoded as UTF-8 by default (PEP 3120). # sandbox_tools is already the correct set (intersection with session # tools, or SANDBOX_ALLOWED_TOOLS as fallback — see lines above). 
tools_src = generate_hermes_tools_module(list(sandbox_tools)) - with open(os.path.join(tmpdir, "hermes_tools.py"), "w") as f: + with open(os.path.join(tmpdir, "hermes_tools.py"), "w", encoding="utf-8") as f: f.write(tools_src) # Write the user's script - with open(os.path.join(tmpdir, "script.py"), "w") as f: + with open(os.path.join(tmpdir, "script.py"), "w", encoding="utf-8") as f: f.write(code) # --- Start RPC server ---