From da184439db42a6ac6816d31bb0c2fedd18d93c23 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 7 May 2026 18:52:59 -0700 Subject: [PATCH] execute_code: write sandbox files as UTF-8 on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second Windows-specific sandbox bug (WinError 10106 was the first): once the env-scrub fix let the child start, it immediately failed to import hermes_tools with: SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97 in position 154: invalid start byte Root cause: _execute_local wrote the generated hermes_tools.py stub and the user's script.py via open(path, 'w') without encoding=. On Windows the default text-mode encoding is cp1252 (system locale), which encodes em-dashes (used in the stub's docstrings) as 0x97. Python then decodes source files as UTF-8 (PEP 3120) on import, chokes on 0x97, and the sandbox dies before any tool call. Fix: pass encoding='utf-8' to all four file opens in the code_execution path — the two staging writes in _execute_local (hermes_tools.py + script.py) and the two RPC file-transport reads/writes in the generated remote stub. JSON is ASCII-safe for most payloads but tool results (terminal output, web_extract content) routinely carry non-ASCII. Tests added (4): - test_stub_and_script_writes_specify_utf8 — source grep guard - test_file_rpc_stub_uses_utf8 — generated remote stub check - test_stub_source_roundtrips_through_utf8 — concrete round-trip - test_windows_default_encoding_would_have_failed — negative control (skips on modern Python builds where default is already UTF-8 compatible, but retained for platforms where the regression could return) 24/25 tests pass with Python 3.11 on Windows (the negative control skips because this build's default text encoding round-trips the stub as valid UTF-8 — the fix is still correct, just the corruption path isn't always triggerable). 
--- .../tools/test_code_execution_windows_env.py | 163 ++++++++++++++++++ tools/code_execution_tool.py | 22 ++- 2 files changed, 179 insertions(+), 6 deletions(-) diff --git a/tests/tools/test_code_execution_windows_env.py b/tests/tools/test_code_execution_windows_env.py index 185f662fdd..aca7a31da1 100644 --- a/tests/tools/test_code_execution_windows_env.py +++ b/tests/tools/test_code_execution_windows_env.py @@ -10,6 +10,14 @@ These tests cover ``_scrub_child_env`` directly so they run on every OS — the logic is conditional on a passed-in ``is_windows`` flag, not on the host platform. We also keep a live Winsock smoke test that only runs on a real Windows host. + +Also covers the companion Windows bug: the sandbox writes +``hermes_tools.py`` and ``script.py`` into a temp dir, and those files +must be written as UTF-8 on every platform — the generated stub contains +em-dash/en-dash characters in docstrings, and the default ``open(path, "w")`` +on Windows uses the system locale (cp1252 typically), corrupting those +bytes. The child then fails to import with a SyntaxError: +``'utf-8' codec can't decode byte 0x97``. """ import os @@ -400,3 +408,158 @@ class TestPosixEquivalence: f"Unexpected extra var in windows-mode output: {k} " f"(not in _WINDOWS_ESSENTIAL_ENV_VARS)" ) + + +# --------------------------------------------------------------------------- +# UTF-8 file-write regression test +# --------------------------------------------------------------------------- +# +# The sandbox writes two Python files into a temp dir — the generated +# ``hermes_tools.py`` stub, and the LLM's ``script.py``. Both contain +# non-ASCII characters in practice: the stub has em-dashes in docstrings +# ("``tcp://host:port`` — the parent falls back..."), and user scripts +# routinely contain non-ASCII strings, comments, or Unicode identifiers. +# +# On Windows, ``open(path, "w")`` without encoding= uses the system locale +# (cp1252 on US/UK installs), which cannot encode em-dashes. 
Python then +# tries to decode the file as UTF-8 when importing it (PEP 3120), fails, +# and the sandbox aborts with: +# +# SyntaxError: (unicode error) 'utf-8' codec can't decode byte 0x97 +# in position N: invalid start byte +# +# This was the *second* Windows-specific bug (WinError 10106 was the first). +# The fix is to always pass ``encoding="utf-8"`` when writing Python source. + + +class TestSandboxWritesUtf8: + """Verify the file-write call sites use UTF-8 explicitly, not the + platform default. We check the source of ``execute_code`` rather + than spawning a real sandbox because the latter needs a full agent + context — but the code inspection is deterministic and fast.""" + + def test_stub_and_script_writes_specify_utf8(self): + """Both ``hermes_tools.py`` and ``script.py`` writes in + ``_execute_local`` must pass ``encoding="utf-8"``.""" + import tools.code_execution_tool as cet + src = open(cet.__file__, encoding="utf-8").read() + + # There should be no ``open(path, "w")`` without encoding= for + # the two staging files. Grep-style check: find every write of + # a .py file inside tmpdir and assert the line also contains + # ``encoding="utf-8"`` within a short window. + import re + pattern = re.compile( + r'open\(\s*os\.path\.join\(\s*tmpdir\s*,\s*"[^"]+\.py"\s*\)\s*,\s*"w"[^)]*\)' + ) + for match in pattern.finditer(src): + line = match.group(0) + assert 'encoding="utf-8"' in line or "encoding='utf-8'" in line, ( + f"Sandbox file write missing encoding=\"utf-8\" on Windows: {line!r}" + ) + + def test_file_rpc_stub_uses_utf8(self): + """The file-based RPC transport stub (used by remote backends) + reads/writes JSON response files. Those must also specify UTF-8 + so non-ASCII tool results survive the round-trip intact.""" + from tools.code_execution_tool import generate_hermes_tools_module + stub = generate_hermes_tools_module(["terminal"], transport="file") + # The generated stub should open response + request files as UTF-8. 
+ assert 'encoding="utf-8"' in stub, ( + "File-based RPC stub does not specify encoding=\"utf-8\" — " + "will corrupt non-ASCII tool results on non-UTF-8 locales." + ) + + def test_stub_source_roundtrips_through_utf8(self): + """Concrete regression: write the generated stub to a temp file + using ``encoding="utf-8"``, then parse it. This is what the + sandbox does, and it must succeed even when the stub contains + em-dashes (which it does — check the transport-header docstring). + """ + from tools.code_execution_tool import generate_hermes_tools_module + import tempfile, ast + stub = generate_hermes_tools_module( + ["terminal", "read_file", "write_file"], transport="uds" + ) + # Sanity: stub actually contains a non-ASCII character, otherwise + # this test wouldn't prove anything meaningful. + non_ascii = [c for c in stub if ord(c) > 127] + assert non_ascii, ( + "Generated stub is pure ASCII — test is meaningless. If the " + "stub's docstrings have lost their em-dashes, update this " + "assertion, but be aware the original regression is no longer " + "covered." + ) + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", delete=False, encoding="utf-8" + ) as f: + f.write(stub) + tmp_path = f.name + + try: + # Re-read and parse exactly like the child Python would. + with open(tmp_path, encoding="utf-8") as fh: + round_tripped = fh.read() + assert round_tripped == stub, "UTF-8 round-trip corrupted the stub" + ast.parse(round_tripped) # must not raise SyntaxError + finally: + os.unlink(tmp_path) + + @pytest.mark.skipif( + sys.platform != "win32", + reason="cp1252 default-encoding regression is Windows-specific", + ) + def test_windows_default_encoding_would_have_failed(self): + """Negative control: prove that on Windows, writing the stub + *without* ``encoding="utf-8"`` would corrupt the file. If this + test ever starts failing (i.e. 
default write succeeds), it means + Python's default encoding has changed and the explicit UTF-8 + requirement may be obsolete — reconsider the fix.""" + from tools.code_execution_tool import generate_hermes_tools_module + import tempfile + + stub = generate_hermes_tools_module(["terminal"], transport="uds") + # Find a non-ASCII character we can use to prove the corruption. + non_ascii = [c for c in stub if ord(c) > 127] + if not non_ascii: + pytest.skip("stub has no non-ASCII chars — nothing to corrupt") + + # Write with default encoding (simulating the old buggy code). + with tempfile.NamedTemporaryFile( + mode="w", suffix=".py", delete=False + ) as f: + try: + f.write(stub) + tmp_path = f.name + wrote_successfully = True + except UnicodeEncodeError: + # Default encoding can't even encode it — that's the bug + # in a different form. Still proves the point. + tmp_path = f.name + wrote_successfully = False + + try: + if not wrote_successfully: + # Default-encoding write raised outright. The bug is real. + return + + # Read back as UTF-8 (what Python does on import). + with open(tmp_path, encoding="utf-8") as fh: + try: + fh.read() + # If this succeeds on Windows, the platform default is + # already UTF-8 (e.g. Python 3.15 with UTF-8 mode on). + # In that case the explicit encoding= is belt-and- + # suspenders but no longer strictly required. Skip. + pytest.skip( + "Default text-file encoding is UTF-8-compatible on " + "this Windows build — explicit encoding= is no " + "longer load-bearing, but keep it for belt-and-" + "suspenders." + ) + except UnicodeDecodeError: + # Exactly the failure mode that motivated the fix. 
+ pass + finally: + os.unlink(tmp_path) diff --git a/tools/code_execution_tool.py b/tools/code_execution_tool.py index e115f5dc1f..0ddbd73e42 100644 --- a/tools/code_execution_tool.py +++ b/tools/code_execution_tool.py @@ -390,9 +390,12 @@ def _call(tool_name, args): req_file = os.path.join(_RPC_DIR, f"req_{seq_str}") res_file = os.path.join(_RPC_DIR, f"res_{seq_str}") - # Write request atomically (write to .tmp, then rename) + # Write request atomically (write to .tmp, then rename). + # encoding="utf-8" is critical: on Windows-hosted remote backends + # (or any non-UTF-8 locale) the default open() mode would mangle + # non-ASCII chars in tool args when encoding them as JSON. tmp = req_file + ".tmp" - with open(tmp, "w") as f: + with open(tmp, "w", encoding="utf-8") as f: json.dump({"tool": tool_name, "args": args, "seq": seq}, f) os.rename(tmp, req_file) @@ -405,7 +408,7 @@ def _call(tool_name, args): time.sleep(poll_interval) poll_interval = min(poll_interval * 1.2, 0.25) # Back off to 250ms - with open(res_file) as f: + with open(res_file, encoding="utf-8") as f: raw = f.read() # Clean up response file @@ -1111,15 +1114,22 @@ def execute_code( server_sock = None try: - # Write the auto-generated hermes_tools module + # Write the auto-generated hermes_tools module. + # encoding="utf-8" is required on Windows — the stub and user code + # both contain non-ASCII characters (em-dashes in docstrings, plus + # whatever the user script carries). Python's default open() uses + # the system locale on Windows (cp1252 typically), which corrupts + # those bytes; the child then fails to import with a SyntaxError + # ("'utf-8' codec can't decode byte 0x97 in position ...") because + # Python source files are decoded as UTF-8 by default (PEP 3120). # sandbox_tools is already the correct set (intersection with session # tools, or SANDBOX_ALLOWED_TOOLS as fallback — see lines above). 
tools_src = generate_hermes_tools_module(list(sandbox_tools)) - with open(os.path.join(tmpdir, "hermes_tools.py"), "w") as f: + with open(os.path.join(tmpdir, "hermes_tools.py"), "w", encoding="utf-8") as f: f.write(tools_src) # Write the user's script - with open(os.path.join(tmpdir, "script.py"), "w") as f: + with open(os.path.join(tmpdir, "script.py"), "w", encoding="utf-8") as f: f.write(code) # --- Start RPC server ---