execute_code: pass through Windows OS-essential env vars

The sandbox's env scrubbing was dropping SYSTEMROOT, WINDIR, COMSPEC, APPDATA, etc. On Windows this broke the child process before any RPC could happen: OSError: [WinError 10106] The requested service provider could not be loaded or initialized Python's socket module uses SYSTEMROOT to locate mswsock.dll during Winsock initialization. Without it, socket.socket(AF_INET, SOCK_STREAM) fails — and the existing loopback-TCP fallback for Windows couldn't work. Fix: add a small Windows-only allowlist (_WINDOWS_ESSENTIAL_ENV_VARS) matched by exact uppercase name, after the existing secret-substring block. The secret block still runs first, so the allowlist cannot be used to exfiltrate credentials. Also extract the env scrubber into a testable helper (_scrub_child_env) that takes is_windows as a parameter, so the logic can be unit-tested on any OS. Live Winsock smoke test verifies that a child spawned with the scrubbed env can now create an AF_INET socket on a real Windows host; the test is guarded by sys.platform == 'win32' so POSIX CI stays green.
2026-06-24 10:52:21 +00:00 · 2026-05-07 18:39:38 -07:00 · 2026-05-07 18:39:38 -07:00 · fab984c7f8
commit fab984c7f8
parent f0d2516a30
2 changed files with 327 additions and 23 deletions
--- a/tests/tools/test_code_execution_windows_env.py
+++ b/tests/tools/test_code_execution_windows_env.py
@ -0,0 +1,243 @@
+"""Tests for execute_code env scrubbing on Windows.
+
+On Windows the child process needs a small set of OS-essential env vars
+(SYSTEMROOT, WINDIR, COMSPEC, ...) to run.  Without SYSTEMROOT in particular,
+``socket.socket(AF_INET, SOCK_STREAM)`` fails inside the sandbox with
+WinError 10106 (Winsock can't locate mswsock.dll) and no tool call over
+loopback TCP can ever succeed.
+
+These tests cover ``_scrub_child_env`` directly so they run on every OS
+— the logic is conditional on a passed-in ``is_windows`` flag, not on
+the host platform.  We also keep a live Winsock smoke test that only runs
+on a real Windows host.
+"""
+
+import os
+import socket
+import subprocess
+import sys
+import textwrap
+import unittest.mock as mock
+
+import pytest
+
+from tools.code_execution_tool import (
+    _SAFE_ENV_PREFIXES,
+    _SECRET_SUBSTRINGS,
+    _WINDOWS_ESSENTIAL_ENV_VARS,
+    _scrub_child_env,
+)
+
+
+def _no_passthrough(_name):
+    return False
+
+
+class TestWindowsEssentialAllowlist:
+    """The allowlist itself — contents, shape, and invariants."""
+
+    def test_contains_winsock_required_vars(self):
+        # Without SYSTEMROOT the child cannot initialize Winsock.
+        assert "SYSTEMROOT" in _WINDOWS_ESSENTIAL_ENV_VARS
+
+    def test_contains_subprocess_required_vars(self):
+        # Without COMSPEC, subprocess can't resolve the default shell.
+        assert "COMSPEC" in _WINDOWS_ESSENTIAL_ENV_VARS
+
+    def test_contains_user_profile_vars(self):
+        # os.path.expanduser("~") on Windows uses USERPROFILE.
+        assert "USERPROFILE" in _WINDOWS_ESSENTIAL_ENV_VARS
+        assert "APPDATA" in _WINDOWS_ESSENTIAL_ENV_VARS
+        assert "LOCALAPPDATA" in _WINDOWS_ESSENTIAL_ENV_VARS
+
+    def test_contains_only_uppercase_names(self):
+        # Windows env var names are case-insensitive but we canonicalize to
+        # uppercase for the membership check (``k.upper() in _WINDOWS_...``).
+        for name in _WINDOWS_ESSENTIAL_ENV_VARS:
+            assert name == name.upper(), f"{name!r} should be uppercase"
+
+    def test_no_overlap_with_secret_substrings(self):
+        # Sanity: none of the essential OS vars should look like secrets.
+        # If this ever fires, we'd have a precedence ordering bug (secrets
+        # are blocked *before* the essentials check).
+        for name in _WINDOWS_ESSENTIAL_ENV_VARS:
+            assert not any(s in name for s in _SECRET_SUBSTRINGS), (
+                f"{name!r} looks secret-like — would be blocked before the "
+                "essentials allowlist can match"
+            )
+
+
+class TestScrubChildEnvWindows:
+    """Verify _scrub_child_env passes Windows essentials through when
+    is_windows=True and blocks them when is_windows=False (so POSIX hosts
+    don't inherit pointless Windows vars)."""
+
+    def _sample_windows_env(self):
+        """A realistic subset of what os.environ looks like on Windows."""
+        return {
+            "SYSTEMROOT": r"C:\Windows",
+            "SystemDrive": "C:",        # Windows preserves native case
+            "WINDIR": r"C:\Windows",
+            "ComSpec": r"C:\Windows\System32\cmd.exe",
+            "PATHEXT": ".COM;.EXE;.BAT;.CMD;.PY",
+            "USERPROFILE": r"C:\Users\alice",
+            "APPDATA": r"C:\Users\alice\AppData\Roaming",
+            "LOCALAPPDATA": r"C:\Users\alice\AppData\Local",
+            "PATH": r"C:\Windows\System32;C:\Python311",
+            "HOME": r"C:\Users\alice",
+            "TEMP": r"C:\Users\alice\AppData\Local\Temp",
+            # Should still be blocked:
+            "OPENAI_API_KEY": "sk-secret",
+            "GITHUB_TOKEN": "ghp_secret",
+            "MY_PASSWORD": "hunter2",
+            # Not matched by any rule — should be dropped on both OSes:
+            "RANDOM_UNKNOWN_VAR": "value",
+        }
+
+    def test_windows_essentials_passed_through_when_is_windows_true(self):
+        env = self._sample_windows_env()
+        scrubbed = _scrub_child_env(env,
+                                    is_passthrough=_no_passthrough,
+                                    is_windows=True)
+
+        # Every essential var from the sample env should survive.
+        assert scrubbed["SYSTEMROOT"] == r"C:\Windows"
+        assert scrubbed["SystemDrive"] == "C:"  # case preserved
+        assert scrubbed["WINDIR"] == r"C:\Windows"
+        assert scrubbed["ComSpec"] == r"C:\Windows\System32\cmd.exe"
+        assert scrubbed["PATHEXT"] == ".COM;.EXE;.BAT;.CMD;.PY"
+        assert scrubbed["USERPROFILE"] == r"C:\Users\alice"
+        assert scrubbed["APPDATA"].endswith("Roaming")
+        assert scrubbed["LOCALAPPDATA"].endswith("Local")
+
+        # Safe-prefix vars still pass (baseline behavior).
+        assert "PATH" in scrubbed
+        assert "HOME" in scrubbed
+        assert "TEMP" in scrubbed
+
+    def test_secrets_still_blocked_on_windows(self):
+        """The Windows allowlist must NOT defeat the secret-substring block.
+
+        This is the key security invariant: essentials are allowed by
+        *exact name*, and the secret-substring block runs before the
+        essentials check anyway, so a variable named e.g. ``API_KEY`` can
+        never sneak through just because we added Windows support.
+        """
+        env = self._sample_windows_env()
+        scrubbed = _scrub_child_env(env,
+                                    is_passthrough=_no_passthrough,
+                                    is_windows=True)
+        assert "OPENAI_API_KEY" not in scrubbed
+        assert "GITHUB_TOKEN" not in scrubbed
+        assert "MY_PASSWORD" not in scrubbed
+
+    def test_unknown_vars_still_dropped_on_windows(self):
+        env = self._sample_windows_env()
+        scrubbed = _scrub_child_env(env,
+                                    is_passthrough=_no_passthrough,
+                                    is_windows=True)
+        assert "RANDOM_UNKNOWN_VAR" not in scrubbed
+
+    def test_essentials_blocked_when_is_windows_false(self):
+        """On POSIX hosts, Windows-specific vars should not pass — they
+        have no meaning and could confuse child tooling."""
+        env = self._sample_windows_env()
+        scrubbed = _scrub_child_env(env,
+                                    is_passthrough=_no_passthrough,
+                                    is_windows=False)
+        # Safe prefixes still match (PATH, HOME, TEMP).
+        assert "PATH" in scrubbed
+        assert "HOME" in scrubbed
+        assert "TEMP" in scrubbed
+        # But Windows OS vars should be dropped.
+        assert "SYSTEMROOT" not in scrubbed
+        assert "WINDIR" not in scrubbed
+        assert "ComSpec" not in scrubbed
+        assert "APPDATA" not in scrubbed
+
+    def test_case_insensitive_essential_match(self):
+        """Windows env var names are case-insensitive at the OS level but
+        Python preserves whatever case os.environ reported.  The scrubber
+        must normalize to uppercase for the membership check."""
+        env = {
+            "SystemRoot": r"C:\Windows",       # mixed case
+            "comspec": r"C:\Windows\System32\cmd.exe",  # lowercase
+            "APPDATA": r"C:\Users\x\AppData\Roaming",   # uppercase
+        }
+        scrubbed = _scrub_child_env(env,
+                                    is_passthrough=_no_passthrough,
+                                    is_windows=True)
+        assert "SystemRoot" in scrubbed
+        assert "comspec" in scrubbed
+        assert "APPDATA" in scrubbed
+
+
+class TestScrubChildEnvPassthroughInteraction:
+    """The passthrough hook runs *before* the secret block, so a skill
+    can legitimately forward a third-party API key.  The Windows
+    essentials addition must not interfere with that."""
+
+    def test_passthrough_wins_over_secret_block(self):
+        env = {"TENOR_API_KEY": "x", "PATH": "/bin"}
+        scrubbed = _scrub_child_env(env,
+                                    is_passthrough=lambda k: k == "TENOR_API_KEY",
+                                    is_windows=False)
+        assert scrubbed.get("TENOR_API_KEY") == "x"
+        assert scrubbed.get("PATH") == "/bin"
+
+    def test_passthrough_still_works_on_windows(self):
+        env = {
+            "TENOR_API_KEY": "x",
+            "SYSTEMROOT": r"C:\Windows",
+            "OPENAI_API_KEY": "sk-secret",  # not passthrough
+        }
+        scrubbed = _scrub_child_env(
+            env,
+            is_passthrough=lambda k: k == "TENOR_API_KEY",
+            is_windows=True,
+        )
+        assert scrubbed.get("TENOR_API_KEY") == "x"
+        assert scrubbed.get("SYSTEMROOT") == r"C:\Windows"
+        assert "OPENAI_API_KEY" not in scrubbed
+
+
+@pytest.mark.skipif(
+    sys.platform != "win32",
+    reason="Winsock-specific regression — only meaningful on Windows",
+)
+class TestWindowsSocketSmokeTest:
+    """Integration-ish smoke test: spawn a child Python with a scrubbed
+    env and confirm it can create an AF_INET socket.  This is the
+    regression that motivated the fix — without SYSTEMROOT the child
+    hits WinError 10106 before any RPC is attempted."""
+
+    def test_child_can_create_socket_with_scrubbed_env(self):
+        scrubbed = _scrub_child_env(os.environ, is_passthrough=_no_passthrough)
+
+        # Build a tiny child script that simply opens an AF_INET socket.
+        script = textwrap.dedent("""
+            import socket, sys
+            try:
+                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                s.close()
+                print("OK")
+                sys.exit(0)
+            except OSError as exc:
+                print(f"FAIL: {exc}")
+                sys.exit(1)
+        """).strip()
+
+        result = subprocess.run(
+            [sys.executable, "-c", script],
+            env=scrubbed,
+            capture_output=True,
+            text=True,
+            timeout=15,
+        )
+        assert result.returncode == 0, (
+            f"Child failed to create socket with scrubbed env:\n"
+            f"  stdout={result.stdout!r}\n"
+            f"  stderr={result.stderr!r}\n"
+            f"  scrubbed keys={sorted(scrubbed.keys())}"
+        )
+        assert "OK" in result.stdout
--- a/tools/code_execution_tool.py
+++ b/tools/code_execution_tool.py
@ -73,6 +73,85 @@ DEFAULT_MAX_TOOL_CALLS = 50
 MAX_STDOUT_BYTES = 50_000    # 50 KB
 MAX_STDERR_BYTES = 10_000    # 10 KB

+# Environment variable scrubbing rules (shared between the local + remote
+# backends).  Secret-substring block is applied first; anything left must
+# match either a safe prefix or, on Windows, an OS-essential name.
+_SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
+                      "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
+                      "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA",
+                      "HERMES_")
+_SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
+                      "PASSWD", "AUTH")
+
+# Windows-only: a handful of variables are required by the OS/CRT itself.
+# Without them, even stdlib calls like ``socket.socket()`` fail with
+# WinError 10106 (Winsock can't locate mswsock.dll) and ``subprocess``
+# can't resolve cmd.exe.  These are well-known OS paths, not secrets, so
+# we allow them through by exact name.  The _SECRET_SUBSTRINGS block
+# still runs as a safety net (none of these names match those substrings).
+_WINDOWS_ESSENTIAL_ENV_VARS = frozenset({
+    "SYSTEMROOT",       # %SYSTEMROOT%\System32 — Winsock needs this
+    "SYSTEMDRIVE",      # C: (or wherever Windows lives)
+    "WINDIR",           # usually same as SYSTEMROOT
+    "COMSPEC",          # cmd.exe path — subprocess shell=True needs it
+    "PATHEXT",          # .COM;.EXE;.BAT;... — shell lookup
+    "OS",               # "Windows_NT" — some tools gate on this
+    "PROCESSOR_ARCHITECTURE",
+    "NUMBER_OF_PROCESSORS",
+    "PUBLIC",           # C:\Users\Public
+    "ALLUSERSPROFILE",  # C:\ProgramData — some stdlib paths use it
+    "PROGRAMDATA",      # C:\ProgramData
+    "PROGRAMFILES",
+    "PROGRAMFILES(X86)",
+    "PROGRAMW6432",
+    "APPDATA",          # %USERPROFILE%\AppData\Roaming — Python uses it
+    "LOCALAPPDATA",     # %USERPROFILE%\AppData\Local
+    "USERPROFILE",      # C:\Users\<name> — Python's expanduser uses it
+    "USERDOMAIN",
+    "USERNAME",
+    "HOMEDRIVE",        # C:
+    "HOMEPATH",         # \Users\<name>
+    "COMPUTERNAME",
+})
+
+
+def _scrub_child_env(source_env, is_passthrough=None, is_windows=None):
+    """Produce the scrubbed child-process env for execute_code.
+
+    Rules (order matters):
+      1. Passthrough vars (skill- or config-declared) always pass.
+      2. Secret-substring names (KEY/TOKEN/etc.) are blocked.
+      3. Names matching a safe prefix pass.
+      4. On Windows, a small OS-essential allowlist passes by exact name
+         — without these the child can't even create a socket or spawn a
+         subprocess.
+
+    Extracted into a helper so tests can exercise the logic without
+    spawning a subprocess.
+    """
+    if is_passthrough is None:
+        try:
+            from tools.env_passthrough import is_env_passthrough as _ep
+        except Exception:
+            _ep = lambda _: False  # noqa: E731
+        is_passthrough = _ep
+    if is_windows is None:
+        is_windows = _IS_WINDOWS
+
+    scrubbed = {}
+    for k, v in source_env.items():
+        if is_passthrough(k):
+            scrubbed[k] = v
+            continue
+        if any(s in k.upper() for s in _SECRET_SUBSTRINGS):
+            continue
+        if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
+            scrubbed[k] = v
+            continue
+        if is_windows and k.upper() in _WINDOWS_ESSENTIAL_ENV_VARS:
+            scrubbed[k] = v
+    return scrubbed
+

 def check_sandbox_requirements() -> bool:
    """Code execution sandbox requires a POSIX OS for Unix domain sockets."""
@ -1079,29 +1158,11 @@ def execute_code(
        # generated scripts. The child accesses tools via RPC, not direct API.
        # Exception: env vars declared by loaded skills (via env_passthrough
        # registry) or explicitly allowed by the user in config.yaml
-        # (terminal.env_passthrough) are passed through.
-        _SAFE_ENV_PREFIXES = ("PATH", "HOME", "USER", "LANG", "LC_", "TERM",
-                              "TMPDIR", "TMP", "TEMP", "SHELL", "LOGNAME",
-                              "XDG_", "PYTHONPATH", "VIRTUAL_ENV", "CONDA",
-                              "HERMES_")
-        _SECRET_SUBSTRINGS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL",
-                              "PASSWD", "AUTH")
-        try:
-            from tools.env_passthrough import is_env_passthrough as _is_passthrough
-        except Exception:
-            _is_passthrough = lambda _: False  # noqa: E731
-        child_env = {}
-        for k, v in os.environ.items():
-            # Passthrough vars (skill-declared or user-configured) always pass.
-            if _is_passthrough(k):
-                child_env[k] = v
-                continue
-            # Block vars with secret-like names.
-            if any(s in k.upper() for s in _SECRET_SUBSTRINGS):
-                continue
-            # Allow vars with known safe prefixes.
-            if any(k.startswith(p) for p in _SAFE_ENV_PREFIXES):
-                child_env[k] = v
+        # (terminal.env_passthrough) are passed through.  On Windows, a small
+        # OS-essential allowlist (SYSTEMROOT, WINDIR, COMSPEC, ...) is also
+        # passed through — without those, the child can't create a socket
+        # or spawn a subprocess.  See ``_scrub_child_env`` for the rules.
+        child_env = _scrub_child_env(os.environ)
        child_env["HERMES_RPC_SOCKET"] = rpc_endpoint
        child_env["PYTHONDONTWRITEBYTECODE"] = "1"
        # Ensure the hermes-agent root is importable in the sandbox so