From 08ec602770c4451f0e095ad3a288934dae6f98a6 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 9 May 2026 18:44:58 -0700
Subject: [PATCH] fix(tool-result-storage): persist via stdin to bypass 128 KB
 exec-arg cap (#22913)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linux's MAX_ARG_STRLEN caps any single argv element at 128 KB
(32 * PAGE_SIZE). The previous heredoc-in-the-command-string approach
in _write_to_sandbox put the entire tool result inside the 'bash -c'
arg, so any result over ~128 KB raised OSError [Errno 7] 'Argument
list too long' before the heredoc ever ran. The caller logged a
warning, but quiet_mode (CLI default) sets tools.* to ERROR — so the
warning never reached agent.log either, and the agent saw a 1.5 KB
preview tagged 'Full output could not be saved to sandbox'. Hits
delegate_task with 3+ subagent outputs routinely now.

Switch to passing content via env.execute(stdin_data=...). cmd is
now just 'mkdir -p X && cat > Y' (under 1 KB), and the heavyweight
payload travels through stdin where there is no argv-element limit.

E2E reproduced the user's exact 144,778-char delegate_task envelope:
old code OSError'd, new code round-trips cleanly to disk with all
three task summaries intact.
---
 tests/tools/test_tool_result_storage.py | 27 ++++++++++++++-----------
 tools/tool_result_storage.py            | 22 ++++++++++++--------
 2 files changed, 29 insertions(+), 20 deletions(-)
diff --git a/tests/tools/test_tool_result_storage.py b/tests/tools/test_tool_result_storage.py
index 3cea3b59ffa..17b6815c1d1 100644
--- a/tests/tools/test_tool_result_storage.py
+++ b/tests/tools/test_tool_result_storage.py
@@ -90,8 +90,11 @@ class TestWriteToSandbox:
         env.execute.assert_called_once()
         cmd = env.execute.call_args[0][0]
         assert "mkdir -p" in cmd
-        assert "hello world" in cmd
-        assert HEREDOC_MARKER in cmd
+        # Content travels through stdin, NOT inside the command string —
+        # otherwise large content would hit Linux's 128 KB MAX_ARG_STRLEN
+        # ceiling on `bash -c <cmd>` (#22906).
+        assert "hello world" not in cmd
+        assert env.execute.call_args[1]["stdin_data"] == "hello world"
 
     def test_failure_returns_false(self):
         env = MagicMock()
@@ -99,16 +102,16 @@ class TestWriteToSandbox:
         result = _write_to_sandbox("content", "/tmp/hermes-results/abc.txt", env)
         assert result is False
 
-    def test_heredoc_collision_uses_uuid_marker(self):
+    def test_large_content_via_stdin(self):
+        """Regression: 200 KB content exceeds Linux MAX_ARG_STRLEN (128 KB).
+        It must travel via stdin, never inside the command string."""
         env = MagicMock()
         env.execute.return_value = {"output": "", "returncode": 0}
-        content = f"text with {HEREDOC_MARKER} inside"
-        _write_to_sandbox(content, "/tmp/hermes-results/abc.txt", env)
+        big = "x" * 200_000
+        _write_to_sandbox(big, "/tmp/hermes-results/big.txt", env)
         cmd = env.execute.call_args[0][0]
-        # The default marker should NOT be used as the delimiter
-        lines = cmd.split("\n")
-        # The first and last lines contain the actual delimiter
-        assert HEREDOC_MARKER not in lines[0].split("<<")[1]
+        assert len(cmd) < 1_000  # cmd is just `mkdir -p X && cat > Y`
+        assert env.execute.call_args[1]["stdin_data"] == big
 
     def test_timeout_passed(self):
         env = MagicMock()
@@ -247,9 +250,9 @@ class TestMaybePersistToolResult:
             threshold=30_000,
         )
         assert PERSISTED_OUTPUT_TAG in result
-        # The heredoc written to sandbox should contain the full JSON blob
-        cmd = env.execute.call_args[0][0]
-        assert '"exit_code"' in cmd
+        # Content is delivered through stdin (no longer embedded in the
+        # command string — see test_large_content_via_stdin for why).
+        assert env.execute.call_args[1]["stdin_data"] == content
 
     def test_above_threshold_no_env_truncates_inline(self):
         content = "x" * 60_000
diff --git a/tools/tool_result_storage.py b/tools/tool_result_storage.py
index 43422644825..fed8621eee4 100644
--- a/tools/tool_result_storage.py
+++ b/tools/tool_result_storage.py
@@ -76,15 +76,21 @@ def _heredoc_marker(content: str) -> str:
 
 
 def _write_to_sandbox(content: str, remote_path: str, env) -> bool:
-    """Write content into the sandbox via env.execute(). Returns True on success."""
-    marker = _heredoc_marker(content)
+    """Write content into the sandbox via env.execute(). Returns True on success.
+
+    Pushes ``content`` through stdin rather than embedding it in the command
+    string. Linux's ``MAX_ARG_STRLEN`` caps any single argv element at 128 KB
+    (32 * PAGE_SIZE), so the previous heredoc-in-the-command-string approach
+    silently failed with ``OSError: [Errno 7] Argument list too long`` for any
+    tool result over ~128 KB — exactly the case persistence exists to handle.
+    Routing through stdin removes that ceiling on local + ssh (``_stdin_mode
+    == "pipe"``); remote backends with ``_stdin_mode == "heredoc"`` keep their
+    existing API-body sized limit, which is orders of magnitude larger than
+    the exec-arg ceiling.
+    """
     storage_dir = os.path.dirname(remote_path)
-    cmd = (
-        f"mkdir -p {shlex.quote(storage_dir)} && cat > {shlex.quote(remote_path)} << '{marker}'\n"
-        f"{content}\n"
-        f"{marker}"
-    )
-    result = env.execute(cmd, timeout=30)
+    cmd = f"mkdir -p {shlex.quote(storage_dir)} && cat > {shlex.quote(remote_path)}"
+    result = env.execute(cmd, timeout=30, stdin_data=content)
     return result.get("returncode", 1) == 0