wip: tool result fixes -- persistence

2026-04-25 00:51:20 +00:00 · 2026-04-07 22:21:27 -07:00 · 2026-04-07 22:21:27 -07:00 · 65e24c942e
commit 65e24c942e
parent 22d1bda185
11 changed files with 869 additions and 235 deletions
--- a/tools/tool_result_storage.py
+++ b/tools/tool_result_storage.py
@ -0,0 +1,223 @@
+"""Tool result persistence -- preserves large outputs instead of truncating.
+
+Defense against context-window overflow operates at three levels:
+
+1. **Per-tool output cap** (inside each tool): Tools like search_files
+   pre-truncate their own output before returning. This is the first line
+   of defense and the only one the tool author controls.
+
+2. **Per-result persistence** (maybe_persist_tool_result): After a tool
+   returns, if its output exceeds the tool's registered threshold
+   (registry.get_max_result_size), the full output is written INTO THE
+   SANDBOX at /tmp/hermes-results/{tool_use_id}.txt via env.execute().
+   The in-context content is replaced with a preview + file path reference.
+   The model can read_file to access the full output on any backend.
+
+3. **Per-turn aggregate budget** (enforce_turn_budget): After all tool
+   results in a single assistant turn are collected, if the total exceeds
+   MAX_TURN_BUDGET_CHARS (200K), the largest non-persisted results are
+   spilled to disk until the aggregate is under budget. This catches cases
+   where many medium-sized results combine to overflow context.
+"""
+
+import json
+import logging
+import uuid
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000
+MAX_TURN_BUDGET_CHARS: int = 200_000
+PREVIEW_SIZE_CHARS: int = 2_000
+PERSISTED_OUTPUT_TAG = "<persisted-output>"
+PERSISTED_OUTPUT_CLOSING_TAG = "</persisted-output>"
+STORAGE_DIR = "/tmp/hermes-results"
+HEREDOC_MARKER = "HERMES_PERSIST_EOF"
+_BUDGET_TOOL_NAME = "__budget_enforcement__"
+
+
+def generate_preview(content: str, max_chars: int = PREVIEW_SIZE_CHARS) -> tuple[str, bool]:
+    """Truncate at last newline within max_chars. Returns (preview, has_more)."""
+    if len(content) <= max_chars:
+        return content, False
+    truncated = content[:max_chars]
+    last_nl = truncated.rfind("\n")
+    if last_nl > max_chars // 2:
+        truncated = truncated[:last_nl + 1]
+    return truncated, True
+
+
+def _heredoc_marker(content: str) -> str:
+    """Return a heredoc delimiter that doesn't collide with content."""
+    if HEREDOC_MARKER not in content:
+        return HEREDOC_MARKER
+    return f"HERMES_PERSIST_{uuid.uuid4().hex[:8]}"
+
+
+def _extract_raw_output(content: str) -> str:
+    """Extract the 'output' field from JSON tool results for cleaner persistence.
+
+    Tool handlers return json.dumps({"output": ..., "exit_code": ...}) for the
+    API, but persisted files should contain readable text, not a JSON blob.
+    """
+    try:
+        data = json.loads(content)
+        if isinstance(data, dict) and "output" in data:
+            return data["output"]
+    except (json.JSONDecodeError, TypeError):
+        pass
+    return content
+
+
+def _write_to_sandbox(content: str, remote_path: str, env) -> bool:
+    """Write content into the sandbox via env.execute(). Returns True on success."""
+    marker = _heredoc_marker(content)
+    cmd = (
+        f"mkdir -p {STORAGE_DIR} && cat > {remote_path} << '{marker}'\n"
+        f"{content}\n"
+        f"{marker}"
+    )
+    result = env.execute(cmd, timeout=30)
+    return result.get("returncode", 1) == 0
+
+
+def _build_persisted_message(
+    preview: str,
+    has_more: bool,
+    original_size: int,
+    file_path: str,
+) -> str:
+    """Build the <persisted-output> replacement block."""
+    size_kb = original_size / 1024
+    if size_kb >= 1024:
+        size_str = f"{size_kb / 1024:.1f} MB"
+    else:
+        size_str = f"{size_kb:.1f} KB"
+
+    msg = f"{PERSISTED_OUTPUT_TAG}\n"
+    msg += f"This tool result was too large ({original_size:,} characters, {size_str}).\n"
+    msg += f"Full output saved to: {file_path}\n"
+    msg += "Use the read_file tool with offset and limit to access specific sections of this output.\n\n"
+    msg += f"Preview (first {len(preview)} chars):\n"
+    msg += preview
+    if has_more:
+        msg += "\n..."
+    msg += f"\n{PERSISTED_OUTPUT_CLOSING_TAG}"
+    return msg
+
+
+def maybe_persist_tool_result(
+    content: str,
+    tool_name: str,
+    tool_use_id: str,
+    env=None,
+    threshold: int | float | None = None,
+) -> str:
+    """Layer 2: persist oversized result into the sandbox, return preview + path.
+
+    Writes via env.execute() so the file is accessible from any backend
+    (local, Docker, SSH, Modal, Daytona). Falls back to inline truncation
+    if write fails or no env is available.
+
+    Args:
+        content: Raw tool result string.
+        tool_name: Name of the tool (used for threshold lookup).
+        tool_use_id: Unique ID for this tool call (used as filename).
+        env: The active BaseEnvironment instance, or None.
+        threshold: Override threshold; if None, looked up from registry.
+
+    Returns:
+        Original content if small, or <persisted-output> replacement.
+    """
+    if threshold is None:
+        from tools.registry import registry
+        threshold = registry.get_max_result_size(tool_name)
+
+    # Infinity means never persist (e.g. read_file)
+    if threshold == float("inf"):
+        return content
+
+    if len(content) <= threshold:
+        return content
+
+    remote_path = f"{STORAGE_DIR}/{tool_use_id}.txt"
+    # Write raw output (not JSON wrapper) so read_file returns readable text
+    file_content = _extract_raw_output(content)
+    preview, has_more = generate_preview(file_content)
+
+    # Try writing into the sandbox
+    if env is not None:
+        try:
+            if _write_to_sandbox(file_content, remote_path, env):
+                logger.info(
+                    "Persisted large tool result: %s (%s, %d chars -> %s)",
+                    tool_name, tool_use_id, len(content), remote_path,
+                )
+                return _build_persisted_message(preview, has_more, len(content), remote_path)
+        except Exception as exc:
+            logger.warning("Sandbox write failed for %s: %s", tool_use_id, exc)
+
+    # Fallback: inline truncation (no sandbox available or write failed)
+    logger.info(
+        "Inline-truncating large tool result: %s (%d chars, no sandbox write)",
+        tool_name, len(content),
+    )
+    return (
+        f"{preview}\n\n"
+        f"[Truncated: tool response was {len(content):,} chars. "
+        f"Full output could not be saved to sandbox.]"
+    )
+
+
+def enforce_turn_budget(
+    tool_messages: list[dict],
+    env=None,
+    budget: int = MAX_TURN_BUDGET_CHARS,
+) -> list[dict]:
+    """Layer 3: enforce aggregate budget across all tool results in a turn.
+
+    If total chars exceed budget, persist the largest non-persisted results
+    first (via sandbox write) until under budget. Already-persisted results
+    are skipped.
+
+    Mutates the list in-place and returns it.
+    """
+    candidates = []
+    total_size = 0
+    for i, msg in enumerate(tool_messages):
+        content = msg.get("content", "")
+        size = len(content)
+        total_size += size
+        if PERSISTED_OUTPUT_TAG not in content:
+            candidates.append((i, size))
+
+    if total_size <= budget:
+        return tool_messages
+
+    # Sort candidates by size descending — persist largest first
+    candidates.sort(key=lambda x: x[1], reverse=True)
+
+    for idx, size in candidates:
+        if total_size <= budget:
+            break
+        msg = tool_messages[idx]
+        content = msg["content"]
+        tool_use_id = msg.get("tool_call_id", f"budget_{idx}")
+
+        replacement = maybe_persist_tool_result(
+            content=content,
+            tool_name=_BUDGET_TOOL_NAME,
+            tool_use_id=tool_use_id,
+            env=env,
+            threshold=0,
+        )
+        if replacement != content:
+            total_size -= size
+            total_size += len(replacement)
+            tool_messages[idx]["content"] = replacement
+            logger.info(
+                "Budget enforcement: persisted tool result %s (%d chars)",
+                tool_use_id, size,
+            )
+
+    return tool_messages