mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
wip: tool result fixes -- persistence
This commit is contained in:
parent
22d1bda185
commit
65e24c942e
11 changed files with 869 additions and 235 deletions
42
tools/binary_extensions.py
Normal file
42
tools/binary_extensions.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
"""Binary file extensions to skip for text-based operations.
|
||||
|
||||
These files can't be meaningfully compared as text and are often large.
|
||||
Ported from free-code src/constants/files.ts.
|
||||
"""
|
||||
|
||||
BINARY_EXTENSIONS = frozenset({
|
||||
# Images
|
||||
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".tiff", ".tif",
|
||||
# Videos
|
||||
".mp4", ".mov", ".avi", ".mkv", ".webm", ".wmv", ".flv", ".m4v", ".mpeg", ".mpg",
|
||||
# Audio
|
||||
".mp3", ".wav", ".ogg", ".flac", ".aac", ".m4a", ".wma", ".aiff", ".opus",
|
||||
# Archives
|
||||
".zip", ".tar", ".gz", ".bz2", ".7z", ".rar", ".xz", ".z", ".tgz", ".iso",
|
||||
# Executables/binaries
|
||||
".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a", ".obj", ".lib",
|
||||
".app", ".msi", ".deb", ".rpm",
|
||||
# Documents (PDF is here; read_file excludes it at the call site)
|
||||
".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
|
||||
".odt", ".ods", ".odp",
|
||||
# Fonts
|
||||
".ttf", ".otf", ".woff", ".woff2", ".eot",
|
||||
# Bytecode / VM artifacts
|
||||
".pyc", ".pyo", ".class", ".jar", ".war", ".ear", ".node", ".wasm", ".rlib",
|
||||
# Database files
|
||||
".sqlite", ".sqlite3", ".db", ".mdb", ".idx",
|
||||
# Design / 3D
|
||||
".psd", ".ai", ".eps", ".sketch", ".fig", ".xd", ".blend", ".3ds", ".max",
|
||||
# Flash
|
||||
".swf", ".fla",
|
||||
# Lock/profiling data
|
||||
".lockb", ".dat", ".data",
|
||||
})
|
||||
|
||||
|
||||
def has_binary_extension(path: str) -> bool:
|
||||
"""Check if a file path has a binary extension. Pure string check, no I/O."""
|
||||
dot = path.rfind(".")
|
||||
if dot == -1:
|
||||
return False
|
||||
return path[dot:].lower() in BINARY_EXTENSIONS
|
||||
|
|
@ -1343,4 +1343,5 @@ registry.register(
|
|||
enabled_tools=kw.get("enabled_tools")),
|
||||
check_fn=check_sandbox_requirements,
|
||||
emoji="🐍",
|
||||
max_result_size_chars=30_000,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import logging
|
|||
import os
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from tools.binary_extensions import has_binary_extension
|
||||
from tools.file_operations import ShellFileOperations
|
||||
from agent.redact import redact_sensitive_text
|
||||
|
||||
|
|
@ -290,11 +291,24 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
|
|||
),
|
||||
})
|
||||
|
||||
# Resolve path once for all guards below
|
||||
import pathlib as _pathlib
|
||||
_resolved = _pathlib.Path(path).expanduser().resolve()
|
||||
|
||||
# ── Binary file guard ─────────────────────────────────────────
|
||||
# Block binary files by extension (no I/O).
|
||||
if has_binary_extension(str(_resolved)):
|
||||
_ext = _resolved.suffix.lower()
|
||||
return json.dumps({
|
||||
"error": (
|
||||
f"Cannot read binary file '{path}' ({_ext}). "
|
||||
"Use vision_analyze for images, or terminal to inspect binary files."
|
||||
),
|
||||
})
|
||||
|
||||
# ── Hermes internal path guard ────────────────────────────────
|
||||
# Prevent prompt injection via catalog or hub metadata files.
|
||||
import pathlib as _pathlib
|
||||
from hermes_constants import get_hermes_home as _get_hh
|
||||
_resolved = _pathlib.Path(path).expanduser().resolve()
|
||||
_hermes_home = _get_hh().resolve()
|
||||
_blocked_dirs = [
|
||||
_hermes_home / "skills" / ".hub" / "index-cache",
|
||||
|
|
@ -313,6 +327,27 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
|
|||
except ValueError:
|
||||
pass
|
||||
|
||||
# ── Pre-read file size guard ──────────────────────────────────
|
||||
# Stat the file before reading. If it's large and the model
|
||||
# didn't request a narrow range, block and tell it to use
|
||||
# offset/limit — cheaper than reading 200K chars then rejecting.
|
||||
_PRE_READ_MAX_BYTES = 100_000
|
||||
_NARROW_LIMIT = 200
|
||||
try:
|
||||
_fsize = os.path.getsize(str(_resolved))
|
||||
except OSError:
|
||||
_fsize = 0
|
||||
if _fsize > _PRE_READ_MAX_BYTES and limit > _NARROW_LIMIT:
|
||||
return json.dumps({
|
||||
"error": (
|
||||
f"File is too large to read in full ({_fsize:,} bytes). "
|
||||
f"Use offset and limit parameters to read specific sections "
|
||||
f"(e.g. offset=1, limit=100 for the first 100 lines)."
|
||||
),
|
||||
"path": path,
|
||||
"file_size": _fsize,
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# ── Dedup check ───────────────────────────────────────────────
|
||||
# If we already read this exact (path, offset, limit) and the
|
||||
# file hasn't been modified since, return a lightweight stub
|
||||
|
|
@ -726,7 +761,7 @@ def _check_file_reqs():
|
|||
|
||||
READ_FILE_SCHEMA = {
|
||||
"name": "read_file",
|
||||
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. Reads exceeding ~100K characters are rejected; use offset and limit to read specific sections of large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
|
||||
"description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. When you already know which part of the file you need, only read that part using offset and limit — this is important for larger files. Files over 100KB will be rejected unless you specify a narrow range (limit <= 200). NOTE: Cannot read images or binary files — use vision_analyze for images.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -817,7 +852,7 @@ def _handle_search_files(args, **kw):
|
|||
output_mode=args.get("output_mode", "content"), context=args.get("context", 0), task_id=tid)
|
||||
|
||||
|
||||
registry.register(name="read_file", toolset="file", schema=READ_FILE_SCHEMA, handler=_handle_read_file, check_fn=_check_file_reqs, emoji="📖")
|
||||
registry.register(name="write_file", toolset="file", schema=WRITE_FILE_SCHEMA, handler=_handle_write_file, check_fn=_check_file_reqs, emoji="✍️")
|
||||
registry.register(name="patch", toolset="file", schema=PATCH_SCHEMA, handler=_handle_patch, check_fn=_check_file_reqs, emoji="🔧")
|
||||
registry.register(name="search_files", toolset="file", schema=SEARCH_FILES_SCHEMA, handler=_handle_search_files, check_fn=_check_file_reqs, emoji="🔎")
|
||||
registry.register(name="read_file", toolset="file", schema=READ_FILE_SCHEMA, handler=_handle_read_file, check_fn=_check_file_reqs, emoji="📖", max_result_size_chars=float('inf'))
|
||||
registry.register(name="write_file", toolset="file", schema=WRITE_FILE_SCHEMA, handler=_handle_write_file, check_fn=_check_file_reqs, emoji="✍️", max_result_size_chars=100_000)
|
||||
registry.register(name="patch", toolset="file", schema=PATCH_SCHEMA, handler=_handle_patch, check_fn=_check_file_reqs, emoji="🔧", max_result_size_chars=100_000)
|
||||
registry.register(name="search_files", toolset="file", schema=SEARCH_FILES_SCHEMA, handler=_handle_search_files, check_fn=_check_file_reqs, emoji="🔎", max_result_size_chars=20_000)
|
||||
|
|
|
|||
|
|
@ -27,10 +27,12 @@ class ToolEntry:
|
|||
__slots__ = (
|
||||
"name", "toolset", "schema", "handler", "check_fn",
|
||||
"requires_env", "is_async", "description", "emoji",
|
||||
"max_result_size_chars",
|
||||
)
|
||||
|
||||
def __init__(self, name, toolset, schema, handler, check_fn,
|
||||
requires_env, is_async, description, emoji):
|
||||
requires_env, is_async, description, emoji,
|
||||
max_result_size_chars=None):
|
||||
self.name = name
|
||||
self.toolset = toolset
|
||||
self.schema = schema
|
||||
|
|
@ -40,6 +42,7 @@ class ToolEntry:
|
|||
self.is_async = is_async
|
||||
self.description = description
|
||||
self.emoji = emoji
|
||||
self.max_result_size_chars = max_result_size_chars
|
||||
|
||||
|
||||
class ToolRegistry:
|
||||
|
|
@ -64,6 +67,7 @@ class ToolRegistry:
|
|||
is_async: bool = False,
|
||||
description: str = "",
|
||||
emoji: str = "",
|
||||
max_result_size_chars: int | float | None = None,
|
||||
):
|
||||
"""Register a tool. Called at module-import time by each tool file."""
|
||||
existing = self._tools.get(name)
|
||||
|
|
@ -83,6 +87,7 @@ class ToolRegistry:
|
|||
is_async=is_async,
|
||||
description=description or schema.get("description", ""),
|
||||
emoji=emoji,
|
||||
max_result_size_chars=max_result_size_chars,
|
||||
)
|
||||
if check_fn and toolset not in self._toolset_checks:
|
||||
self._toolset_checks[toolset] = check_fn
|
||||
|
|
@ -164,6 +169,14 @@ class ToolRegistry:
|
|||
# Query helpers (replace redundant dicts in model_tools.py)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def get_max_result_size(self, name: str) -> int | float:
|
||||
"""Return per-tool max result size, or global default."""
|
||||
from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS
|
||||
entry = self._tools.get(name)
|
||||
if entry and entry.max_result_size_chars is not None:
|
||||
return entry.max_result_size_chars
|
||||
return DEFAULT_MAX_RESULT_SIZE_CHARS
|
||||
|
||||
def get_all_tool_names(self) -> List[str]:
|
||||
"""Return sorted list of all registered tool names."""
|
||||
return sorted(self._tools.keys())
|
||||
|
|
|
|||
|
|
@ -811,6 +811,12 @@ def _stop_cleanup_thread():
|
|||
pass
|
||||
|
||||
|
||||
def get_active_env(task_id: str):
|
||||
"""Return the active BaseEnvironment for *task_id*, or None."""
|
||||
with _env_lock:
|
||||
return _active_environments.get(task_id)
|
||||
|
||||
|
||||
def get_active_environments_info() -> Dict[str, Any]:
|
||||
"""Get information about currently active environments."""
|
||||
info = {
|
||||
|
|
@ -1617,4 +1623,5 @@ registry.register(
|
|||
handler=_handle_terminal,
|
||||
check_fn=check_terminal_requirements,
|
||||
emoji="💻",
|
||||
max_result_size_chars=30_000,
|
||||
)
|
||||
|
|
|
|||
223
tools/tool_result_storage.py
Normal file
223
tools/tool_result_storage.py
Normal file
|
|
@ -0,0 +1,223 @@
|
|||
"""Tool result persistence -- preserves large outputs instead of truncating.
|
||||
|
||||
Defense against context-window overflow operates at three levels:
|
||||
|
||||
1. **Per-tool output cap** (inside each tool): Tools like search_files
|
||||
pre-truncate their own output before returning. This is the first line
|
||||
of defense and the only one the tool author controls.
|
||||
|
||||
2. **Per-result persistence** (maybe_persist_tool_result): After a tool
|
||||
returns, if its output exceeds the tool's registered threshold
|
||||
(registry.get_max_result_size), the full output is written INTO THE
|
||||
SANDBOX at /tmp/hermes-results/{tool_use_id}.txt via env.execute().
|
||||
The in-context content is replaced with a preview + file path reference.
|
||||
The model can read_file to access the full output on any backend.
|
||||
|
||||
3. **Per-turn aggregate budget** (enforce_turn_budget): After all tool
|
||||
results in a single assistant turn are collected, if the total exceeds
|
||||
MAX_TURN_BUDGET_CHARS (200K), the largest non-persisted results are
|
||||
spilled to disk until the aggregate is under budget. This catches cases
|
||||
where many medium-sized results combine to overflow context.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000
|
||||
MAX_TURN_BUDGET_CHARS: int = 200_000
|
||||
PREVIEW_SIZE_CHARS: int = 2_000
|
||||
PERSISTED_OUTPUT_TAG = "<persisted-output>"
|
||||
PERSISTED_OUTPUT_CLOSING_TAG = "</persisted-output>"
|
||||
STORAGE_DIR = "/tmp/hermes-results"
|
||||
HEREDOC_MARKER = "HERMES_PERSIST_EOF"
|
||||
_BUDGET_TOOL_NAME = "__budget_enforcement__"
|
||||
|
||||
|
||||
def generate_preview(content: str, max_chars: int = PREVIEW_SIZE_CHARS) -> tuple[str, bool]:
|
||||
"""Truncate at last newline within max_chars. Returns (preview, has_more)."""
|
||||
if len(content) <= max_chars:
|
||||
return content, False
|
||||
truncated = content[:max_chars]
|
||||
last_nl = truncated.rfind("\n")
|
||||
if last_nl > max_chars // 2:
|
||||
truncated = truncated[:last_nl + 1]
|
||||
return truncated, True
|
||||
|
||||
|
||||
def _heredoc_marker(content: str) -> str:
|
||||
"""Return a heredoc delimiter that doesn't collide with content."""
|
||||
if HEREDOC_MARKER not in content:
|
||||
return HEREDOC_MARKER
|
||||
return f"HERMES_PERSIST_{uuid.uuid4().hex[:8]}"
|
||||
|
||||
|
||||
def _extract_raw_output(content: str) -> str:
|
||||
"""Extract the 'output' field from JSON tool results for cleaner persistence.
|
||||
|
||||
Tool handlers return json.dumps({"output": ..., "exit_code": ...}) for the
|
||||
API, but persisted files should contain readable text, not a JSON blob.
|
||||
"""
|
||||
try:
|
||||
data = json.loads(content)
|
||||
if isinstance(data, dict) and "output" in data:
|
||||
return data["output"]
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
return content
|
||||
|
||||
|
||||
def _write_to_sandbox(content: str, remote_path: str, env) -> bool:
|
||||
"""Write content into the sandbox via env.execute(). Returns True on success."""
|
||||
marker = _heredoc_marker(content)
|
||||
cmd = (
|
||||
f"mkdir -p {STORAGE_DIR} && cat > {remote_path} << '{marker}'\n"
|
||||
f"{content}\n"
|
||||
f"{marker}"
|
||||
)
|
||||
result = env.execute(cmd, timeout=30)
|
||||
return result.get("returncode", 1) == 0
|
||||
|
||||
|
||||
def _build_persisted_message(
|
||||
preview: str,
|
||||
has_more: bool,
|
||||
original_size: int,
|
||||
file_path: str,
|
||||
) -> str:
|
||||
"""Build the <persisted-output> replacement block."""
|
||||
size_kb = original_size / 1024
|
||||
if size_kb >= 1024:
|
||||
size_str = f"{size_kb / 1024:.1f} MB"
|
||||
else:
|
||||
size_str = f"{size_kb:.1f} KB"
|
||||
|
||||
msg = f"{PERSISTED_OUTPUT_TAG}\n"
|
||||
msg += f"This tool result was too large ({original_size:,} characters, {size_str}).\n"
|
||||
msg += f"Full output saved to: {file_path}\n"
|
||||
msg += "Use the read_file tool with offset and limit to access specific sections of this output.\n\n"
|
||||
msg += f"Preview (first {len(preview)} chars):\n"
|
||||
msg += preview
|
||||
if has_more:
|
||||
msg += "\n..."
|
||||
msg += f"\n{PERSISTED_OUTPUT_CLOSING_TAG}"
|
||||
return msg
|
||||
|
||||
|
||||
def maybe_persist_tool_result(
|
||||
content: str,
|
||||
tool_name: str,
|
||||
tool_use_id: str,
|
||||
env=None,
|
||||
threshold: int | float | None = None,
|
||||
) -> str:
|
||||
"""Layer 2: persist oversized result into the sandbox, return preview + path.
|
||||
|
||||
Writes via env.execute() so the file is accessible from any backend
|
||||
(local, Docker, SSH, Modal, Daytona). Falls back to inline truncation
|
||||
if write fails or no env is available.
|
||||
|
||||
Args:
|
||||
content: Raw tool result string.
|
||||
tool_name: Name of the tool (used for threshold lookup).
|
||||
tool_use_id: Unique ID for this tool call (used as filename).
|
||||
env: The active BaseEnvironment instance, or None.
|
||||
threshold: Override threshold; if None, looked up from registry.
|
||||
|
||||
Returns:
|
||||
Original content if small, or <persisted-output> replacement.
|
||||
"""
|
||||
if threshold is None:
|
||||
from tools.registry import registry
|
||||
threshold = registry.get_max_result_size(tool_name)
|
||||
|
||||
# Infinity means never persist (e.g. read_file)
|
||||
if threshold == float("inf"):
|
||||
return content
|
||||
|
||||
if len(content) <= threshold:
|
||||
return content
|
||||
|
||||
remote_path = f"{STORAGE_DIR}/{tool_use_id}.txt"
|
||||
# Write raw output (not JSON wrapper) so read_file returns readable text
|
||||
file_content = _extract_raw_output(content)
|
||||
preview, has_more = generate_preview(file_content)
|
||||
|
||||
# Try writing into the sandbox
|
||||
if env is not None:
|
||||
try:
|
||||
if _write_to_sandbox(file_content, remote_path, env):
|
||||
logger.info(
|
||||
"Persisted large tool result: %s (%s, %d chars -> %s)",
|
||||
tool_name, tool_use_id, len(content), remote_path,
|
||||
)
|
||||
return _build_persisted_message(preview, has_more, len(content), remote_path)
|
||||
except Exception as exc:
|
||||
logger.warning("Sandbox write failed for %s: %s", tool_use_id, exc)
|
||||
|
||||
# Fallback: inline truncation (no sandbox available or write failed)
|
||||
logger.info(
|
||||
"Inline-truncating large tool result: %s (%d chars, no sandbox write)",
|
||||
tool_name, len(content),
|
||||
)
|
||||
return (
|
||||
f"{preview}\n\n"
|
||||
f"[Truncated: tool response was {len(content):,} chars. "
|
||||
f"Full output could not be saved to sandbox.]"
|
||||
)
|
||||
|
||||
|
||||
def enforce_turn_budget(
|
||||
tool_messages: list[dict],
|
||||
env=None,
|
||||
budget: int = MAX_TURN_BUDGET_CHARS,
|
||||
) -> list[dict]:
|
||||
"""Layer 3: enforce aggregate budget across all tool results in a turn.
|
||||
|
||||
If total chars exceed budget, persist the largest non-persisted results
|
||||
first (via sandbox write) until under budget. Already-persisted results
|
||||
are skipped.
|
||||
|
||||
Mutates the list in-place and returns it.
|
||||
"""
|
||||
candidates = []
|
||||
total_size = 0
|
||||
for i, msg in enumerate(tool_messages):
|
||||
content = msg.get("content", "")
|
||||
size = len(content)
|
||||
total_size += size
|
||||
if PERSISTED_OUTPUT_TAG not in content:
|
||||
candidates.append((i, size))
|
||||
|
||||
if total_size <= budget:
|
||||
return tool_messages
|
||||
|
||||
# Sort candidates by size descending — persist largest first
|
||||
candidates.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
for idx, size in candidates:
|
||||
if total_size <= budget:
|
||||
break
|
||||
msg = tool_messages[idx]
|
||||
content = msg["content"]
|
||||
tool_use_id = msg.get("tool_call_id", f"budget_{idx}")
|
||||
|
||||
replacement = maybe_persist_tool_result(
|
||||
content=content,
|
||||
tool_name=_BUDGET_TOOL_NAME,
|
||||
tool_use_id=tool_use_id,
|
||||
env=env,
|
||||
threshold=0,
|
||||
)
|
||||
if replacement != content:
|
||||
total_size -= size
|
||||
total_size += len(replacement)
|
||||
tool_messages[idx]["content"] = replacement
|
||||
logger.info(
|
||||
"Budget enforcement: persisted tool result %s (%d chars)",
|
||||
tool_use_id, size,
|
||||
)
|
||||
|
||||
return tool_messages
|
||||
|
|
@ -2085,6 +2085,7 @@ registry.register(
|
|||
check_fn=check_web_api_key,
|
||||
requires_env=_web_requires_env(),
|
||||
emoji="🔍",
|
||||
max_result_size_chars=100_000,
|
||||
)
|
||||
registry.register(
|
||||
name="web_extract",
|
||||
|
|
@ -2096,4 +2097,5 @@ registry.register(
|
|||
requires_env=_web_requires_env(),
|
||||
is_async=True,
|
||||
emoji="📄",
|
||||
max_result_size_chars=100_000,
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue