diff --git a/environments/agent_loop.py b/environments/agent_loop.py index cbf9c7742..891ce42f4 100644 --- a/environments/agent_loop.py +++ b/environments/agent_loop.py @@ -455,17 +455,13 @@ class HermesAgentLoop: pass tc_id = tc.get("id", "") if isinstance(tc, dict) else tc.id - try: - tool_result = maybe_persist_tool_result( - content=tool_result, - tool_name=tool_name, - tool_use_id=tc_id, - env=get_active_env(self.task_id), - threshold=self.budget_config.resolve_threshold(tool_name), - preview_size=self.budget_config.preview_size, - ) - except Exception: - pass # Persistence is best-effort in eval path + tool_result = maybe_persist_tool_result( + content=tool_result, + tool_name=tool_name, + tool_use_id=tc_id, + env=get_active_env(self.task_id), + config=self.budget_config, + ) messages.append( { @@ -475,17 +471,13 @@ class HermesAgentLoop: } ) - try: - num_tcs = len(assistant_msg.tool_calls) - if num_tcs > 0: - enforce_turn_budget( - messages[-num_tcs:], - env=get_active_env(self.task_id), - budget=self.budget_config.turn_budget, - preview_size=self.budget_config.preview_size, - ) - except Exception: - pass + num_tcs = len(assistant_msg.tool_calls) + if num_tcs > 0: + enforce_turn_budget( + messages[-num_tcs:], + env=get_active_env(self.task_id), + config=self.budget_config, + ) turn_elapsed = _time.monotonic() - turn_start logger.info( diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 7f6ab4c30..104881a03 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -1011,10 +1011,9 @@ class TestExecuteToolCalls: big_result = "x" * 150_000 with patch("run_agent.handle_function_call", return_value=big_result): agent._execute_tool_calls(mock_msg, messages, "task-1") - # Content should be replaced with preview + file path + # Content should be replaced with persisted-output or truncation assert len(messages[0]["content"]) < 150_000 - assert "Large tool response" in messages[0]["content"] - assert "Full output saved to:" in messages[0]["content"] + assert ("Truncated" in messages[0]["content"] or "" in messages[0]["content"]) class TestConcurrentToolExecution: @@ -1249,8 +1248,7 @@ class TestConcurrentToolExecution: assert len(messages) == 2 for m in messages: assert len(m["content"]) < 150_000 - assert "Large tool response" in m["content"] - assert "Full output saved to:" in m["content"] + assert ("Truncated" in m["content"] or "" in m["content"]) def test_invoke_tool_dispatches_to_handle_function_call(self, agent): """_invoke_tool should route regular tools through handle_function_call.""" diff --git a/tests/tools/test_tool_result_storage.py b/tests/tools/test_tool_result_storage.py index 7c757027a..96b904a57 100644 --- a/tests/tools/test_tool_result_storage.py +++ b/tests/tools/test_tool_result_storage.py @@ -3,16 +3,18 @@ import pytest from unittest.mock import MagicMock, patch +from tools.budget_config import ( + DEFAULT_RESULT_SIZE_CHARS, + DEFAULT_TURN_BUDGET_CHARS, + DEFAULT_PREVIEW_SIZE_CHARS, + BudgetConfig, +) from tools.tool_result_storage import ( - DEFAULT_MAX_RESULT_SIZE_CHARS, HEREDOC_MARKER, - MAX_TURN_BUDGET_CHARS, PERSISTED_OUTPUT_TAG, PERSISTED_OUTPUT_CLOSING_TAG, - PREVIEW_SIZE_CHARS, STORAGE_DIR, _build_persisted_message, - _extract_raw_output, _heredoc_marker, _write_to_sandbox, enforce_turn_budget, @@ -56,35 +58,12 @@ class TestGeneratePreview: assert has_more is False def test_exact_boundary(self): - text = "x" * PREVIEW_SIZE_CHARS + text = "x" * DEFAULT_PREVIEW_SIZE_CHARS preview, has_more = generate_preview(text) assert preview == text assert has_more is False -# ── _extract_raw_output ──────────────────────────────────────────────── - -class TestExtractRawOutput: - def test_extracts_output_from_terminal_json(self): - import json - content = json.dumps({"output": "hello world\nline2", "exit_code": 0, "error": None}) - assert _extract_raw_output(content) == "hello world\nline2" - - def test_passes_through_non_json(self): - assert _extract_raw_output("plain text output") == "plain text output" - - def test_passes_through_json_without_output_key(self): - import json - content = json.dumps({"result": "something", "status": "ok"}) - assert _extract_raw_output(content) == content - - def test_extracts_large_output(self): - import json - big = "x\n" * 30_000 - content = json.dumps({"output": big, "exit_code": 0, "error": None}) - assert _extract_raw_output(content) == big - - # ── _heredoc_marker ─────────────────────────────────────────────────── class TestHeredocMarker: @@ -206,8 +185,8 @@ class TestMaybePersistToolResult: assert len(result) < len(content) env.execute.assert_called_once() - def test_persists_raw_output_not_json_wrapper(self): - """When content is JSON with 'output' key, file should contain raw output.""" + def test_persists_full_content_as_is(self): + """Content is persisted verbatim — no JSON extraction.""" import json env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} @@ -221,10 +200,9 @@ class TestMaybePersistToolResult: threshold=30_000, ) assert PERSISTED_OUTPUT_TAG in result - # The heredoc written to sandbox should contain raw text, not JSON + # The heredoc written to sandbox should contain the full JSON blob cmd = env.execute.call_args[0][0] - assert "line1\nline2\n" in cmd - assert '"exit_code"' not in cmd + assert '"exit_code"' in cmd def test_above_threshold_no_env_truncates_inline(self): content = "x" * 60_000 @@ -386,7 +364,7 @@ class TestEnforceTurnBudget: {"role": "tool", "tool_call_id": "t1", "content": "small"}, {"role": "tool", "tool_call_id": "t2", "content": "also small"}, ] - result = enforce_turn_budget(msgs, env=None, budget=200_000) + result = enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000)) assert result[0]["content"] == "small" assert result[1]["content"] == "also small" @@ -398,7 +376,7 @@ class TestEnforceTurnBudget: {"role": "tool", "tool_call_id": "t2", "content": "b" * 130_000}, ] # Total 210K > 200K budget - enforce_turn_budget(msgs, env=env, budget=200_000) + enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000)) # The larger one (130K) should be persisted first assert PERSISTED_OUTPUT_TAG in msgs[1]["content"] @@ -410,7 +388,7 @@ class TestEnforceTurnBudget: "content": f"{PERSISTED_OUTPUT_TAG}\nalready persisted\n{PERSISTED_OUTPUT_CLOSING_TAG}"}, {"role": "tool", "tool_call_id": "t2", "content": "x" * 250_000}, ] - enforce_turn_budget(msgs, env=env, budget=200_000) + enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000)) # t1 should be untouched (already persisted) assert msgs[0]["content"].startswith(PERSISTED_OUTPUT_TAG) # t2 should be persisted @@ -425,7 +403,7 @@ class TestEnforceTurnBudget: {"role": "tool", "tool_call_id": f"t{i}", "content": "x" * 42_000} for i in range(6) ] - enforce_turn_budget(msgs, env=env, budget=200_000) + enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000)) # At least some results should be persisted to get under 200K persisted_count = sum( 1 for m in msgs if PERSISTED_OUTPUT_TAG in m["content"] @@ -436,17 +414,17 @@ class TestEnforceTurnBudget: msgs = [ {"role": "tool", "tool_call_id": "t1", "content": "x" * 250_000}, ] - enforce_turn_budget(msgs, env=None, budget=200_000) + enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000)) # Should be truncated (no sandbox available) assert "Truncated" in msgs[0]["content"] or PERSISTED_OUTPUT_TAG in msgs[0]["content"] def test_returns_same_list(self): msgs = [{"role": "tool", "tool_call_id": "t1", "content": "ok"}] - result = enforce_turn_budget(msgs, env=None, budget=200_000) + result = enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000)) assert result is msgs def test_empty_messages(self): - result = enforce_turn_budget([], env=None, budget=200_000) + result = enforce_turn_budget([], env=None, config=BudgetConfig(turn_budget=200_000)) assert result == [] @@ -463,7 +441,7 @@ class TestPerToolThresholds: from tools.registry import registry # Unknown tool should return the default val = registry.get_max_result_size("nonexistent_tool_xyz") - assert val == DEFAULT_MAX_RESULT_SIZE_CHARS + assert val == DEFAULT_RESULT_SIZE_CHARS def test_terminal_threshold(self): from tools.registry import registry diff --git a/tools/binary_extensions.py b/tools/binary_extensions.py index f7e63bdad..bd4bb8d1d 100644 --- a/tools/binary_extensions.py +++ b/tools/binary_extensions.py @@ -16,8 +16,8 @@ BINARY_EXTENSIONS = frozenset({ # Executables/binaries ".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a", ".obj", ".lib", ".app", ".msi", ".deb", ".rpm", - # Documents (PDF is here; read_file excludes it at the call site) - ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", + # Documents (exclude .pdf — text-based, agents may want to inspect) + ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", # Fonts ".ttf", ".otf", ".woff", ".woff2", ".eot", diff --git a/tools/file_operations.py b/tools/file_operations.py index 052f77a80..f2b37505f 100644 --- a/tools/file_operations.py +++ b/tools/file_operations.py @@ -33,6 +33,7 @@ from dataclasses import dataclass, field from typing import Optional, List, Dict, Any from pathlib import Path from hermes_constants import get_hermes_home +from tools.binary_extensions import BINARY_EXTENSIONS # --------------------------------------------------------------------------- @@ -280,26 +281,6 @@ class FileOperations(ABC): # Shell-based Implementation # ============================================================================= -# Binary file extensions (fast path check) -BINARY_EXTENSIONS = { - # Images - '.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.ico', '.tiff', '.tif', - '.svg', # SVG is text but often treated as binary - # Audio/Video - '.mp3', '.mp4', '.wav', '.avi', '.mov', '.mkv', '.flac', '.ogg', '.webm', - # Archives - '.zip', '.tar', '.gz', '.bz2', '.xz', '.7z', '.rar', - # Documents - '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', - # Compiled/Binary - '.exe', '.dll', '.so', '.dylib', '.o', '.a', '.pyc', '.pyo', '.class', - '.wasm', '.bin', - # Fonts - '.ttf', '.otf', '.woff', '.woff2', '.eot', - # Other - '.db', '.sqlite', '.sqlite3', -} - # Image extensions (subset of binary that we can return as base64) IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.ico'} diff --git a/tools/file_tools.py b/tools/file_tools.py index 265c9ed2e..4ca10b2dc 100644 --- a/tools/file_tools.py +++ b/tools/file_tools.py @@ -26,6 +26,8 @@ _EXPECTED_WRITE_ERRNOS = {errno.EACCES, errno.EPERM, errno.EROFS} # Configurable via config.yaml: file_read_max_chars: 200000 # --------------------------------------------------------------------------- _DEFAULT_MAX_READ_CHARS = 100_000 +_PRE_READ_MAX_BYTES = 256_000 # reject full-file reads on files larger than this +_DEFAULT_READ_LIMIT = 500 _max_read_chars_cached: int | None = None @@ -277,7 +279,7 @@ def clear_file_ops_cache(task_id: str = None): _file_ops_cache.clear() -def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str = "default") -> str: +def read_file_tool(path: str, offset: int = 1, limit: int | None = None, task_id: str = "default") -> str: """Read a file with pagination and line numbers.""" try: # ── Device path guard ───────────────────────────────────────── @@ -291,9 +293,7 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str = ), }) - # Resolve path once for all guards below - import pathlib as _pathlib - _resolved = _pathlib.Path(path).expanduser().resolve() + _resolved = Path(path).expanduser().resolve() # ── Binary file guard ───────────────────────────────────────── # Block binary files by extension (no I/O). @@ -328,25 +328,26 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str = pass # ── Pre-read file size guard ────────────────────────────────── - # Stat the file before reading. If it's large and the model - # didn't request a narrow range, block and tell it to use - # offset/limit — cheaper than reading 200K chars then rejecting. - _PRE_READ_MAX_BYTES = 100_000 - _NARROW_LIMIT = 200 - try: - _fsize = os.path.getsize(str(_resolved)) - except OSError: - _fsize = 0 - if _fsize > _PRE_READ_MAX_BYTES and limit > _NARROW_LIMIT: - return json.dumps({ - "error": ( - f"File is too large to read in full ({_fsize:,} bytes). " - f"Use offset and limit parameters to read specific sections " - f"(e.g. offset=1, limit=100 for the first 100 lines)." - ), - "path": path, - "file_size": _fsize, - }, ensure_ascii=False) + # Guard only when the caller omits limit; an explicit limit means + # the caller knows what slice it wants. + if limit is None: + try: + _fsize = os.path.getsize(str(_resolved)) + except OSError: + _fsize = 0 + if _fsize > _PRE_READ_MAX_BYTES: + return json.dumps({ + "error": ( + f"File is too large to read in full ({_fsize:,} bytes). " + f"Use offset and limit parameters to read specific sections " + f"(e.g. offset=1, limit=100 for the first 100 lines)." + ), + "path": path, + "file_size": _fsize, + }, ensure_ascii=False) + + if limit is None: + limit = _DEFAULT_READ_LIMIT # ── Dedup check ─────────────────────────────────────────────── # If we already read this exact (path, offset, limit) and the @@ -761,7 +762,7 @@ def _check_file_reqs(): READ_FILE_SCHEMA = { "name": "read_file", - "description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. When you already know which part of the file you need, only read that part using offset and limit — this is important for larger files. Files over 100KB will be rejected unless you specify a narrow range (limit <= 200). NOTE: Cannot read images or binary files — use vision_analyze for images.", + "description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. When you already know which part of the file you need, only read that part using offset and limit — this is important for larger files. Files over 256KB will be rejected unless you provide a limit parameter. NOTE: Cannot read images or binary files — use vision_analyze for images.", "parameters": { "type": "object", "properties": { @@ -825,7 +826,7 @@ SEARCH_FILES_SCHEMA = { def _handle_read_file(args, **kw): tid = kw.get("task_id") or "default" - return read_file_tool(path=args.get("path", ""), offset=args.get("offset", 1), limit=args.get("limit", 500), task_id=tid) + return read_file_tool(path=args.get("path", ""), offset=args.get("offset", 1), limit=args.get("limit"), task_id=tid) def _handle_write_file(args, **kw): diff --git a/tools/registry.py b/tools/registry.py index 9437a6b41..d3590a42c 100644 --- a/tools/registry.py +++ b/tools/registry.py @@ -176,8 +176,8 @@ class ToolRegistry: return entry.max_result_size_chars if default is not None: return default - from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS - return DEFAULT_MAX_RESULT_SIZE_CHARS + from tools.budget_config import DEFAULT_RESULT_SIZE_CHARS + return DEFAULT_RESULT_SIZE_CHARS def get_all_tool_names(self) -> List[str]: """Return sorted list of all registered tool names.""" diff --git a/tools/tool_result_storage.py b/tools/tool_result_storage.py index 8b2abb918..076d37ae0 100644 --- a/tools/tool_result_storage.py +++ b/tools/tool_result_storage.py @@ -20,14 +20,13 @@ Defense against context-window overflow operates at three levels: where many medium-sized results combine to overflow context. """ -import json import logging import uuid from tools.budget_config import ( - DEFAULT_RESULT_SIZE_CHARS as DEFAULT_MAX_RESULT_SIZE_CHARS, - DEFAULT_TURN_BUDGET_CHARS as MAX_TURN_BUDGET_CHARS, - DEFAULT_PREVIEW_SIZE_CHARS as PREVIEW_SIZE_CHARS, + DEFAULT_PREVIEW_SIZE_CHARS, + BudgetConfig, + DEFAULT_BUDGET, ) logger = logging.getLogger(__name__) @@ -38,7 +37,7 @@ HEREDOC_MARKER = "HERMES_PERSIST_EOF" _BUDGET_TOOL_NAME = "__budget_enforcement__" -def generate_preview(content: str, max_chars: int = PREVIEW_SIZE_CHARS) -> tuple[str, bool]: +def generate_preview(content: str, max_chars: int = DEFAULT_PREVIEW_SIZE_CHARS) -> tuple[str, bool]: """Truncate at last newline within max_chars. Returns (preview, has_more).""" if len(content) <= max_chars: return content, False @@ -56,21 +55,6 @@ def _heredoc_marker(content: str) -> str: return f"HERMES_PERSIST_{uuid.uuid4().hex[:8]}" -def _extract_raw_output(content: str) -> str: - """Extract the 'output' field from JSON tool results for cleaner persistence. - - Tool handlers return json.dumps({"output": ..., "exit_code": ...}) for the - API, but persisted files should contain readable text, not a JSON blob. - """ - try: - data = json.loads(content) - if isinstance(data, dict) and "output" in data: - return data["output"] - except (json.JSONDecodeError, TypeError): - pass - return content - - def _write_to_sandbox(content: str, remote_path: str, env) -> bool: """Write content into the sandbox via env.execute(). Returns True on success.""" marker = _heredoc_marker(content) @@ -113,8 +97,8 @@ def maybe_persist_tool_result( tool_name: str, tool_use_id: str, env=None, + config: BudgetConfig = DEFAULT_BUDGET, threshold: int | float | None = None, - preview_size: int = PREVIEW_SIZE_CHARS, ) -> str: """Layer 2: persist oversized result into the sandbox, return preview + path. @@ -127,32 +111,26 @@ def maybe_persist_tool_result( tool_name: Name of the tool (used for threshold lookup). tool_use_id: Unique ID for this tool call (used as filename). env: The active BaseEnvironment instance, or None. - threshold: Override threshold; if None, looked up from registry. - preview_size: Max chars for the inline preview after persistence. + config: BudgetConfig controlling thresholds and preview size. + threshold: Explicit override; takes precedence over config resolution. Returns: Original content if small, or replacement. """ - if threshold is None: - from tools.registry import registry - threshold = registry.get_max_result_size(tool_name) + effective_threshold = threshold if threshold is not None else config.resolve_threshold(tool_name) - # Infinity means never persist (e.g. read_file) - if threshold == float("inf"): + if effective_threshold == float("inf"): return content - if len(content) <= threshold: + if len(content) <= effective_threshold: return content remote_path = f"{STORAGE_DIR}/{tool_use_id}.txt" - # Write raw output (not JSON wrapper) so read_file returns readable text - file_content = _extract_raw_output(content) - preview, has_more = generate_preview(file_content, max_chars=preview_size) + preview, has_more = generate_preview(content, max_chars=config.preview_size) - # Try writing into the sandbox if env is not None: try: - if _write_to_sandbox(file_content, remote_path, env): + if _write_to_sandbox(content, remote_path, env): logger.info( "Persisted large tool result: %s (%s, %d chars -> %s)", tool_name, tool_use_id, len(content), remote_path, @@ -161,7 +139,6 @@ def maybe_persist_tool_result( except Exception as exc: logger.warning("Sandbox write failed for %s: %s", tool_use_id, exc) - # Fallback: inline truncation (no sandbox available or write failed) logger.info( "Inline-truncating large tool result: %s (%d chars, no sandbox write)", tool_name, len(content), @@ -176,8 +153,7 @@ def maybe_persist_tool_result( def enforce_turn_budget( tool_messages: list[dict], env=None, - budget: int = MAX_TURN_BUDGET_CHARS, - preview_size: int = PREVIEW_SIZE_CHARS, + config: BudgetConfig = DEFAULT_BUDGET, ) -> list[dict]: """Layer 3: enforce aggregate budget across all tool results in a turn. @@ -196,14 +172,13 @@ def enforce_turn_budget( if PERSISTED_OUTPUT_TAG not in content: candidates.append((i, size)) - if total_size <= budget: + if total_size <= config.turn_budget: return tool_messages - # Sort candidates by size descending — persist largest first candidates.sort(key=lambda x: x[1], reverse=True) for idx, size in candidates: - if total_size <= budget: + if total_size <= config.turn_budget: break msg = tool_messages[idx] content = msg["content"] @@ -214,8 +189,8 @@ def enforce_turn_budget( tool_name=_BUDGET_TOOL_NAME, tool_use_id=tool_use_id, env=env, + config=config, threshold=0, - preview_size=preview_size, ) if replacement != content: total_size -= size