"""Tests for tools/tool_result_storage.py -- 3-layer tool result persistence.""" import pytest from unittest.mock import MagicMock, patch from tools.budget_config import ( DEFAULT_RESULT_SIZE_CHARS, DEFAULT_TURN_BUDGET_CHARS, DEFAULT_PREVIEW_SIZE_CHARS, BudgetConfig, ) from tools.tool_result_storage import ( HEREDOC_MARKER, PERSISTED_OUTPUT_TAG, PERSISTED_OUTPUT_CLOSING_TAG, STORAGE_DIR, _build_persisted_message, _heredoc_marker, _write_to_sandbox, enforce_turn_budget, generate_preview, maybe_persist_tool_result, ) # ── generate_preview ────────────────────────────────────────────────── class TestGeneratePreview: def test_short_content_unchanged(self): text = "short result" preview, has_more = generate_preview(text) assert preview == text assert has_more is False def test_long_content_truncated(self): text = "x" * 5000 preview, has_more = generate_preview(text, max_chars=2000) assert len(preview) <= 2000 assert has_more is True def test_truncates_at_newline_boundary(self): # 1500 chars + newline + 600 chars (past halfway) text = "a" * 1500 + "\n" + "b" * 600 preview, has_more = generate_preview(text, max_chars=2000) assert preview == "a" * 1500 + "\n" assert has_more is True def test_ignores_early_newline(self): # Newline at position 100, well before halfway of 2000 text = "a" * 100 + "\n" + "b" * 3000 preview, has_more = generate_preview(text, max_chars=2000) assert len(preview) == 2000 assert has_more is True def test_empty_content(self): preview, has_more = generate_preview("") assert preview == "" assert has_more is False def test_exact_boundary(self): text = "x" * DEFAULT_PREVIEW_SIZE_CHARS preview, has_more = generate_preview(text) assert preview == text assert has_more is False # ── _heredoc_marker ─────────────────────────────────────────────────── class TestHeredocMarker: def test_default_marker_when_no_collision(self): assert _heredoc_marker("normal content") == HEREDOC_MARKER def test_uuid_marker_on_collision(self): content = f"some text with {HEREDOC_MARKER} embedded" marker = _heredoc_marker(content) assert marker != HEREDOC_MARKER assert marker.startswith("HERMES_PERSIST_") assert marker not in content # ── _write_to_sandbox ───────────────────────────────────────────────── class TestWriteToSandbox: def test_success(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} result = _write_to_sandbox("hello world", "/tmp/hermes-results/abc.txt", env) assert result is True env.execute.assert_called_once() cmd = env.execute.call_args[0][0] assert "mkdir -p" in cmd assert "hello world" in cmd assert HEREDOC_MARKER in cmd def test_failure_returns_false(self): env = MagicMock() env.execute.return_value = {"output": "error", "returncode": 1} result = _write_to_sandbox("content", "/tmp/hermes-results/abc.txt", env) assert result is False def test_heredoc_collision_uses_uuid_marker(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} content = f"text with {HEREDOC_MARKER} inside" _write_to_sandbox(content, "/tmp/hermes-results/abc.txt", env) cmd = env.execute.call_args[0][0] # The default marker should NOT be used as the delimiter lines = cmd.split("\n") # The first and last lines contain the actual delimiter assert HEREDOC_MARKER not in lines[0].split("<<")[1] def test_timeout_passed(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} _write_to_sandbox("content", "/tmp/hermes-results/abc.txt", env) assert env.execute.call_args[1]["timeout"] == 30 # ── _build_persisted_message ────────────────────────────────────────── class TestBuildPersistedMessage: def test_structure(self): msg = _build_persisted_message( preview="first 100 chars...", has_more=True, original_size=50_000, file_path="/tmp/hermes-results/test123.txt", ) assert msg.startswith(PERSISTED_OUTPUT_TAG) assert msg.endswith(PERSISTED_OUTPUT_CLOSING_TAG) assert "50,000 characters" in msg assert "/tmp/hermes-results/test123.txt" in msg assert "read_file" in msg assert "first 100 chars..." in msg assert "..." in msg # has_more indicator def test_no_ellipsis_when_complete(self): msg = _build_persisted_message( preview="complete content", has_more=False, original_size=16, file_path="/tmp/hermes-results/x.txt", ) # Should not have the trailing "..." indicator before closing tag lines = msg.strip().split("\n") assert lines[-2] != "..." def test_large_size_shows_mb(self): msg = _build_persisted_message( preview="x", has_more=True, original_size=2_000_000, file_path="/tmp/hermes-results/big.txt", ) assert "MB" in msg # ── maybe_persist_tool_result ───────────────────────────────────────── class TestMaybePersistToolResult: def test_below_threshold_returns_unchanged(self): content = "small result" result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_123", env=None, threshold=50_000, ) assert result == content def test_above_threshold_with_env_persists(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} content = "x" * 60_000 result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_456", env=env, threshold=30_000, ) assert PERSISTED_OUTPUT_TAG in result assert "tc_456.txt" in result assert len(result) < len(content) env.execute.assert_called_once() def test_persists_full_content_as_is(self): """Content is persisted verbatim — no JSON extraction.""" import json env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} raw = "line1\nline2\n" * 5_000 content = json.dumps({"output": raw, "exit_code": 0, "error": None}) result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_json", env=env, threshold=30_000, ) assert PERSISTED_OUTPUT_TAG in result # The heredoc written to sandbox should contain the full JSON blob cmd = env.execute.call_args[0][0] assert '"exit_code"' in cmd def test_above_threshold_no_env_truncates_inline(self): content = "x" * 60_000 result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_789", env=None, threshold=30_000, ) assert PERSISTED_OUTPUT_TAG not in result assert "Truncated" in result assert len(result) < len(content) def test_env_write_failure_falls_back_to_truncation(self): env = MagicMock() env.execute.return_value = {"output": "disk full", "returncode": 1} content = "x" * 60_000 result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_fail", env=env, threshold=30_000, ) assert PERSISTED_OUTPUT_TAG not in result assert "Truncated" in result def test_env_execute_exception_falls_back(self): env = MagicMock() env.execute.side_effect = RuntimeError("connection lost") content = "x" * 60_000 result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_exc", env=env, threshold=30_000, ) assert "Truncated" in result def test_read_file_never_persisted(self): """read_file has threshold=inf, should never be persisted.""" env = MagicMock() content = "x" * 200_000 result = maybe_persist_tool_result( content=content, tool_name="read_file", tool_use_id="tc_rf", env=env, threshold=float("inf"), ) assert result == content env.execute.assert_not_called() def test_uses_registry_threshold_when_not_provided(self): """When threshold=None, looks up from registry.""" env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} content = "x" * 60_000 mock_registry = MagicMock() mock_registry.get_max_result_size.return_value = 30_000 with patch("tools.registry.registry", mock_registry): result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_reg", env=env, threshold=None, ) # Should have persisted since 60K > 30K assert PERSISTED_OUTPUT_TAG in result or "Truncated" in result def test_unicode_content_survives(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} content = "日本語テスト " * 10_000 # ~60K chars of unicode result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_uni", env=env, threshold=30_000, ) assert PERSISTED_OUTPUT_TAG in result # Preview should contain unicode assert "日本語テスト" in result def test_empty_content_returns_unchanged(self): result = maybe_persist_tool_result( content="", tool_name="terminal", tool_use_id="tc_empty", env=None, threshold=30_000, ) assert result == "" def test_whitespace_only_below_threshold(self): content = " " * 100 result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_ws", env=None, threshold=30_000, ) assert result == content def test_file_path_uses_tool_use_id(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} content = "x" * 60_000 result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="unique_id_abc", env=env, threshold=30_000, ) assert "unique_id_abc.txt" in result def test_preview_included_in_persisted_output(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} # Create content with a distinctive start content = "DISTINCTIVE_START_MARKER" + "x" * 60_000 result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_prev", env=env, threshold=30_000, ) assert "DISTINCTIVE_START_MARKER" in result def test_threshold_zero_forces_persist(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} content = "even short content" result = maybe_persist_tool_result( content=content, tool_name="terminal", tool_use_id="tc_zero", env=env, threshold=0, ) # Any non-empty content with threshold=0 should be persisted assert PERSISTED_OUTPUT_TAG in result # ── enforce_turn_budget ─────────────────────────────────────────────── class TestEnforceTurnBudget: def test_under_budget_no_changes(self): msgs = [ {"role": "tool", "tool_call_id": "t1", "content": "small"}, {"role": "tool", "tool_call_id": "t2", "content": "also small"}, ] result = enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000)) assert result[0]["content"] == "small" assert result[1]["content"] == "also small" def test_over_budget_largest_persisted_first(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} msgs = [ {"role": "tool", "tool_call_id": "t1", "content": "a" * 80_000}, {"role": "tool", "tool_call_id": "t2", "content": "b" * 130_000}, ] # Total 210K > 200K budget enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000)) # The larger one (130K) should be persisted first assert PERSISTED_OUTPUT_TAG in msgs[1]["content"] def test_already_persisted_results_skipped(self): env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} msgs = [ {"role": "tool", "tool_call_id": "t1", "content": f"{PERSISTED_OUTPUT_TAG}\nalready persisted\n{PERSISTED_OUTPUT_CLOSING_TAG}"}, {"role": "tool", "tool_call_id": "t2", "content": "x" * 250_000}, ] enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000)) # t1 should be untouched (already persisted) assert msgs[0]["content"].startswith(PERSISTED_OUTPUT_TAG) # t2 should be persisted assert PERSISTED_OUTPUT_TAG in msgs[1]["content"] def test_medium_result_regression(self): """6 results of 42K chars each (252K total) — each under 100K default threshold but aggregate exceeds 200K budget. L3 should persist.""" env = MagicMock() env.execute.return_value = {"output": "", "returncode": 0} msgs = [ {"role": "tool", "tool_call_id": f"t{i}", "content": "x" * 42_000} for i in range(6) ] enforce_turn_budget(msgs, env=env, config=BudgetConfig(turn_budget=200_000)) # At least some results should be persisted to get under 200K persisted_count = sum( 1 for m in msgs if PERSISTED_OUTPUT_TAG in m["content"] ) assert persisted_count >= 2 # Need to shed at least ~52K def test_no_env_falls_back_to_truncation(self): msgs = [ {"role": "tool", "tool_call_id": "t1", "content": "x" * 250_000}, ] enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000)) # Should be truncated (no sandbox available) assert "Truncated" in msgs[0]["content"] or PERSISTED_OUTPUT_TAG in msgs[0]["content"] def test_returns_same_list(self): msgs = [{"role": "tool", "tool_call_id": "t1", "content": "ok"}] result = enforce_turn_budget(msgs, env=None, config=BudgetConfig(turn_budget=200_000)) assert result is msgs def test_empty_messages(self): result = enforce_turn_budget([], env=None, config=BudgetConfig(turn_budget=200_000)) assert result == [] # ── Per-tool threshold integration ──────────────────────────────────── class TestPerToolThresholds: """Verify registry wiring for per-tool thresholds.""" def test_registry_has_get_max_result_size(self): from tools.registry import registry assert hasattr(registry, "get_max_result_size") def test_default_threshold(self): from tools.registry import registry # Unknown tool should return the default val = registry.get_max_result_size("nonexistent_tool_xyz") assert val == DEFAULT_RESULT_SIZE_CHARS def test_terminal_threshold(self): from tools.registry import registry # Trigger import of terminal_tool to register the tool try: import tools.terminal_tool # noqa: F401 val = registry.get_max_result_size("terminal") assert val == 100_000 except ImportError: pytest.skip("terminal_tool not importable in test env") def test_read_file_never_persisted(self): from tools.registry import registry try: import tools.file_tools # noqa: F401 val = registry.get_max_result_size("read_file") assert val == float("inf") except ImportError: pytest.skip("file_tools not importable in test env") def test_search_files_threshold(self): from tools.registry import registry try: import tools.file_tools # noqa: F401 val = registry.get_max_result_size("search_files") assert val == 100_000 except ImportError: pytest.skip("file_tools not importable in test env")