mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-27 01:11:40 +00:00
- Introduced new skills for editing and creating PPTX presentations, including a detailed guide on template-based workflows and script usage. - Added scripts for slide management, cleaning, and packing PPTX files, enhancing the overall functionality for users. - Included a LICENSE file to clarify usage rights and restrictions. - Created a SKILL.md file to provide an overview and quick reference for PPTX-related tasks. - Documented various formatting rules, common pitfalls, and design ideas to improve presentation quality.
286 lines
9.4 KiB
Python
286 lines
9.4 KiB
Python
"""Remove unreferenced files from an unpacked PPTX directory.
|
|
|
|
Usage: python clean.py <unpacked_dir>
|
|
|
|
Example:
|
|
python clean.py unpacked/
|
|
|
|
This script removes:
|
|
- Orphaned slides (not in sldIdLst) and their relationships
|
|
- [trash] directory (unreferenced files)
|
|
- Orphaned .rels files for deleted resources
|
|
- Unreferenced media, embeddings, charts, diagrams, drawings, ink files
|
|
- Unreferenced theme files
|
|
- Unreferenced notes slides
|
|
- Content-Type overrides for deleted files
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import defusedxml.minidom
|
|
|
|
|
|
import re
|
|
|
|
|
|
def get_slides_in_sldidlst(unpacked_dir: Path) -> set[str]:
    """Return the file names of the slides listed in ``<p:sldIdLst>``.

    The relationship ids found on ``<p:sldId>`` elements of
    ``ppt/presentation.xml`` are mapped to slide file names through
    ``ppt/_rels/presentation.xml.rels``.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Bare slide file names (e.g. ``"slide1.xml"``). The set is empty when
        either ``presentation.xml`` or its ``.rels`` file does not exist.
    """
    pres_path = unpacked_dir / "ppt" / "presentation.xml"
    pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels"

    if not pres_path.exists() or not pres_rels_path.exists():
        return set()

    # Relationship targets are usually relative to ppt/ ("slides/slide1.xml"),
    # but a package-absolute form ("/ppt/slides/slide1.xml") is also legal.
    # Missing the latter would make a live slide look orphaned.
    slide_prefixes = ("slides/", "/ppt/slides/")

    rels_dom = defusedxml.minidom.parse(str(pres_rels_path))
    rid_to_slide = {}
    for rel in rels_dom.getElementsByTagName("Relationship"):
        if "slide" not in rel.getAttribute("Type"):
            continue
        target = rel.getAttribute("Target")
        for prefix in slide_prefixes:
            if target.startswith(prefix):
                # Strip only the leading directory, not every occurrence.
                rid_to_slide[rel.getAttribute("Id")] = target[len(prefix):]
                break

    pres_content = pres_path.read_text(encoding="utf-8")
    # "\s" after "sldId" keeps "<p:sldIdLst>" from matching; the back-reference
    # accepts both single- and double-quoted attribute values.
    referenced_rids = {
        match.group(2)
        for match in re.finditer(
            r'<p:sldId\s[^>]*?r:id\s*=\s*(["\'])([^"\'>]+)\1', pres_content
        )
    }

    return {rid_to_slide[rid] for rid in referenced_rids if rid in rid_to_slide}
|
|
|
|
|
|
def remove_orphaned_slides(unpacked_dir: Path) -> list[str]:
    """Delete slide parts that are not listed in the presentation's slide list.

    For every ``ppt/slides/slide*.xml`` not returned by
    ``get_slides_in_sldidlst`` the slide file and its ``_rels`` companion are
    deleted. Afterwards the relationships in ``presentation.xml.rels`` that
    point at unlisted slides are dropped and the file is rewritten.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of every file that was deleted.
        Empty when there is no slides directory, or when ``presentation.xml``
        or its ``.rels`` file is missing.
    """
    slides_dir = unpacked_dir / "ppt" / "slides"
    slides_rels_dir = slides_dir / "_rels"
    pres_path = unpacked_dir / "ppt" / "presentation.xml"
    pres_rels_path = unpacked_dir / "ppt" / "_rels" / "presentation.xml.rels"

    if not slides_dir.exists():
        return []

    # Without both presentation parts the slide list cannot be determined
    # (the lookup would come back empty). Treating that as "no slide is
    # referenced" would wipe every slide, so leave the package untouched.
    if not pres_path.exists() or not pres_rels_path.exists():
        return []

    referenced_slides = get_slides_in_sldidlst(unpacked_dir)
    removed = []

    # Materialise the listing first so files are not deleted from a directory
    # that is still being iterated.
    for slide_file in sorted(slides_dir.glob("slide*.xml")):
        if slide_file.name in referenced_slides:
            continue

        removed.append(str(slide_file.relative_to(unpacked_dir)))
        slide_file.unlink()

        rels_file = slides_rels_dir / f"{slide_file.name}.rels"
        if rels_file.exists():
            rels_file.unlink()
            removed.append(str(rels_file.relative_to(unpacked_dir)))

    if not removed:
        return removed

    # Drop the presentation-level relationships that pointed at unlisted slides.
    rels_dom = defusedxml.minidom.parse(str(pres_rels_path))
    changed = False
    for rel in list(rels_dom.getElementsByTagName("Relationship")):
        target = rel.getAttribute("Target")
        if not target.startswith("slides/"):
            continue
        slide_name = target[len("slides/"):]
        if slide_name not in referenced_slides and rel.parentNode:
            rel.parentNode.removeChild(rel)
            changed = True

    if changed:
        with open(pres_rels_path, "wb") as f:
            f.write(rels_dom.toxml(encoding="utf-8"))

    return removed
|
|
|
|
|
|
def remove_trash_directory(unpacked_dir: Path) -> list[str]:
    """Delete the ``[trash]`` directory and everything inside it.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the files that were deleted.
        Sub-directories are removed too but are not listed. Empty when there
        is no ``[trash]`` directory.
    """
    trash_dir = unpacked_dir / "[trash]"
    removed = []

    if not trash_dir.is_dir():
        return removed

    # Deepest entries first, so each directory is already empty when its turn
    # comes. Handling only top-level regular files would make the final
    # rmdir() fail as soon as [trash] held a sub-directory.
    entries = sorted(
        trash_dir.rglob("*"), key=lambda entry: len(entry.parts), reverse=True
    )
    for entry in entries:
        if entry.is_dir() and not entry.is_symlink():
            entry.rmdir()
        else:
            removed.append(str(entry.relative_to(unpacked_dir)))
            entry.unlink()

    trash_dir.rmdir()
    return removed
|
|
|
|
|
|
def get_slide_referenced_files(unpacked_dir: Path) -> set:
    """Collect every package part targeted by a slide relationship.

    Reads each ``*.rels`` file under ``ppt/slides/_rels`` and resolves the
    ``Target`` of every ``Relationship`` element.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        A set of ``Path`` objects relative to the resolved ``unpacked_dir``.
        Targets that resolve outside the package are left out. Empty when the
        slides ``_rels`` directory does not exist.
    """
    referenced = set()
    slides_rels_dir = unpacked_dir / "ppt" / "slides" / "_rels"

    if not slides_rels_dir.exists():
        return referenced

    package_root = unpacked_dir.resolve()  # resolved once, not per relationship
    source_dir = slides_rels_dir.parent  # relative targets are based on ppt/slides

    for rels_file in slides_rels_dir.glob("*.rels"):
        dom = defusedxml.minidom.parse(str(rels_file))
        for rel in dom.getElementsByTagName("Relationship"):
            target = rel.getAttribute("Target")
            if not target:
                continue

            if target.startswith("/"):
                # Package-absolute target: anchor it at the package root.
                # Joining it onto source_dir would produce a path at the
                # filesystem root and the reference would be lost.
                target_path = (package_root / target.lstrip("/")).resolve()
            else:
                target_path = (source_dir / target).resolve()

            try:
                referenced.add(target_path.relative_to(package_root))
            except ValueError:
                # Target points outside the package; it cannot name a part.
                continue

    return referenced
|
|
|
|
|
|
def remove_orphaned_rels_files(unpacked_dir: Path) -> list[str]:
    """Delete chart/diagram/drawing ``.rels`` files whose owner is gone or unused.

    A ``.rels`` file under ``ppt/{charts,diagrams,drawings}/_rels`` is removed
    when the resource it belongs to no longer exists, or when that resource
    is not among the parts returned by ``get_slide_referenced_files``.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of the deleted ``.rels`` files.
    """
    slide_targets = get_slide_referenced_files(unpacked_dir)
    deleted = []

    for folder in ("charts", "diagrams", "drawings"):
        rels_dir = unpacked_dir / "ppt" / folder / "_rels"
        if not rels_dir.exists():
            continue

        for rels_file in rels_dir.glob("*.rels"):
            # "chart1.xml.rels" belongs to "<folder>/chart1.xml".
            owner = rels_dir.parent / rels_file.name.replace(".rels", "")
            try:
                owner_key = owner.resolve().relative_to(unpacked_dir.resolve())
            except ValueError:
                # Owner resolves outside the package; leave its rels alone.
                continue

            if owner.exists() and owner_key in slide_targets:
                continue

            rels_file.unlink()
            deleted.append(str(rels_file.relative_to(unpacked_dir)))

    return deleted
|
|
|
|
|
|
def get_referenced_files(unpacked_dir: Path) -> set:
    """Collect every package part targeted by any relationship in the package.

    Walks all ``*.rels`` files below ``unpacked_dir`` and resolves the
    ``Target`` of every ``Relationship`` element.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        A set of ``Path`` objects relative to the resolved ``unpacked_dir``.
        Targets that resolve outside the package are left out.
    """
    referenced = set()
    package_root = unpacked_dir.resolve()  # resolved once, not per relationship

    for rels_file in unpacked_dir.rglob("*.rels"):
        # A .rels file lives in "<dir>/_rels/", and its relative targets are
        # based on "<dir>".
        source_dir = rels_file.parent.parent
        dom = defusedxml.minidom.parse(str(rels_file))
        for rel in dom.getElementsByTagName("Relationship"):
            target = rel.getAttribute("Target")
            if not target:
                continue

            if target.startswith("/"):
                # Package-absolute target: anchor it at the package root.
                # Joining it onto source_dir would produce a path at the
                # filesystem root and the reference would be lost.
                target_path = (package_root / target.lstrip("/")).resolve()
            else:
                target_path = (source_dir / target).resolve()

            try:
                referenced.add(target_path.relative_to(package_root))
            except ValueError:
                # Target points outside the package; it cannot name a part.
                continue

    return referenced
|
|
|
|
|
|
def remove_orphaned_files(unpacked_dir: Path, referenced: set) -> list[str]:
    """Delete resource, theme and notes parts that nothing refers to.

    Three groups are swept, in this order:

    1. regular files directly inside ``ppt/media``, ``embeddings``,
       ``charts``, ``diagrams``, ``tags``, ``drawings`` and ``ink``;
    2. ``ppt/theme/theme*.xml`` together with the matching ``_rels`` file;
    3. ``ppt/notesSlides/*.xml``, followed by any notes ``.rels`` file whose
       notes slide no longer exists.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.
        referenced: Package-relative ``Path`` objects that must be kept.

    Returns:
        Paths (relative to ``unpacked_dir``) of every file that was deleted.
    """
    ppt_dir = unpacked_dir / "ppt"
    deleted = []

    def drop_if_unreferenced(candidate: Path) -> bool:
        # Unlink `candidate` unless it is referenced; report whether it went.
        package_path = candidate.relative_to(unpacked_dir)
        if package_path in referenced:
            return False
        candidate.unlink()
        deleted.append(str(package_path))
        return True

    # 1. Flat resource directories.
    for folder in ("media", "embeddings", "charts", "diagrams", "tags", "drawings", "ink"):
        resource_dir = ppt_dir / folder
        if resource_dir.exists():
            for entry in resource_dir.glob("*"):
                if entry.is_file():
                    drop_if_unreferenced(entry)

    # 2. Themes; a deleted theme takes its relationships file with it.
    theme_dir = ppt_dir / "theme"
    if theme_dir.exists():
        for theme_file in theme_dir.glob("theme*.xml"):
            if not drop_if_unreferenced(theme_file):
                continue
            theme_rels = theme_dir / "_rels" / f"{theme_file.name}.rels"
            if theme_rels.exists():
                theme_rels.unlink()
                deleted.append(str(theme_rels.relative_to(unpacked_dir)))

    # 3. Notes slides, then notes relationships left without an owner.
    notes_dir = ppt_dir / "notesSlides"
    if notes_dir.exists():
        for notes_file in notes_dir.glob("*.xml"):
            if notes_file.is_file():
                drop_if_unreferenced(notes_file)

        notes_rels_dir = notes_dir / "_rels"
        if notes_rels_dir.exists():
            for rels_file in notes_rels_dir.glob("*.rels"):
                owner = notes_dir / rels_file.name.replace(".rels", "")
                if owner.exists():
                    continue
                rels_file.unlink()
                deleted.append(str(rels_file.relative_to(unpacked_dir)))

    return deleted
|
|
|
|
|
|
def update_content_types(unpacked_dir: Path, removed_files: list[str]) -> None:
    """Drop ``Override`` entries of ``[Content_Types].xml`` for deleted parts.

    The file is rewritten only when at least one entry was removed. Nothing
    happens when ``[Content_Types].xml`` does not exist.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.
        removed_files: Package-relative paths of the deleted files, using
            either ``/`` or the platform's ``\\`` as separator.
    """
    ct_path = unpacked_dir / "[Content_Types].xml"
    if not ct_path.exists():
        return

    # PartName always uses forward slashes, while the removed paths are
    # OS-native strings. Normalise them so overrides also match on Windows,
    # and use a set so each lookup is O(1).
    removed_parts = {name.replace("\\", "/") for name in removed_files}

    dom = defusedxml.minidom.parse(str(ct_path))
    changed = False

    for override in list(dom.getElementsByTagName("Override")):
        part_name = override.getAttribute("PartName").lstrip("/")
        if part_name in removed_parts and override.parentNode:
            override.parentNode.removeChild(override)
            changed = True

    if changed:
        with open(ct_path, "wb") as f:
            f.write(dom.toxml(encoding="utf-8"))
|
|
|
|
|
|
def clean_unused_files(unpacked_dir: Path) -> list[str]:
    """Run every clean-up pass over an unpacked PPTX and report what was deleted.

    Orphaned slides and the ``[trash]`` directory go first. The rels/resource
    sweep is then repeated until a full pass deletes nothing, because removing
    one file can leave others unreferenced. Finally the content-type overrides
    of the deleted parts are dropped.

    Args:
        unpacked_dir: Root of the unpacked PPTX package.

    Returns:
        Paths (relative to ``unpacked_dir``) of every file that was deleted.
    """
    deleted = [
        *remove_orphaned_slides(unpacked_dir),
        *remove_trash_directory(unpacked_dir),
    ]

    sweeping = True
    while sweeping:
        # Order matters: stale .rels files must be gone before the reference
        # scan, otherwise they would keep their targets alive.
        pass_deleted = remove_orphaned_rels_files(unpacked_dir)
        still_referenced = get_referenced_files(unpacked_dir)
        pass_deleted = pass_deleted + remove_orphaned_files(unpacked_dir, still_referenced)

        deleted.extend(pass_deleted)
        sweeping = bool(pass_deleted)

    if deleted:
        update_content_types(unpacked_dir, deleted)

    return deleted
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the unpacked directory.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        for usage_line in (
            "Usage: python clean.py <unpacked_dir>",
            "Example: python clean.py unpacked/",
        ):
            print(usage_line, file=sys.stderr)
        sys.exit(1)

    target_dir = Path(cli_args[0])
    if not target_dir.exists():
        print(f"Error: {target_dir} not found", file=sys.stderr)
        sys.exit(1)

    deleted_paths = clean_unused_files(target_dir)

    if not deleted_paths:
        print("No unreferenced files found")
    else:
        print(f"Removed {len(deleted_paths)} unreferenced files:")
        for deleted_path in deleted_paths:
            print(f" {deleted_path}")
|