mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-27 01:11:40 +00:00
- Introduced new skills for editing and creating PPTX presentations, including a detailed guide on template-based workflows and script usage. - Added scripts for slide management, cleaning, and packing PPTX files, enhancing the overall functionality for users. - Included a LICENSE file to clarify usage rights and restrictions. - Created a SKILL.md file to provide an overview and quick reference for PPTX-related tasks. - Documented various formatting rules, common pitfalls, and design ideas to improve presentation quality.
199 lines
5.4 KiB
Python
199 lines
5.4 KiB
Python
"""Merge adjacent runs with identical formatting in DOCX.
|
|
|
|
Merges adjacent <w:r> elements that have identical <w:rPr> properties.
|
|
Works on runs in paragraphs and inside tracked changes (<w:ins>, <w:del>).
|
|
|
|
Also:
|
|
- Removes rsid attributes from runs (revision metadata that doesn't affect rendering)
|
|
- Removes proofErr elements (spell/grammar markers that block merging)
|
|
"""
|
|
|
|
from pathlib import Path
|
|
|
|
import defusedxml.minidom
|
|
|
|
|
|
def merge_runs(input_dir: str) -> tuple[int, str]:
|
|
doc_xml = Path(input_dir) / "word" / "document.xml"
|
|
|
|
if not doc_xml.exists():
|
|
return 0, f"Error: {doc_xml} not found"
|
|
|
|
try:
|
|
dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8"))
|
|
root = dom.documentElement
|
|
|
|
_remove_elements(root, "proofErr")
|
|
_strip_run_rsid_attrs(root)
|
|
|
|
containers = {run.parentNode for run in _find_elements(root, "r")}
|
|
|
|
merge_count = 0
|
|
for container in containers:
|
|
merge_count += _merge_runs_in(container)
|
|
|
|
doc_xml.write_bytes(dom.toxml(encoding="UTF-8"))
|
|
return merge_count, f"Merged {merge_count} runs"
|
|
|
|
except Exception as e:
|
|
return 0, f"Error: {e}"
|
|
|
|
|
|
|
|
|
|
def _find_elements(root, tag: str) -> list:
|
|
results = []
|
|
|
|
def traverse(node):
|
|
if node.nodeType == node.ELEMENT_NODE:
|
|
name = node.localName or node.tagName
|
|
if name == tag or name.endswith(f":{tag}"):
|
|
results.append(node)
|
|
for child in node.childNodes:
|
|
traverse(child)
|
|
|
|
traverse(root)
|
|
return results
|
|
|
|
|
|
def _get_child(parent, tag: str):
|
|
for child in parent.childNodes:
|
|
if child.nodeType == child.ELEMENT_NODE:
|
|
name = child.localName or child.tagName
|
|
if name == tag or name.endswith(f":{tag}"):
|
|
return child
|
|
return None
|
|
|
|
|
|
def _get_children(parent, tag: str) -> list:
|
|
results = []
|
|
for child in parent.childNodes:
|
|
if child.nodeType == child.ELEMENT_NODE:
|
|
name = child.localName or child.tagName
|
|
if name == tag or name.endswith(f":{tag}"):
|
|
results.append(child)
|
|
return results
|
|
|
|
|
|
def _is_adjacent(elem1, elem2) -> bool:
|
|
node = elem1.nextSibling
|
|
while node:
|
|
if node == elem2:
|
|
return True
|
|
if node.nodeType == node.ELEMENT_NODE:
|
|
return False
|
|
if node.nodeType == node.TEXT_NODE and node.data.strip():
|
|
return False
|
|
node = node.nextSibling
|
|
return False
|
|
|
|
|
|
|
|
|
|
def _remove_elements(root, tag: str):
|
|
for elem in _find_elements(root, tag):
|
|
if elem.parentNode:
|
|
elem.parentNode.removeChild(elem)
|
|
|
|
|
|
def _strip_run_rsid_attrs(root):
|
|
for run in _find_elements(root, "r"):
|
|
for attr in list(run.attributes.values()):
|
|
if "rsid" in attr.name.lower():
|
|
run.removeAttribute(attr.name)
|
|
|
|
|
|
|
|
|
|
def _merge_runs_in(container) -> int:
|
|
merge_count = 0
|
|
run = _first_child_run(container)
|
|
|
|
while run:
|
|
while True:
|
|
next_elem = _next_element_sibling(run)
|
|
if next_elem and _is_run(next_elem) and _can_merge(run, next_elem):
|
|
_merge_run_content(run, next_elem)
|
|
container.removeChild(next_elem)
|
|
merge_count += 1
|
|
else:
|
|
break
|
|
|
|
_consolidate_text(run)
|
|
run = _next_sibling_run(run)
|
|
|
|
return merge_count
|
|
|
|
|
|
def _first_child_run(container):
|
|
for child in container.childNodes:
|
|
if child.nodeType == child.ELEMENT_NODE and _is_run(child):
|
|
return child
|
|
return None
|
|
|
|
|
|
def _next_element_sibling(node):
|
|
sibling = node.nextSibling
|
|
while sibling:
|
|
if sibling.nodeType == sibling.ELEMENT_NODE:
|
|
return sibling
|
|
sibling = sibling.nextSibling
|
|
return None
|
|
|
|
|
|
def _next_sibling_run(node):
|
|
sibling = node.nextSibling
|
|
while sibling:
|
|
if sibling.nodeType == sibling.ELEMENT_NODE:
|
|
if _is_run(sibling):
|
|
return sibling
|
|
sibling = sibling.nextSibling
|
|
return None
|
|
|
|
|
|
def _is_run(node) -> bool:
|
|
name = node.localName or node.tagName
|
|
return name == "r" or name.endswith(":r")
|
|
|
|
|
|
def _can_merge(run1, run2) -> bool:
|
|
rpr1 = _get_child(run1, "rPr")
|
|
rpr2 = _get_child(run2, "rPr")
|
|
|
|
if (rpr1 is None) != (rpr2 is None):
|
|
return False
|
|
if rpr1 is None:
|
|
return True
|
|
return rpr1.toxml() == rpr2.toxml()
|
|
|
|
|
|
def _merge_run_content(target, source):
|
|
for child in list(source.childNodes):
|
|
if child.nodeType == child.ELEMENT_NODE:
|
|
name = child.localName or child.tagName
|
|
if name != "rPr" and not name.endswith(":rPr"):
|
|
target.appendChild(child)
|
|
|
|
|
|
def _consolidate_text(run):
|
|
t_elements = _get_children(run, "t")
|
|
|
|
for i in range(len(t_elements) - 1, 0, -1):
|
|
curr, prev = t_elements[i], t_elements[i - 1]
|
|
|
|
if _is_adjacent(prev, curr):
|
|
prev_text = prev.firstChild.data if prev.firstChild else ""
|
|
curr_text = curr.firstChild.data if curr.firstChild else ""
|
|
merged = prev_text + curr_text
|
|
|
|
if prev.firstChild:
|
|
prev.firstChild.data = merged
|
|
else:
|
|
prev.appendChild(run.ownerDocument.createTextNode(merged))
|
|
|
|
if merged.startswith(" ") or merged.endswith(" "):
|
|
prev.setAttribute("xml:space", "preserve")
|
|
elif prev.hasAttribute("xml:space"):
|
|
prev.removeAttribute("xml:space")
|
|
|
|
run.removeChild(curr)
|