mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
The skills directory was getting disorganized — mlops alone had 40 skills in a flat list, and 12 categories were singletons with just one skill each.

Code change:
- prompt_builder.py: Support sub-categories in skill scanner. skills/mlops/training/axolotl/SKILL.md now shows as category 'mlops/training' instead of just 'mlops'. Backwards-compatible with existing flat structure.

Split mlops (40 skills) into 7 sub-categories:
- mlops/training (12): accelerate, axolotl, flash-attention, grpo-rl-training, peft, pytorch-fsdp, pytorch-lightning, simpo, slime, torchtitan, trl-fine-tuning, unsloth
- mlops/inference (8): gguf, guidance, instructor, llama-cpp, obliteratus, outlines, tensorrt-llm, vllm
- mlops/models (6): audiocraft, clip, llava, segment-anything, stable-diffusion, whisper
- mlops/vector-databases (4): chroma, faiss, pinecone, qdrant
- mlops/evaluation (5): huggingface-tokenizers, lm-evaluation-harness, nemo-curator, saelens, weights-and-biases
- mlops/cloud (2): lambda-labs, modal
- mlops/research (1): dspy

Merged singleton categories:
- gifs → media (gif-search joins youtube-content)
- music-creation → media (heartmula, songsee)
- diagramming → creative (excalidraw joins ascii-art)
- ocr-and-documents → productivity
- domain → research (domain-intel)
- feeds → research (blogwatcher)
- market-data → research (polymarket)

Fixed misplaced skills:
- mlops/code-review → software-development (not ML-specific)
- mlops/ml-paper-writing → research (academic writing)

Added DESCRIPTION.md files for all new/updated categories.
98 lines
3.1 KiB
Python
98 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Extract text from documents using pymupdf. Lightweight (~25MB), no models.
|
|
|
|
Usage:
|
|
python extract_pymupdf.py document.pdf
|
|
python extract_pymupdf.py document.pdf --markdown
|
|
python extract_pymupdf.py document.pdf --pages 0-4
|
|
python extract_pymupdf.py document.pdf --images output_dir/
|
|
python extract_pymupdf.py document.pdf --tables
|
|
python extract_pymupdf.py document.pdf --metadata
|
|
"""
|
|
import sys
|
|
import json
|
|
|
|
def extract_text(path, pages=None):
    """Print the plain text of a document, one page at a time.

    Each page is preceded by a ``--- Page N/TOTAL ---`` banner, where N is
    the 1-based page number.

    Args:
        path: Path of the document to open with pymupdf.
        pages: Optional iterable of 0-based page indices. ``None`` means
            every page. Indices at or beyond the page count are skipped
            silently.
    """
    import pymupdf

    # The context manager closes the document even if text extraction
    # raises part-way through.
    with pymupdf.open(path) as doc:
        total = len(doc)
        indices = range(total) if pages is None else pages
        for i in indices:
            if i >= total:
                continue
            print(f"\n--- Page {i+1}/{total} ---\n")
            print(doc[i].get_text())
|
|
|
|
def extract_markdown(path, pages=None):
    """Print the document rendered as Markdown.

    Args:
        path: Path of the document, handed to ``pymupdf4llm.to_markdown``.
        pages: Page selection forwarded unchanged as the ``pages`` keyword
            of ``pymupdf4llm.to_markdown`` (``None`` when not restricted).
    """
    from pymupdf4llm import to_markdown

    print(to_markdown(path, pages=pages))
|
|
|
|
def extract_tables(path):
    """Print every table pymupdf detects in the document as Markdown.

    Each table is preceded by a ``--- Page P, Table T ---`` banner using
    1-based page and table numbers, then rendered through a pandas
    DataFrame with the index column suppressed.

    Args:
        path: Path of the document to open with pymupdf.
    """
    import pymupdf

    # The context manager closes the document even if table detection or
    # rendering raises.
    with pymupdf.open(path) as doc:
        for page_no, page in enumerate(doc, start=1):
            found = page.find_tables()
            for table_no, table in enumerate(found.tables, start=1):
                print(f"\n--- Page {page_no}, Table {table_no} ---\n")
                frame = table.to_pandas()
                print(frame.to_markdown(index=False))
|
|
|
|
def extract_images(path, output_dir):
    """Save every image referenced by the document's pages as a PNG file.

    Files are written as ``page<P>_img<I>.png`` (both numbers 1-based)
    inside ``output_dir``, which is created if missing. A summary line with
    the number of images written is printed at the end.

    Args:
        path: Path of the document to open with pymupdf.
        output_dir: Directory that receives the PNG files.
    """
    import pymupdf
    from pathlib import Path

    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    count = 0
    # The context manager closes the document even if saving an image fails.
    with pymupdf.open(path) as doc:
        for page_no, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for img_no, img in enumerate(images, start=1):
                xref = img[0]
                pix = pymupdf.Pixmap(doc, xref)
                # PNG supports only gray/RGB. Count colour components
                # without the alpha channel so that CMYK images *without*
                # alpha (n == 4) are converted too; testing ``n >= 5`` only
                # catches CMYK with alpha.
                if pix.n - pix.alpha > 3:
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                out_path = out_root / f"page{page_no}_img{img_no}.png"
                pix.save(str(out_path))
                count += 1

    print(f"Extracted {count} images to {output_dir}/")
|
|
|
|
def show_metadata(path):
    """Print the page count and selected metadata fields as indented JSON.

    The JSON object has the keys ``pages``, ``title``, ``author``,
    ``subject``, ``creator``, ``producer`` and ``format``, in that order.
    A field missing from the document's metadata is reported as ``""``.

    Args:
        path: Path of the document to open with pymupdf.
    """
    import pymupdf

    fields = ("title", "author", "subject", "creator", "producer", "format")

    # The context manager closes the document once the values are read.
    with pymupdf.open(path) as doc:
        # PyMuPDF documents ``metadata`` as possibly None (e.g. for a
        # document that still needs a password); fall back to an empty
        # mapping instead of failing with AttributeError.
        meta = doc.metadata or {}
        info = {"pages": len(doc)}
        info.update((key, meta.get(key, "")) for key in fields)

    print(json.dumps(info, indent=2))
|
|
|
|
def parse_page_spec(spec):
    """Convert a ``--pages`` value into a list of 0-based page indices.

    Args:
        spec: Either a single index (``"3"``) or an inclusive range
            written as ``START-END`` (``"0-4"``).

    Returns:
        The selected page indices in ascending order.

    Raises:
        ValueError: If ``spec`` is not an integer or a ``START-END`` pair of
            integers, or if END is smaller than START.
    """
    start_text, dash, end_text = spec.partition("-")
    try:
        start = int(start_text)
        end = int(end_text) if dash else start
    except ValueError:
        raise ValueError(
            f"invalid page spec {spec!r}: expected N or START-END"
        ) from None
    if end < start:
        # A reversed range would otherwise select nothing without any hint.
        raise ValueError(
            f"invalid page spec {spec!r}: END must not be smaller than START"
        )
    return list(range(start, end + 1))


def main(argv=None):
    """Run the command-line interface.

    The first argument is the document path. The mode flags are checked in
    the order ``--metadata``, ``--tables``, ``--images``, ``--markdown``;
    with none of them, plain text is extracted. ``--pages`` only affects
    the markdown and plain-text modes.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if not args or args[0] in ("-h", "--help"):
        print(__doc__)
        sys.exit(0)

    path = args[0]
    pages = None

    if "--pages" in args:
        idx = args.index("--pages")
        if idx + 1 >= len(args):
            # Previously an IndexError traceback.
            sys.exit("error: --pages requires a value such as 3 or 0-4")
        try:
            pages = parse_page_spec(args[idx + 1])
        except ValueError as exc:
            sys.exit(f"error: {exc}")

    if "--metadata" in args:
        show_metadata(path)
    elif "--tables" in args:
        extract_tables(path)
    elif "--images" in args:
        idx = args.index("--images")
        # Use the default directory when no value follows or when the next
        # token is another flag rather than a directory name.
        has_value = idx + 1 < len(args) and not args[idx + 1].startswith("--")
        output_dir = args[idx + 1] if has_value else "./images"
        extract_images(path, output_dir)
    elif "--markdown" in args:
        extract_markdown(path, pages=pages)
    else:
        extract_text(path, pages=pages)


if __name__ == "__main__":
    main()
|