#!/usr/bin/env python3
"""Extract skill metadata from SKILL.md files and index caches into JSON."""

import json
import os
from collections import Counter

import yaml

# Three directory levels above this script.
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# (directory under REPO_ROOT, source label written into each skill record)
LOCAL_SKILL_DIRS = [
    ("skills", "built-in"),
    ("optional-skills", "optional"),
]
INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache")
OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills.json")

# Display labels for category slugs; slugs missing here are title-cased by callers.
CATEGORY_LABELS = {
    "apple": "Apple",
    "autonomous-ai-agents": "AI Agents",
    "blockchain": "Blockchain",
    "communication": "Communication",
    "creative": "Creative",
    "data-science": "Data Science",
    "devops": "DevOps",
    "dogfood": "Dogfood",
    "domain": "Domain",
    "email": "Email",
    "gaming": "Gaming",
    "gifs": "GIFs",
    "github": "GitHub",
    "health": "Health",
    "inference-sh": "Inference",
    "leisure": "Leisure",
    "mcp": "MCP",
    "media": "Media",
    "migration": "Migration",
    "mlops": "MLOps",
    "note-taking": "Note-Taking",
    "productivity": "Productivity",
    "red-teaming": "Red Teaming",
    "research": "Research",
    "security": "Security",
    "smart-home": "Smart Home",
    "social-media": "Social Media",
    "software-development": "Software Dev",
    "translation": "Translation",
    "other": "Other",
}

# Substring of an index-cache filename -> source label for its entries.
SOURCE_LABELS = {
    "anthropics_skills": "Anthropic",
    "openai_skills": "OpenAI",
    "claude_marketplace": "Claude Marketplace",
    "lobehub": "LobeHub",
}


def _extract_overview(body: str) -> str:
    """Return the first prose paragraph of a SKILL.md body.

    Only the first six blank-line-separated paragraphs are examined.
    Heading lines (starting with ``#``) are dropped; a paragraph that is
    nothing but headings is skipped. Paragraphs that open with an admonition
    fence (``:::``) or a code fence (three backticks or ``~~~``) are skipped
    as well. Inline markdown such as links is returned as written.

    The result is capped at roughly 500 characters so the SkillCard panel
    stays a reasonable size: it is cut at the last sentence boundary past
    character 200 when there is one, otherwise hard-cut with an ellipsis.

    Returns an empty string when *body* is empty or no paragraph qualifies.
    """
    if not body:
        return ""

    paragraphs = [p.strip() for p in body.split("\n\n") if p.strip()]
    for p in paragraphs[:6]:
        if p.startswith("#"):
            # A heading may share its paragraph with body text on later
            # lines; keep only the non-heading lines.
            lines = [
                ln for ln in p.split("\n")
                if ln.strip() and not ln.lstrip().startswith("#")
            ]
            if not lines:
                continue
            p = "\n".join(lines).strip()

        # Skip a leading admonition fence (:::tip / :::info / etc.)
        if p.startswith(":::"):
            continue
        # Skip code-fence paragraphs
        if p.startswith("```") or p.startswith("~~~"):
            continue

        # Trim to roughly 500 chars, preferring a sentence boundary
        if len(p) > 500:
            cut = p[:500]
            last_period = cut.rfind(". ")
            if last_period > 200:
                p = cut[: last_period + 1]
            else:
                p = cut.rstrip() + "…"
        return p

    return ""


def _docs_page_path(rel_dir: str, source_label: str) -> str:
    """Compute the per-skill docs-site URL slug for a given SKILL.md location.

    Intended to mirror the slug logic in website/scripts/generate-skill-docs.py:

      bundled  + skills/<category>/<skill>/SKILL.md
          -> bundled/<category>/<category>-<skill>
      bundled  + skills/<category>/<sub>/<skill>/SKILL.md
          -> bundled/<category>/<category>-<sub>-<skill>
      optional + optional-skills/<category>/<skill>/SKILL.md
          -> optional/<category>/<category>-<skill>

    Args:
        rel_dir: Directory containing SKILL.md, relative to the skills base
            directory, using ``os.sep`` separators.
        source_label: ``"built-in"`` selects the ``bundled`` prefix; any other
            value selects ``optional``.

    Returns:
        The slug, or ``""`` when *rel_dir* has no usable components (empty or
        ``"."``) or is nested more than three levels deep.
    """
    # os.path.relpath() yields "." for a SKILL.md sitting directly in the base
    # directory; treat that like an empty path rather than emitting "./.-.".
    parts = [p for p in rel_dir.split(os.sep) if p and p != os.curdir]
    if not parts:
        return ""

    source_dir = "bundled" if source_label == "built-in" else "optional"
    if len(parts) == 1:
        # A skill directly under the base dir doubles as its own category.
        category, slug = parts[0], parts[0]
        return f"{source_dir}/{category}/{category}-{slug}"
    if len(parts) == 2:
        category, slug = parts
        return f"{source_dir}/{category}/{category}-{slug}"
    if len(parts) == 3:
        category, sub, slug = parts
        return f"{source_dir}/{category}/{category}-{sub}-{slug}"
    return ""

def _as_str_list(value):
    """Normalize a prerequisites field (list or single string) to a list of strings."""
    if isinstance(value, list):
        return [str(item) for item in value if item]
    if isinstance(value, str) and value.strip():
        return [value.strip()]
    return []


def _frontmatter_tags(fm):
    """Return tags from ``metadata.hermes.tags``, falling back to top-level ``tags``."""
    tags = []
    metadata = fm.get("metadata")
    if isinstance(metadata, dict):
        hermes_meta = metadata.get("hermes", {})
        if isinstance(hermes_meta, dict):
            tags = hermes_meta.get("tags", [])
    if not tags:
        tags = fm.get("tags", [])
    if isinstance(tags, str):
        tags = [tags]
    return tags or []


def extract_local_skills():
    """Collect one metadata record per SKILL.md found under LOCAL_SKILL_DIRS.

    Each SKILL.md must start with a ``---``-delimited YAML frontmatter block
    that parses to a non-empty mapping; files that do not are skipped. The
    category is the first path component below the base directory.

    Returns:
        A list of dicts with keys name, description, overview, category,
        categoryLabel, source, tags, platforms, author, version, license,
        envVars, commands and docsPath.
    """
    skills = []

    for base_dir, source_label in LOCAL_SKILL_DIRS:
        base_path = os.path.join(REPO_ROOT, base_dir)
        if not os.path.isdir(base_path):
            continue

        for root, dirs, files in os.walk(base_path):
            # Sorting in place makes os.walk descend in a stable order, so
            # records that tie in the final sort keep a reproducible order.
            dirs.sort()
            if "SKILL.md" not in files:
                continue

            with open(os.path.join(root, "SKILL.md"), encoding="utf-8") as f:
                content = f.read()

            if not content.startswith("---"):
                continue

            # ["", <frontmatter>, <body>] when both delimiters are present.
            parts = content.split("---", 2)
            if len(parts) < 3:
                continue

            try:
                fm = yaml.safe_load(parts[1])
            except yaml.YAMLError:
                continue

            if not fm or not isinstance(fm, dict):
                continue

            rel = os.path.relpath(root, base_path)
            category = rel.split(os.sep)[0]

            # Optional structured prerequisites — surfaced in the SkillCard panel
            prereq = fm.get("prerequisites") or {}
            if isinstance(prereq, dict):
                env_vars = _as_str_list(prereq.get("env_vars"))
                commands = _as_str_list(prereq.get("commands"))
            else:
                env_vars = []
                commands = []

            skills.append({
                # An empty `name:` parses to None; fall back to the directory
                # name and force str so records always sort cleanly by name.
                "name": str(fm.get("name") or os.path.basename(root)),
                "description": fm.get("description") or "",
                "overview": _extract_overview(parts[2].strip()),
                "category": category,
                "categoryLabel": CATEGORY_LABELS.get(category, category.replace("-", " ").title()),
                "source": source_label,
                "tags": _frontmatter_tags(fm),
                "platforms": fm.get("platforms", []),
                "author": fm.get("author", ""),
                "version": fm.get("version", ""),
                "license": fm.get("license", ""),
                "envVars": env_vars,
                "commands": commands,
                "docsPath": _docs_page_path(rel, source_label),
            })

    return skills

def extract_cached_index_skills():
    """Collect skill records from the JSON files in INDEX_CACHE_DIR.

    Two file shapes are recognized:

    * a dict with an ``"agents"`` list — each dict entry becomes a record,
      categorized from its ``meta.tags`` via ``_guess_category``;
    * a top-level list — each dict entry with a truthy ``"name"`` becomes an
      ``"uncategorized"`` record, except entries that carry their own
      ``"skills"`` list, which are skipped.

    The source label is chosen by matching SOURCE_LABELS keys against the
    filename stem, defaulting to ``"community"``. Files that cannot be read or
    parsed are skipped.

    Returns:
        A list of dicts with keys name, description, category, categoryLabel,
        source, tags, platforms, author and version.
    """
    skills = []

    if not os.path.isdir(INDEX_CACHE_DIR):
        return skills

    # sorted(): os.listdir order is arbitrary; keep the output reproducible.
    for filename in sorted(os.listdir(INDEX_CACHE_DIR)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(INDEX_CACHE_DIR, filename)
        try:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue

        stem = os.path.splitext(filename)[0]
        source_label = next(
            (label for key, label in SOURCE_LABELS.items() if key in stem),
            "community",
        )

        if isinstance(data, dict) and "agents" in data:
            agents = data["agents"]
            if not isinstance(agents, list):
                continue
            for agent in agents:
                if not isinstance(agent, dict):
                    continue
                # Tolerate a missing, null or malformed "meta" object.
                meta = agent.get("meta")
                if not isinstance(meta, dict):
                    meta = {}
                tags = meta.get("tags") or []
                skills.append({
                    "name": agent.get("identifier", meta.get("title", "unknown")),
                    # First line only, capped at 200 characters.
                    "description": (meta.get("description", "") or "").split("\n")[0][:200],
                    "category": _guess_category(tags),
                    "categoryLabel": "",  # filled below
                    "source": source_label,
                    "tags": tags,
                    "platforms": [],
                    "author": agent.get("author", ""),
                    "version": "",
                })
            continue

        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict) or not entry.get("name"):
                    continue
                # Entries that carry a nested "skills" list are not emitted.
                if "skills" in entry and isinstance(entry["skills"], list):
                    continue
                skills.append({
                    "name": entry.get("name", ""),
                    "description": entry.get("description", ""),
                    "category": "uncategorized",
                    "categoryLabel": "",
                    "source": source_label,
                    "tags": entry.get("tags", []),
                    "platforms": [],
                    "author": "",
                    "version": "",
                })

    # Fill in display labels now that every record has its category.
    for s in skills:
        if not s["categoryLabel"]:
            s["categoryLabel"] = CATEGORY_LABELS.get(
                s["category"],
                s["category"].replace("-", " ").title() if s["category"] else "Uncategorized",
            )

    return skills

# Tags grouped by the category they imply; inverted below into a flat lookup.
_CATEGORY_TAGS = {
    "software-development": [
        "programming", "code", "coding", "software-development",
        "frontend-development", "backend-development", "web-development",
        "react", "python", "typescript", "java", "rust",
    ],
    "creative": ["writing", "design", "creative", "art", "image-generation"],
    "research": ["education", "academic", "research"],
    "social-media": ["marketing", "seo", "social-media"],
    "productivity": ["productivity", "business"],
    "data-science": ["data", "data-science"],
    "mlops": ["machine-learning", "deep-learning"],
    "devops": ["devops"],
    "gaming": ["gaming", "game", "game-development"],
    "media": ["music", "media", "video"],
    "health": ["health", "fitness"],
    "translation": ["translation", "language-learning"],
    "security": ["security", "cybersecurity"],
}

# Lower-case tag -> category slug.
TAG_TO_CATEGORY = {
    tag: category
    for category, tags in _CATEGORY_TAGS.items()
    for tag in tags
}


def _guess_category(tags: list) -> str:
    """Map a skill's tags to a category slug.

    The first tag (case-insensitively) present in TAG_TO_CATEGORY decides the
    category. When none matches, the first tag itself is used, lower-cased
    with spaces turned into hyphens.

    A bare string is treated as a single tag, and non-string entries are
    ignored rather than raising.

    Returns:
        The category slug, or ``"uncategorized"`` when there are no usable tags.
    """
    if not tags:
        return "uncategorized"
    if isinstance(tags, str):
        tags = [tags]

    lowered = [tag.lower() for tag in tags if isinstance(tag, str)]
    for name in lowered:
        category = TAG_TO_CATEGORY.get(name)
        if category:
            return category

    if not lowered:
        return "uncategorized"
    return lowered[0].replace(" ", "-")

# Categories with fewer skills than this are merged into "other".
MIN_CATEGORY_SIZE = 4


def _consolidate_small_categories(skills: list) -> list:
    """Fold uncategorized and under-populated categories into "other".

    Records whose category is ``"uncategorized"`` or empty are relabelled
    first. Categories are then counted, and every record in a category with
    fewer than MIN_CATEGORY_SIZE members is relabelled as well.

    The records are modified in place; the same list object is returned.
    """
    def _move_to_other(record):
        record["category"] = "other"
        record["categoryLabel"] = "Other"

    for record in skills:
        if record["category"] in ("uncategorized", ""):
            _move_to_other(record)

    # Counted once, after the first pass; not updated while relabelling.
    tally = Counter(record["category"] for record in skills)

    for record in skills:
        if tally[record["category"]] < MIN_CATEGORY_SIZE:
            _move_to_other(record)

    return skills

def main():
    """Build the combined skills list and write it to OUTPUT as JSON.

    Local and cached-index skills are merged, small categories are folded
    into "other", and the result is sorted by source (built-in, optional,
    then everything else), with "other" last within each source, then by
    category and name. A short summary is printed afterwards.
    """
    local = extract_local_skills()
    external = extract_cached_index_skills()

    all_skills = _consolidate_small_categories(local + external)

    source_order = {"built-in": 0, "optional": 1}
    all_skills.sort(key=lambda s: (
        source_order.get(s["source"], 2),
        1 if s["category"] == "other" else 0,
        s["category"],
        # Names come from YAML/JSON and may be None or non-strings; compare
        # them as text so the sort cannot raise on mixed types.
        "" if s["name"] is None else str(s["name"]),
    ))

    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(all_skills, f, indent=2)

    by_source = Counter(s["source"] for s in local)
    print(f"Extracted {len(all_skills)} skills to {OUTPUT}")
    print(f"  {len(local)} local ({by_source['built-in']} built-in, "
          f"{by_source['optional']} optional)")
    print(f"  {len(external)} from external indexes")


if __name__ == "__main__":
    main()