hermes-agent/website/scripts/extract-skills.py
Teknium 5971a4e092
feat(docs): richer info panels on the Skills Hub for built-in + optional skills (#22905)
The Skills Hub at /skills had cards that, when expanded, showed only the
one-line description, tags, author, version, and an install command. For
the 163 bundled and optional skills shipped with the repo, this was thinner
than the data we already have on disk.

Three changes, all under website/:

1. extract-skills.py now pulls five extra fields per local skill:
   - 'overview' — first non-heading body paragraph from SKILL.md (stripped
     of admonitions/code fences, capped at ~500 chars at a sentence boundary)
   - 'envVars' / 'commands' — from the prerequisites: block in frontmatter
   - 'license' — from the top-level frontmatter
   - 'docsPath' — slug to the per-skill /docs/user-guide/skills/.../* page,
     computed with the same logic as generate-skill-docs.py

   162 of 163 local skills get a non-empty overview automatically. The
   remaining one (media/heartmula) has only headings/code in its body and
   falls through to the description.

2. Skill TS interface + SkillCard expanded-panel render the new fields:
   - Overview paragraph at the top of the panel
   - Prerequisites box (env vars + required commands) when frontmatter
     declares them
   - License row alongside author/version
   - 'View full documentation →' link to the per-skill docs page

   Search now covers the overview text too, so users can find skills by
   matching content from inside SKILL.md, not just the one-line description.

3. styles.module.css gains nine new classes (overviewBlock, detailLabel,
   overviewText, prereqBlock/Row/Kind/List/Item, docsLink) styled to match
   the existing dark panel aesthetic.

External / community skills (Anthropic, LobeHub, Claude Marketplace cached
indexes) keep the old behavior — overview is empty, no prereqs, no docsPath.

Validation: 'npm run build' clean (exit 0); broken-link count unchanged at
155 baseline; all 163 generated docsPath values resolve to existing pages
under website/docs/user-guide/skills/.
2026-05-09 18:17:39 -07:00

352 lines
12 KiB
Python

#!/usr/bin/env python3
"""Extract skill metadata from SKILL.md files and index caches into JSON."""
import json
import os
from collections import Counter
import yaml
# Directory three levels above this script file; every other path below is
# resolved relative to it.
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# (directory under REPO_ROOT, source label) pairs walked by extract_local_skills().
LOCAL_SKILL_DIRS = [
("skills", "built-in"),
("optional-skills", "optional"),
]
# Directory whose *.json files are read by extract_cached_index_skills().
INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache")
# JSON file written by main().
OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills.json")
# Category slug -> display label. Slugs missing from this table are labelled
# by title-casing the slug with hyphens replaced by spaces.
CATEGORY_LABELS = {
"apple": "Apple",
"autonomous-ai-agents": "AI Agents",
"blockchain": "Blockchain",
"communication": "Communication",
"creative": "Creative",
"data-science": "Data Science",
"devops": "DevOps",
"dogfood": "Dogfood",
"domain": "Domain",
"email": "Email",
"gaming": "Gaming",
"gifs": "GIFs",
"github": "GitHub",
"health": "Health",
"inference-sh": "Inference",
"leisure": "Leisure",
"mcp": "MCP",
"media": "Media",
"migration": "Migration",
"mlops": "MLOps",
"note-taking": "Note-Taking",
"productivity": "Productivity",
"red-teaming": "Red Teaming",
"research": "Research",
"security": "Security",
"smart-home": "Smart Home",
"social-media": "Social Media",
"software-development": "Software Dev",
"translation": "Translation",
"other": "Other",
}
# Substring of an index-cache filename stem -> source label. Files matching
# none of these keys are labelled "community".
SOURCE_LABELS = {
"anthropics_skills": "Anthropic",
"openai_skills": "OpenAI",
"claude_marketplace": "Claude Marketplace",
"lobehub": "LobeHub",
}
def _extract_overview(body: str, max_chars: int = 500, max_paragraphs: int = 6) -> str:
    """Return the first prose paragraph of a SKILL.md body.

    Paragraphs are separated by blank lines (whitespace-only lines and CRLF
    line endings are tolerated). Only the first ``max_paragraphs`` paragraphs
    are inspected. A paragraph is rejected when it is:

    * made up solely of heading lines (``# Foo``); if a heading paragraph
      also carries text lines, only those text lines are kept,
    * an admonition fence (``:::tip`` / ``:::info`` / a closing ``:::``),
    * part of a fenced code block -- including the tail of a fence that
      contains blank lines and is therefore split across paragraphs.

    Fence lines that trail a prose paragraph are cut off so code never leaks
    into the overview.

    Args:
        body: Markdown text following the frontmatter.
        max_chars: Soft cap on the returned length. Longer text is cut at the
            last sentence end inside the cap when that keeps more than 40% of
            the cap; otherwise it is hard-cut and suffixed with an ellipsis.
        max_paragraphs: How many leading paragraphs to consider.

    Returns:
        The overview text, or ``""`` when no prose paragraph is found.
    """
    import re  # local import keeps this block self-contained

    if not body:
        return ""

    fence_markers = ("```", "~~~")
    normalized = body.replace("\r\n", "\n").replace("\r", "\n")
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", normalized) if p.strip()]

    inside_fence = False
    for para in paragraphs[:max_paragraphs]:
        lines = para.split("\n")

        # Track fence state across paragraphs: an odd number of fence lines
        # means this paragraph opens or closes a code block.
        started_inside_fence = inside_fence
        fence_lines = sum(1 for ln in lines if ln.lstrip().startswith(fence_markers))
        if fence_lines % 2:
            inside_fence = not inside_fence
        if started_inside_fence:
            continue

        # Heading paragraph: drop the heading lines, keep any text below them.
        if para.startswith("#"):
            lines = [ln for ln in lines if ln.strip() and not ln.lstrip().startswith("#")]

        # Keep only the prose that precedes the first fence line.
        prose_lines = []
        for ln in lines:
            if ln.lstrip().startswith(fence_markers):
                break
            prose_lines.append(ln)
        text = "\n".join(prose_lines).strip()

        if not text or text.startswith(":::"):
            continue

        if len(text) > max_chars:
            cut = text[:max_chars]
            # A sentence may end before a space or before a line break.
            sentence_end = max(cut.rfind(". "), cut.rfind(".\n"))
            if sentence_end > (max_chars * 2) // 5:
                text = cut[: sentence_end + 1]
            else:
                # No usable sentence boundary: mark the hard cut visibly.
                text = cut.rstrip() + "…"
        return text
    return ""
def _docs_page_path(rel_dir: str, source_label: str) -> str:
    """Compute the per-skill docs-site URL slug for a given SKILL.md location.

    Mirrors the slug logic in website/scripts/generate-skill-docs.py:
      bundled  + skills/<cat>/<slug>/SKILL.md          -> bundled/<cat>/<cat>-<slug>
      bundled  + skills/<cat>/<sub>/<slug>/SKILL.md    -> bundled/<cat>/<cat>-<sub>-<slug>
      optional + optional-skills/<cat>/<slug>/SKILL.md -> optional/<cat>/<cat>-<slug>

    Args:
        rel_dir: Directory holding SKILL.md, relative to the skills base
            directory. ``""`` and ``"."`` (what ``os.path.relpath`` returns
            for the base directory itself) both mean "no path".
        source_label: ``"built-in"`` selects the ``bundled`` section; any
            other label selects ``optional``.

    Returns:
        The slug, or ``""`` when the path is empty or nested deeper than
        three segments.
    """
    # normpath collapses "." segments and unifies separators, so a SKILL.md
    # sitting directly in the base directory cannot produce "bundled/./.-.".
    parts = [
        seg
        for seg in os.path.normpath(rel_dir or os.curdir).split(os.sep)
        if seg and seg != os.curdir
    ]
    if not 1 <= len(parts) <= 3:
        return ""

    section = "bundled" if source_label == "built-in" else "optional"
    category = parts[0]
    # A single-segment path uses the category as its own slug.
    page = "-".join(parts if len(parts) > 1 else [category, category])
    return f"{section}/{category}/{page}"
def _split_frontmatter(content: str):
    """Split SKILL.md text into ``(frontmatter_yaml, body)``; None if absent."""
    import re  # local import keeps this block self-contained

    if not content.startswith("---"):
        return None
    # Prefer delimiters that sit on their own line, so a literal "---" inside
    # a frontmatter value (e.g. a description) does not end the block early.
    match = re.match(
        r"---[ \t]*\r?\n(.*?)\r?\n---[ \t]*(?:\r?\n|$)(.*)", content, re.DOTALL
    )
    if match:
        return match.group(1), match.group(2)
    # Fall back to the permissive split for unusually formatted files.
    parts = content.split("---", 2)
    if len(parts) < 3:
        return None
    return parts[1], parts[2]


def _as_str_list(value) -> list:
    """Normalise a YAML scalar-or-list field to a list of non-empty strings."""
    if isinstance(value, str):
        value = value.strip()
        return [value] if value else []
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value if item]
    return []


def extract_local_skills():
    """Collect metadata for every SKILL.md under the local skill directories.

    Walks each ``(directory, source label)`` pair in ``LOCAL_SKILL_DIRS``
    beneath ``REPO_ROOT``. A SKILL.md is skipped when it has no frontmatter,
    when the frontmatter is not valid YAML, or when it does not parse to a
    non-empty mapping.

    Returns:
        A list of dicts with the keys ``name``, ``description``, ``overview``,
        ``category``, ``categoryLabel``, ``source``, ``tags``, ``platforms``,
        ``author``, ``version``, ``license``, ``envVars``, ``commands`` and
        ``docsPath``.
    """
    skills = []
    for base_dir, source_label in LOCAL_SKILL_DIRS:
        base_path = os.path.join(REPO_ROOT, base_dir)
        if not os.path.isdir(base_path):
            continue

        for root, dirs, files in os.walk(base_path):
            # Sorting in place makes os.walk visit directories in a stable
            # order regardless of the filesystem.
            dirs.sort()
            if "SKILL.md" not in files:
                continue

            skill_path = os.path.join(root, "SKILL.md")
            with open(skill_path, encoding="utf-8") as f:
                content = f.read()

            split = _split_frontmatter(content)
            if split is None:
                continue
            raw_frontmatter, raw_body = split

            try:
                fm = yaml.safe_load(raw_frontmatter)
            except yaml.YAMLError:
                continue
            if not fm or not isinstance(fm, dict):
                continue

            overview = _extract_overview(raw_body.strip())

            # relpath yields "." for a SKILL.md directly in base_path; treat
            # that as "no category" so it is later folded into "other".
            rel = os.path.relpath(root, base_path)
            rel_dir = "" if rel == os.curdir else rel
            category = rel_dir.split(os.sep)[0]

            # Tags: metadata.hermes.tags wins, top-level tags is the fallback.
            tags = []
            metadata = fm.get("metadata")
            if isinstance(metadata, dict):
                hermes_meta = metadata.get("hermes", {})
                if isinstance(hermes_meta, dict):
                    tags = _as_str_list(hermes_meta.get("tags"))
            if not tags:
                tags = _as_str_list(fm.get("tags"))

            # Optional structured prerequisites -- surfaced in the SkillCard panel
            prereq = fm.get("prerequisites") or {}
            env_vars = []
            commands = []
            if isinstance(prereq, dict):
                env_vars = _as_str_list(prereq.get("env_vars"))
                commands = _as_str_list(prereq.get("commands"))

            skills.append({
                # A null or non-string name would break the final sort in main().
                "name": str(fm.get("name") or os.path.basename(root)),
                "description": fm.get("description") or "",
                "overview": overview,
                "category": category,
                "categoryLabel": CATEGORY_LABELS.get(category, category.replace("-", " ").title()),
                "source": source_label,
                "tags": tags,
                "platforms": _as_str_list(fm.get("platforms")),
                "author": fm.get("author", ""),
                "version": fm.get("version", ""),
                "license": fm.get("license", ""),
                "envVars": env_vars,
                "commands": commands,
                "docsPath": _docs_page_path(rel_dir, source_label),
            })
    return skills
def extract_cached_index_skills():
    """Collect skill entries from the cached external index JSON files.

    Reads every ``*.json`` file in ``INDEX_CACHE_DIR``. Two payload shapes
    are understood:

    * a dict with an ``"agents"`` list -- each agent supplies ``identifier``,
      ``author`` and a ``meta`` dict (``title``, ``description``, ``tags``);
    * a list of entries with ``name`` / ``description`` / ``tags``; entries
      that themselves carry a ``"skills"`` list are skipped.

    Unreadable or malformed files, and malformed records inside a file, are
    skipped rather than aborting the run.

    Returns:
        A list of dicts with the keys ``name``, ``description``, ``category``,
        ``categoryLabel``, ``source``, ``tags``, ``platforms``, ``author``
        and ``version``.
    """
    skills = []
    if not os.path.isdir(INDEX_CACHE_DIR):
        return skills

    # Sorted so the result does not depend on directory listing order.
    for filename in sorted(os.listdir(INDEX_CACHE_DIR)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(INDEX_CACHE_DIR, filename)
        try:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
        except (ValueError, OSError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError alike.
            continue

        stem = os.path.splitext(filename)[0]
        source_label = next(
            (label for key, label in SOURCE_LABELS.items() if key in stem),
            "community",
        )

        if isinstance(data, dict) and "agents" in data:
            agents = data["agents"]
            if not isinstance(agents, list):
                continue
            for agent in agents:
                if not isinstance(agent, dict):
                    continue
                # "meta" may be missing, null, or not a mapping.
                meta = agent.get("meta")
                if not isinstance(meta, dict):
                    meta = {}
                tags = meta.get("tags") or []
                if isinstance(tags, str):
                    tags = [tags]
                description = str(meta.get("description") or "")
                skills.append({
                    "name": str(agent.get("identifier") or meta.get("title") or "unknown"),
                    "description": description.split("\n")[0][:200],
                    "category": _guess_category(tags),
                    "categoryLabel": "",  # filled below
                    "source": source_label,
                    "tags": tags,
                    "platforms": [],
                    "author": agent.get("author") or "",
                    "version": "",
                })
            continue

        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict) or not entry.get("name"):
                    continue
                if isinstance(entry.get("skills"), list):
                    continue
                skills.append({
                    "name": str(entry["name"]),
                    "description": entry.get("description") or "",
                    "category": "uncategorized",
                    "categoryLabel": "",
                    "source": source_label,
                    "tags": entry.get("tags") or [],
                    "platforms": [],
                    "author": "",
                    "version": "",
                })

    # Fill in labels: known slugs use the table, unknown ones are title-cased.
    for s in skills:
        if not s["categoryLabel"]:
            s["categoryLabel"] = CATEGORY_LABELS.get(
                s["category"],
                s["category"].replace("-", " ").title() if s["category"] else "Uncategorized",
            )
    return skills
# Reverse lookup built once at import time: lowercase tag -> category slug.
TAG_TO_CATEGORY = {}
for _cat, _tags in {
    "software-development": [
        "programming", "code", "coding", "software-development",
        "frontend-development", "backend-development", "web-development",
        "react", "python", "typescript", "java", "rust",
    ],
    "creative": ["writing", "design", "creative", "art", "image-generation"],
    "research": ["education", "academic", "research"],
    "social-media": ["marketing", "seo", "social-media"],
    "productivity": ["productivity", "business"],
    "data-science": ["data", "data-science"],
    "mlops": ["machine-learning", "deep-learning"],
    "devops": ["devops"],
    "gaming": ["gaming", "game", "game-development"],
    "media": ["music", "media", "video"],
    "health": ["health", "fitness"],
    "translation": ["translation", "language-learning"],
    "security": ["security", "cybersecurity"],
}.items():
    for _t in _tags:
        TAG_TO_CATEGORY[_t] = _cat


def _guess_category(tags: list) -> str:
    """Map a skill's tags to a category slug.

    The first tag found in ``TAG_TO_CATEGORY`` (compared case-insensitively)
    decides the category. When no tag is known, the first usable tag is
    turned into a slug by lowercasing it and replacing spaces with hyphens.

    Args:
        tags: Tag list from an external index. A bare string is treated as a
            single tag; non-string or blank items are ignored because cached
            index data is not validated upstream.

    Returns:
        A category slug, or ``"uncategorized"`` when no usable tag exists.
    """
    if isinstance(tags, str):
        # Without this a lone string would be iterated character by character.
        tags = [tags]
    if not isinstance(tags, (list, tuple)):
        return "uncategorized"

    names = [t.strip() for t in tags if isinstance(t, str) and t.strip()]
    if not names:
        return "uncategorized"

    for name in names:
        cat = TAG_TO_CATEGORY.get(name.lower())
        if cat:
            return cat
    return names[0].lower().replace(" ", "-")
# Categories holding fewer skills than this are folded into "other".
MIN_CATEGORY_SIZE = 4


def _consolidate_small_categories(skills: list) -> list:
    """Fold uncategorised and under-populated categories into "other".

    Works in two passes over the same list: first every skill whose category
    is ``"uncategorized"`` or empty is moved to ``"other"``; then category
    sizes are tallied once and every skill in a category with fewer than
    ``MIN_CATEGORY_SIZE`` members is moved as well.

    The skill dicts are modified in place and the same list is returned.
    """

    def _move_to_other(skill):
        skill["category"] = "other"
        skill["categoryLabel"] = "Other"

    for skill in skills:
        if skill["category"] in ("uncategorized", ""):
            _move_to_other(skill)

    # Tally once, after the first pass, so the threshold is judged against
    # sizes that already include the relabelled skills.
    tally = Counter(skill["category"] for skill in skills)
    undersized = {name for name, size in tally.items() if size < MIN_CATEGORY_SIZE}

    for skill in skills:
        if skill["category"] in undersized:
            _move_to_other(skill)
    return skills
def main():
    """Build the combined skills index and write it to ``OUTPUT`` as JSON.

    Local and externally indexed skills are merged, small categories are
    consolidated, and the result is ordered by source (built-in, optional,
    everything else), with the "other" category last inside each source,
    then by category and name. A short summary is printed afterwards.
    """
    local = extract_local_skills()
    external = extract_cached_index_skills()
    all_skills = _consolidate_small_categories(local + external)

    source_rank = {"built-in": 0, "optional": 1}

    def _sort_key(skill):
        category = skill["category"]
        return (
            source_rank.get(skill["source"], 2),
            int(category == "other"),
            category,
            skill["name"],
        )

    all_skills.sort(key=_sort_key)

    output_dir = os.path.dirname(OUTPUT)
    os.makedirs(output_dir, exist_ok=True)
    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(all_skills, f, indent=2)

    per_source = Counter(skill["source"] for skill in local)
    print(f"Extracted {len(all_skills)} skills to {OUTPUT}")
    print(f" {len(local)} local ({per_source['built-in']} built-in, "
          f"{per_source['optional']} optional)")
    print(f" {len(external)} from external indexes")


if __name__ == "__main__":
    main()