#!/usr/bin/env python3
"""Extract skill metadata into website/src/data/skills.json for the Skills Hub page.

Two data sources:

1. Local SKILL.md files under ``skills/`` (built-in) and ``optional-skills/``
   (official optional). These give us full metadata — overview prose,
   version, license, env vars, commands — that the unified index doesn't
   carry.

2. The unified Hermes Skills Index at
   ``website/static/api/skills-index.json``, built twice daily by
   ``scripts/build_skills_index.py`` (workflow
   ``.github/workflows/skills-index.yml``). Covers skills.sh, ClawHub,
   browse.sh, LobeHub, Claude Marketplace, well-known endpoints, and the
   GitHub taps (openai/skills, anthropics/skills, huggingface/skills,
   VoltAgent, etc.).

Legacy fallback: if the unified index is missing AND ``skills/index-cache/``
contains pre-baked JSON dumps, we read those (preserves behaviour from
before the unified index existed).
"""

import json
import os
from collections import Counter

import yaml

REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# (directory under REPO_ROOT, source label written to the output records)
LOCAL_SKILL_DIRS = [
    ("skills", "built-in"),
    ("optional-skills", "optional"),
]
UNIFIED_INDEX_PATH = os.path.join(REPO_ROOT, "website", "static", "api", "skills-index.json")
LEGACY_INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache")
OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills.json")

# Category slug -> display label. Slugs missing here fall back to a
# title-cased version of the slug.
CATEGORY_LABELS = {
    "apple": "Apple",
    "autonomous-ai-agents": "AI Agents",
    "blockchain": "Blockchain",
    "communication": "Communication",
    "creative": "Creative",
    "data-science": "Data Science",
    "devops": "DevOps",
    "dogfood": "Dogfood",
    "domain": "Domain",
    "email": "Email",
    "gaming": "Gaming",
    "gifs": "GIFs",
    "github": "GitHub",
    "health": "Health",
    "inference-sh": "Inference",
    "leisure": "Leisure",
    "mcp": "MCP",
    "media": "Media",
    "migration": "Migration",
    "mlops": "MLOps",
    "note-taking": "Note-Taking",
    "productivity": "Productivity",
    "red-teaming": "Red Teaming",
    "research": "Research",
    "security": "Security",
    "smart-home": "Smart Home",
    "social-media": "Social Media",
    "software-development": "Software Dev",
    "translation": "Translation",
    "other": "Other",
}

# Map the source ids the unified index emits to the friendly labels the
# Skills Hub UI uses. Keep these in sync with the SOURCE_CONFIG dict in
# website/src/pages/skills/index.tsx.
UNIFIED_SOURCE_LABELS = {
    "official": "official",  # treated as our "optional" tier in the UI
    "skills.sh": "skills.sh",
    "skills-sh": "skills.sh",
    "clawhub": "ClawHub",
    "browse-sh": "browse.sh",
    "lobehub": "LobeHub",
    "claude-marketplace": "Claude Marketplace",
    "well-known": "Well-Known",
    "github": "GitHub",  # default for non-named GitHub taps
}

# Repo-specific labels for the unified index's "github" source. Lets us
# call out the well-known taps with their vendor name instead of a generic
# "GitHub" pill. Match is checked against the leading "owner/repo/" prefix
# of the identifier.
GITHUB_TAP_LABELS = {
    "openai/skills": "OpenAI",
    "anthropics/skills": "Anthropic",
    "huggingface/skills": "HuggingFace",
    "VoltAgent/awesome-agent-skills": "VoltAgent",
    "garrytan/gstack": "gstack",
    "MiniMax-AI/cli": "MiniMax",
}

# Legacy filename -> label mapping for the deprecated skills/index-cache/
# fallback. Used only when website/static/api/skills-index.json is absent.
LEGACY_SOURCE_LABELS = {
    "anthropics_skills": "Anthropic",
    "openai_skills": "OpenAI",
    "claude_marketplace": "Claude Marketplace",
    "lobehub": "LobeHub",
}


def _extract_overview(body: str) -> str:
    """Pull the first non-heading paragraph from a SKILL.md body.

    Only the first six blank-line-separated paragraphs are inspected.
    Heading lines are stripped from a paragraph; paragraphs that are pure
    headings, admonitions (``:::``) or code fences are skipped. The result
    is capped at roughly 500 characters, preferring to cut at a sentence
    end. Returns "" when nothing suitable is found.
    """
    if not body:
        return ""
    paragraphs = [p.strip() for p in body.split("\n\n") if p.strip()]
    for p in paragraphs[:6]:
        if p.startswith("#"):
            # Keep any prose that shares the paragraph with the heading.
            lines = [
                ln for ln in p.split("\n")
                if ln.strip() and not ln.lstrip().startswith("#")
            ]
            if not lines:
                continue
            p = "\n".join(lines).strip()
        if p.startswith(":::"):
            continue
        if p.startswith(("```", "~~~")):
            continue
        if len(p) > 500:
            cut = p[:500]
            last_period = cut.rfind(". ")
            if last_period > 200:
                # Cut at the last sentence boundary if it keeps enough text.
                p = cut[: last_period + 1]
            else:
                p = cut.rstrip() + "…"
        return p
    return ""


def _docs_page_path(rel_dir: str, source_label: str) -> str:
    """Compute the per-skill docs-site URL slug for a given SKILL.md location.

    Mirrors the slug logic in website/scripts/generate-skill-docs.py:

        built-in, <category>/<skill>        -> bundled/<category>/<category>-<skill>
        built-in, <category>/<sub>/<skill>  -> bundled/<category>/<category>-<sub>-<skill>
        optional, <category>/<skill>        -> optional/<category>/<category>-<skill>

    ``rel_dir`` is the skill directory relative to its base dir
    (``skills/`` or ``optional-skills/``). A single-component path uses
    that component as both category and skill. Returns "" for an empty
    path, for "." (SKILL.md directly in the base dir) and for paths deeper
    than three components.
    """
    # os.path.relpath() yields "." for the base dir itself; drop it so we do
    # not emit a bogus "bundled/./.-." slug.
    parts = [p for p in rel_dir.split(os.sep) if p and p != os.curdir]
    if not parts:
        return ""
    source_dir = "bundled" if source_label == "built-in" else "optional"
    if len(parts) == 1:
        category, slug = parts[0], parts[0]
        return f"{source_dir}/{category}/{category}-{slug}"
    if len(parts) == 2:
        category, slug = parts
        return f"{source_dir}/{category}/{category}-{slug}"
    if len(parts) == 3:
        category, sub, slug = parts
        return f"{source_dir}/{category}/{category}-{sub}-{slug}"
    return ""


def _install_command(source: str, identifier: str, name: str) -> str:
    """Build the ``hermes skills install …`` command for a unified-index entry.

    These show up in the SkillCard panel so users can copy-paste them.
    Without an identifier the bare skill ``name`` is used.
    """
    if not identifier:
        return f"hermes skills install {name}"
    if source.lower() == "clawhub":
        # ClawHub is the only source whose identifier gets a prefix added here.
        return f"hermes skills install clawhub/{identifier}"
    # Every other source uses its identifier verbatim, e.g.:
    #   official    -> "official/security/1password" (from OptionalSkillSource)
    #   skills.sh   -> already wrapped as "skills-sh/owner/repo/skill"
    #   browse-sh   -> already includes the "browse-sh/" prefix
    #   lobehub, claude-marketplace, github, well-known, unknown sources
    return f"hermes skills install {identifier}"


def _as_str_list(value) -> list:
    """Normalise a frontmatter value (list, string or anything else) to a list of str."""
    if isinstance(value, list):
        return [str(x) for x in value if x]
    if isinstance(value, str) and value.strip():
        return [value.strip()]
    return []


def extract_local_skills():
    """Collect metadata records for every SKILL.md under LOCAL_SKILL_DIRS.

    Files without a ``---`` delimited YAML frontmatter mapping, or whose
    frontmatter fails to parse, are skipped silently.

    Returns:
        A list of dicts, one per skill, in directory-walk order.
    """
    skills = []
    for base_dir, source_label in LOCAL_SKILL_DIRS:
        base_path = os.path.join(REPO_ROOT, base_dir)
        if not os.path.isdir(base_path):
            continue
        for root, _dirs, files in os.walk(base_path):
            if "SKILL.md" not in files:
                continue
            skill_path = os.path.join(root, "SKILL.md")
            with open(skill_path, encoding="utf-8") as f:
                content = f.read()
            if not content.startswith("---"):
                continue
            # "" / frontmatter / body
            parts = content.split("---", 2)
            if len(parts) < 3:
                continue
            try:
                fm = yaml.safe_load(parts[1])
            except yaml.YAMLError:
                continue
            if not fm or not isinstance(fm, dict):
                continue

            body = parts[2].strip()
            overview = _extract_overview(body)
            rel = os.path.relpath(root, base_path)
            category = rel.split(os.sep)[0]

            # Prefer metadata.hermes.tags, fall back to top-level tags.
            tags = []
            metadata = fm.get("metadata")
            if isinstance(metadata, dict):
                hermes_meta = metadata.get("hermes", {})
                if isinstance(hermes_meta, dict):
                    tags = hermes_meta.get("tags", [])
            if not tags:
                tags = fm.get("tags", [])
            if isinstance(tags, str):
                tags = [tags]
            elif not isinstance(tags, list):
                tags = []

            prereq = fm.get("prerequisites") or {}
            env_vars = []
            commands = []
            if isinstance(prereq, dict):
                env_vars = _as_str_list(prereq.get("env_vars"))
                commands = _as_str_list(prereq.get("commands"))

            skills.append({
                # str(): YAML may parse a numeric name as int, and an empty
                # "name:" key as None; either would break sorting in main().
                "name": str(fm.get("name") or os.path.basename(root)),
                "description": fm.get("description", ""),
                "overview": overview,
                "category": category,
                "categoryLabel": CATEGORY_LABELS.get(
                    category, category.replace("-", " ").title()
                ),
                "source": source_label,
                "tags": tags or [],
                "platforms": fm.get("platforms", []),
                "author": fm.get("author", ""),
                "version": fm.get("version", ""),
                "license": fm.get("license", ""),
                "envVars": env_vars,
                "commands": commands,
                "docsPath": _docs_page_path(rel, source_label),
            })
    return skills


def _label_for_github_identifier(identifier: str) -> str:
    """Return a friendly source label for a unified-index 'github' entry."""
    if not identifier:
        return "GitHub"
    for prefix, label in GITHUB_TAP_LABELS.items():
        if identifier == prefix or identifier.startswith(prefix + "/"):
            return label
    return "GitHub"


def extract_unified_index_skills():
    """Read website/static/api/skills-index.json — the canonical multi-source index.

    Returns:
        A list of skill dicts, or ``None`` when the index file is missing,
        unreadable, or does not contain a ``skills`` list (the caller then
        falls back to the legacy cache).
    """
    if not os.path.isfile(UNIFIED_INDEX_PATH):
        return None
    try:
        with open(UNIFIED_INDEX_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        print(f"[extract-skills] Failed to read unified index: {e}")
        return None
    if not isinstance(data, dict) or "skills" not in data:
        return None
    entries = data["skills"]
    if not isinstance(entries, list):
        # e.g. "skills": null — treat as a malformed index rather than crash.
        return None

    out = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        source_id = (entry.get("source") or "").lower()
        # Skip official entries here — extract_local_skills() already covered
        # those from optional-skills/ with full metadata (overview, version, etc.).
        if source_id == "official":
            continue

        identifier = str(entry.get("identifier") or "")
        name = str(entry.get("name") or identifier.split("/")[-1] or "unknown")
        # First line only, capped at 280 characters.
        description = str(entry.get("description") or "").split("\n")[0]
        if len(description) > 280:
            description = description[:277] + "…"
        tags = entry.get("tags", []) or []
        if not isinstance(tags, list):
            tags = []

        # Map source id -> display label
        if source_id == "github":
            source_label = _label_for_github_identifier(identifier)
        else:
            source_label = UNIFIED_SOURCE_LABELS.get(source_id, source_id or "community")

        # Author hint: for skills.sh entries the owner part of the "repo"
        # field is used; other sources leave the author blank.
        author = ""
        if source_id in {"skills.sh", "skills-sh"}:
            repo = entry.get("repo", "")
            if repo and isinstance(repo, str):
                author = repo.split("/")[0]

        out.append({
            "name": name,
            "description": description,
            "overview": "",
            # Guess a category from tags so the UI's category filter has a chance.
            "category": _guess_category(tags),
            "categoryLabel": "",  # filled in _consolidate_small_categories
            "source": source_label,
            "tags": tags,
            "platforms": [],
            "author": author,
            "version": "",
            "license": "",
            "envVars": [],
            "commands": [],
            "docsPath": "",
            "identifier": identifier,
            "installCmd": _install_command(source_id, identifier, name),
        })
    return out


def extract_legacy_cache_skills():
    """Read the deprecated skills/index-cache/ snapshots — fallback only.

    Two file shapes are understood: a dict with an ``agents`` list, and a
    plain list of entries with a ``name`` (entries that carry a nested
    ``skills`` list are skipped). Unreadable files are ignored.
    """
    skills = []
    if not os.path.isdir(LEGACY_INDEX_CACHE_DIR):
        return skills

    # sorted() keeps the output independent of filesystem listing order.
    for filename in sorted(os.listdir(LEGACY_INDEX_CACHE_DIR)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(LEGACY_INDEX_CACHE_DIR, filename)
        try:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, OSError):
            continue

        stem = os.path.splitext(filename)[0]
        source_label = "community"
        for key, label in LEGACY_SOURCE_LABELS.items():
            if key in stem:
                source_label = label
                break

        if isinstance(data, dict) and "agents" in data:
            for agent in data["agents"] or []:
                if not isinstance(agent, dict):
                    continue
                meta = agent.get("meta")
                if not isinstance(meta, dict):
                    meta = {}
                skills.append({
                    "name": str(agent.get("identifier") or meta.get("title") or "unknown"),
                    "description": (meta.get("description", "") or "").split("\n")[0][:200],
                    "category": _guess_category(meta.get("tags", [])),
                    "categoryLabel": "",
                    "source": source_label,
                    "tags": meta.get("tags", []),
                    "platforms": [],
                    "author": agent.get("author", ""),
                    "version": "",
                })
            continue

        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict) or not entry.get("name"):
                    continue
                if "skills" in entry and isinstance(entry["skills"], list):
                    continue
                skills.append({
                    "name": str(entry.get("name", "")),
                    "description": entry.get("description", ""),
                    "category": "uncategorized",
                    "categoryLabel": "",
                    "source": source_label,
                    "tags": entry.get("tags", []),
                    "platforms": [],
                    "author": "",
                    "version": "",
                })

    for s in skills:
        if not s["categoryLabel"]:
            s["categoryLabel"] = CATEGORY_LABELS.get(
                s["category"],
                s["category"].replace("-", " ").title() if s["category"] else "Uncategorized",
            )
    return skills


# Category slug -> tags that imply it; inverted into TAG_TO_CATEGORY below.
_CATEGORY_TAGS = {
    "software-development": [
        "programming", "code", "coding", "software-development",
        "frontend-development", "backend-development", "web-development",
        "react", "python", "typescript", "java", "rust",
    ],
    "creative": ["writing", "design", "creative", "art", "image-generation"],
    "research": ["education", "academic", "research"],
    "social-media": ["marketing", "seo", "social-media"],
    "productivity": ["productivity", "business"],
    "data-science": ["data", "data-science"],
    "mlops": ["machine-learning", "deep-learning"],
    "devops": ["devops"],
    "gaming": ["gaming", "game", "game-development"],
    "media": ["music", "media", "video"],
    "health": ["health", "fitness"],
    "translation": ["translation", "language-learning"],
    "security": ["security", "cybersecurity"],
}
TAG_TO_CATEGORY = {
    _tag: _cat for _cat, _tags in _CATEGORY_TAGS.items() for _tag in _tags
}


def _guess_category(tags: list) -> str:
    """Guess a category slug from a skill's tags.

    The first tag found in TAG_TO_CATEGORY (case-insensitive) wins.
    Otherwise the first tag itself, lower-cased with spaces turned into
    hyphens, is used. Returns "uncategorized" when there are no usable tags.
    """
    if isinstance(tags, str):
        # A bare string is a single tag, not a sequence of characters.
        tags = [tags]
    if not tags or not isinstance(tags, (list, tuple)):
        return "uncategorized"
    for tag in tags:
        if not isinstance(tag, str):
            continue
        cat = TAG_TO_CATEGORY.get(tag.lower())
        if cat:
            return cat
    first = tags[0] if isinstance(tags[0], str) else ""
    return first.lower().replace(" ", "-") if first else "uncategorized"


MIN_CATEGORY_SIZE = 4


def _consolidate_small_categories(skills: list) -> list:
    """Fold tiny and missing categories into "other" and fill in empty labels.

    Mutates the skill dicts in place and returns the same list. A category
    with fewer than MIN_CATEGORY_SIZE skills is merged into "other".
    """
    for s in skills:
        if s["category"] in {"uncategorized", ""}:
            s["category"] = "other"
            s["categoryLabel"] = "Other"

    counts = Counter(s["category"] for s in skills)
    small_cats = {cat for cat, n in counts.items() if n < MIN_CATEGORY_SIZE}

    for s in skills:
        if s["category"] in small_cats:
            s["category"] = "other"
            s["categoryLabel"] = "Other"
        elif not s["categoryLabel"]:
            s["categoryLabel"] = CATEGORY_LABELS.get(
                s["category"],
                s["category"].replace("-", " ").title() if s["category"] else "Uncategorized",
            )
    return skills


def main():
    """Gather local and external skills, write OUTPUT, and print a summary."""
    local = extract_local_skills()

    unified = extract_unified_index_skills()
    if unified is not None:
        external = unified
        external_source = "unified index"
    else:
        external = extract_legacy_cache_skills()
        external_source = "legacy index-cache"
        print(
            f"[extract-skills] WARNING: unified index not found at "
            f"{UNIFIED_INDEX_PATH}; falling back to {external_source}. "
            f"Run `python3 scripts/build_skills_index.py` to refresh."
        )

    all_skills = _consolidate_small_categories(local + external)

    # Built-in first, then optional, then everything else; within a source
    # the catch-all "other" category sorts last.
    source_order = {"built-in": 0, "optional": 1}
    all_skills.sort(key=lambda s: (
        source_order.get(s["source"], 2),
        1 if s["category"] == "other" else 0,
        s["category"],
        s["name"],
    ))

    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(all_skills, f, indent=2)

    print(f"Extracted {len(all_skills)} skills to {OUTPUT}")
    print(f" {len(local)} local ({sum(1 for s in local if s['source'] == 'built-in')} built-in, "
          f"{sum(1 for s in local if s['source'] == 'optional')} optional)")
    print(f" {len(external)} from {external_source}")

    # Breakdown by source
    by_source = Counter(s["source"] for s in all_skills)
    print("By source:")
    for src, count in by_source.most_common():
        print(f" {src}: {count}")


if __name__ == "__main__":
    main()