fix(skills-hub): show every catalog source on /docs/skills (skills.sh, ClawHub, browse.sh, OpenAI, …) (#32336)

The Skills Hub page was stuck on a stale Feb 25 snapshot, showing only Built-in + Optional + Anthropic + LobeHub. The unified index already has 2078 skills from skills.sh / ClawHub / LobeHub / GitHub taps / Claude Marketplace, and BrowseShSource adds another ~330 — none of it was reaching the page.

Changes:

- website/scripts/extract-skills.py: read website/static/api/skills-index.json (the unified multi-source catalog, rebuilt twice daily) as the canonical external source. Keep the legacy skills/index-cache/ fallback for offline builds. Add friendly per-source labels (skills.sh, ClawHub, browse.sh, OpenAI, HuggingFace, Anthropic, LobeHub, etc.) and a per-entry installCmd.
- website/src/pages/skills/index.tsx: add source pills + ordering for the 11 new sources; render installCmd from the index entry.
- website/scripts/prebuild.mjs: when no local skills-index.json exists (or the local copy is more than 24h old), fetch the live one from hermes-agent.nousresearch.com so local 'npm run build' matches production without burning GitHub API quota.
- scripts/build_skills_index.py: crawl BrowseShSource so browse.sh entries land in the unified index. Adjust source_order.
- tools/skills_hub.py: GitHubSource.DEFAULT_TAPS — openai/skills moved its skills into skills/.curated/ and skills/.system/, so add both as explicit taps (the listing code skips dotted dirs by design). Drop VoltAgent/awesome-agent-skills (README-only, no SKILL.md files) and MiniMax-AI/cli (singular skill, not a tap directory). Net effect: github source jumps from 83 → 143 skills, with OpenAI properly included.
- .github/workflows/deploy-site.yml: build the unified index BEFORE running extract-skills.py — the previous order meant extract-skills always fell back to the legacy cache. Drop the 'skip if file exists' guard; the file is gitignored and must be rebuilt every deploy.
- .github/workflows/skills-index.yml: drop the broken 'deploy-with-index' job (it cp'd 'landingpage/*', which no longer exists, failing every cron run since the landingpage move). Replace it with a workflow_dispatch trigger of deploy-site.yml so the index refresh still reaches production on schedule.
- website/docs/user-guide/features/skills.md: drop VoltAgent from the default-taps doc list to match the code.

Before: 695 skills (Built-in 90, Optional 84, Anthropic 16, LobeHub 505).
After: 2168 skills across 9 source pills, including the 1212 skills.sh entries the user expected to see.
2026-07-17 14:42:06 +00:00 · 2026-05-25 18:34:54 -07:00 · 2026-05-25 18:34:54 -07:00 · cea87d9139
commit cea87d9139
parent c26af46811
8 changed files with 396 additions and 95 deletions
--- a/website/scripts/extract-skills.py
+++ b/website/scripts/extract-skills.py
@ -1,5 +1,22 @@
 #!/usr/bin/env python3
-"""Extract skill metadata from SKILL.md files and index caches into JSON."""
+"""Extract skill metadata into website/src/data/skills.json for the Skills Hub page.
+
+Two data sources:
+
+1. Local SKILL.md files under ``skills/`` (built-in) and ``optional-skills/``
+   (official optional). These give us full metadata — overview prose, version,
+   license, env vars, commands — that the unified index doesn't carry.
+
+2. The unified Hermes Skills Index at ``website/static/api/skills-index.json``,
+   built twice daily by ``scripts/build_skills_index.py`` (workflow
+   ``.github/workflows/skills-index.yml``). Covers skills.sh, ClawHub, browse.sh,
+   LobeHub, Claude Marketplace, well-known endpoints, and the GitHub taps
+   (openai/skills, anthropics/skills, huggingface/skills, etc.).
+
+Legacy fallback: if the unified index is missing AND ``skills/index-cache/``
+contains pre-baked JSON dumps, we read those (preserves behaviour from before
+the unified index existed).
+"""

 import json
 import os
@ -12,7 +29,8 @@ LOCAL_SKILL_DIRS = [
    ("skills", "built-in"),
    ("optional-skills", "optional"),
 ]
-INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache")
+UNIFIED_INDEX_PATH = os.path.join(REPO_ROOT, "website", "static", "api", "skills-index.json")
+LEGACY_INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache")
 OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills.json")

 CATEGORY_LABELS = {
@ -48,7 +66,37 @@ CATEGORY_LABELS = {
    "other": "Other",
 }

-SOURCE_LABELS = {
+# Map the source ids the unified index emits to the friendly labels the
+# Skills Hub UI uses. Keep these in sync with the SOURCE_CONFIG dict in
+# website/src/pages/skills/index.tsx.
+UNIFIED_SOURCE_LABELS = {
+    "official": "official",   # treated as our "optional" tier in the UI
+    "skills.sh": "skills.sh",
+    "skills-sh": "skills.sh",
+    "clawhub": "ClawHub",
+    "browse-sh": "browse.sh",
+    "lobehub": "LobeHub",
+    "claude-marketplace": "Claude Marketplace",
+    "well-known": "Well-Known",
+    "github": "GitHub",  # default for non-named GitHub taps
+}
+
+# Repo-specific labels for the unified index's "github" source. Lets us
+# call out the well-known taps with their vendor name instead of a generic
+# "GitHub" pill. Match is checked against the leading "owner/repo/" prefix
+# of the identifier.
+GITHUB_TAP_LABELS = {
+    "openai/skills": "OpenAI",
+    "anthropics/skills": "Anthropic",
+    "huggingface/skills": "HuggingFace",
+    "VoltAgent/awesome-agent-skills": "VoltAgent",
+    "garrytan/gstack": "gstack",
+    "MiniMax-AI/cli": "MiniMax",
+}
+
+# Legacy filename -> label mapping for the deprecated skills/index-cache/
+# fallback. Used only when website/static/api/skills-index.json is absent.
+LEGACY_SOURCE_LABELS = {
    "anthropics_skills": "Anthropic",
    "openai_skills": "OpenAI",
    "claude_marketplace": "Claude Marketplace",
@ -57,31 +105,21 @@ SOURCE_LABELS = {


 def _extract_overview(body: str) -> str:
-    """Pull the first non-heading paragraph from a SKILL.md body.
-
-    Skips H1/H2/etc. lines so the overview is real prose, not a heading.
-    Strips markdown links/code-fence syntax to plain-ish text. Capped at
-    ~500 chars so the SkillCard panel stays a reasonable size.
-    """
+    """Pull the first non-heading paragraph from a SKILL.md body."""
    if not body:
        return ""
    paragraphs = [p.strip() for p in body.split("\n\n") if p.strip()]
    for p in paragraphs[:6]:
-        # Skip pure heading paragraphs ("# Foo", "## Foo")
        if p.startswith("#"):
-            # If a heading paragraph also has body text on later lines, take those
            lines = [ln for ln in p.split("\n") if ln.strip() and not ln.lstrip().startswith("#")]
            if lines:
                p = "\n".join(lines).strip()
            else:
                continue
-        # Skip a leading admonition fence (:::tip / :::info / etc.)
        if p.startswith(":::"):
            continue
-        # Skip pure code fences and frontmatter-style blocks
        if p.startswith("```") or p.startswith("~~~"):
            continue
-        # Trim to roughly 500 chars at a sentence boundary
        if len(p) > 500:
            cut = p[:500]
            last_period = cut.rfind(". ")
@ -117,6 +155,37 @@ def _docs_page_path(rel_dir: str, source_label: str) -> str:
    return ""


+def _install_command(source: str, identifier: str, name: str) -> str:
+    """Build the ``hermes skills install …`` command for a unified-index entry.
+
+    These show up in the SkillCard panel so users can copy-paste them. We try
+    to use the most idiomatic identifier per source.
+    """
+    if not identifier:
+        return f"hermes skills install {name}"
+    src = source.lower()
+    if src in {"official", "built-in", "optional"}:
+        # OptionalSkillSource emits identifiers like "official/security/1password"
+        return f"hermes skills install {identifier}"
+    if src in {"skills.sh", "skills-sh"}:
+        # Already wrapped as "skills-sh/owner/repo/skill" by the source
+        return f"hermes skills install {identifier}"
+    if src == "clawhub":
+        return f"hermes skills install clawhub/{identifier}"
+    if src == "browse-sh":
+        # Identifier already includes the "browse-sh/" prefix from BrowseShSource
+        return f"hermes skills install {identifier}"
+    if src == "lobehub":
+        return f"hermes skills install {identifier}"
+    if src == "claude-marketplace":
+        return f"hermes skills install {identifier}"
+    if src == "github":
+        return f"hermes skills install {identifier}"
+    if src == "well-known":
+        return f"hermes skills install {identifier}"
+    return f"hermes skills install {identifier}"
+
+
 def extract_local_skills():
    skills = []

@ -165,7 +234,6 @@ def extract_local_skills():
            if isinstance(tags, str):
                tags = [tags]

-            # Optional structured prerequisites — surfaced in the SkillCard panel
            prereq = fm.get("prerequisites") or {}
            env_vars = []
            commands = []
@ -201,17 +269,104 @@ def extract_local_skills():
    return skills


-def extract_cached_index_skills():
+def _label_for_github_identifier(identifier: str) -> str:
+    """Return a friendly source label for a unified-index 'github' entry."""
+    if not identifier:
+        return "GitHub"
+    for prefix, label in GITHUB_TAP_LABELS.items():
+        if identifier.startswith(prefix + "/") or identifier == prefix:
+            return label
+    return "GitHub"
+
+
+def extract_unified_index_skills():
+    """Read website/static/api/skills-index.json — the canonical multi-source index."""
+    if not os.path.isfile(UNIFIED_INDEX_PATH):
+        return None
+
+    try:
+        with open(UNIFIED_INDEX_PATH, encoding="utf-8") as f:
+            data = json.load(f)
+    except (json.JSONDecodeError, OSError) as e:
+        print(f"[extract-skills] Failed to read unified index: {e}")
+        return None
+
+    if not isinstance(data, dict) or "skills" not in data:
+        return None
+
+    out = []
+    for entry in data.get("skills", []):
+        if not isinstance(entry, dict):
+            continue
+        source_id = (entry.get("source") or "").lower()
+        identifier = entry.get("identifier", "") or ""
+        name = entry.get("name") or identifier.split("/")[-1] or "unknown"
+        description = (entry.get("description") or "").split("\n")[0]
+        if len(description) > 280:
+            description = description[:277] + "…"
+        tags = entry.get("tags", []) or []
+        if not isinstance(tags, list):
+            tags = []
+
+        # Skip official entries here — extract_local_skills() already covered
+        # those from optional-skills/ with full metadata (overview, version, etc.).
+        if source_id == "official":
+            continue
+
+        # Map source id -> display label
+        if source_id == "github":
+            source_label = _label_for_github_identifier(identifier)
+        else:
+            source_label = UNIFIED_SOURCE_LABELS.get(source_id, source_id or "community")
+
+        # Guess a category from tags so the UI's category filter has a chance.
+        category = _guess_category(tags)
+        extra = entry.get("extra", {}) or {}
+
+        # Author hint: for skills.sh entries, take the owner segment of the
+        # entry's "repo" field; no author is derived for other sources.
+        author = ""
+        if source_id in {"skills.sh", "skills-sh"}:
+            repo = entry.get("repo", "")
+            if repo:
+                author = repo.split("/")[0]
+
+        install_cmd = _install_command(source_id, identifier, name)
+
+        out.append({
+            "name": name,
+            "description": description,
+            "overview": "",
+            "category": category,
+            "categoryLabel": "",  # filled in _consolidate_small_categories
+            "source": source_label,
+            "tags": tags,
+            "platforms": [],
+            "author": author,
+            "version": "",
+            "license": "",
+            "envVars": [],
+            "commands": [],
+            "docsPath": "",
+            "identifier": identifier,
+            "installCmd": install_cmd,
+        })
+
+    return out
+
+
+def extract_legacy_cache_skills():
+    """Read the deprecated skills/index-cache/ snapshots — fallback only."""
    skills = []

-    if not os.path.isdir(INDEX_CACHE_DIR):
+    if not os.path.isdir(LEGACY_INDEX_CACHE_DIR):
        return skills

-    for filename in os.listdir(INDEX_CACHE_DIR):
+    for filename in os.listdir(LEGACY_INDEX_CACHE_DIR):
        if not filename.endswith(".json"):
            continue

-        filepath = os.path.join(INDEX_CACHE_DIR, filename)
+        filepath = os.path.join(LEGACY_INDEX_CACHE_DIR, filename)
        try:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
@ -220,7 +375,7 @@ def extract_cached_index_skills():

        stem = filename.replace(".json", "")
        source_label = "community"
-        for key, label in SOURCE_LABELS.items():
+        for key, label in LEGACY_SOURCE_LABELS.items():
            if key in stem:
                source_label = label
                break
@ -233,7 +388,7 @@ def extract_cached_index_skills():
                    "name": agent.get("identifier", agent.get("meta", {}).get("title", "unknown")),
                    "description": (agent.get("meta", {}).get("description", "") or "").split("\n")[0][:200],
                    "category": _guess_category(agent.get("meta", {}).get("tags", [])),
-                    "categoryLabel": "",  # filled below
+                    "categoryLabel": "",
                    "source": source_label,
                    "tags": agent.get("meta", {}).get("tags", []),
                    "platforms": [],
@ -298,10 +453,13 @@ def _guess_category(tags: list) -> str:
    if not tags:
        return "uncategorized"
    for tag in tags:
+        if not isinstance(tag, str):
+            continue
        cat = TAG_TO_CATEGORY.get(tag.lower())
        if cat:
            return cat
-    return tags[0].lower().replace(" ", "-")
+    first = tags[0] if isinstance(tags[0], str) else ""
+    return first.lower().replace(" ", "-") if first else "uncategorized"


 MIN_CATEGORY_SIZE = 4
@ -320,13 +478,30 @@ def _consolidate_small_categories(skills: list) -> list:
        if s["category"] in small_cats:
            s["category"] = "other"
            s["categoryLabel"] = "Other"
+        elif not s["categoryLabel"]:
+            s["categoryLabel"] = CATEGORY_LABELS.get(
+                s["category"],
+                s["category"].replace("-", " ").title() if s["category"] else "Uncategorized",
+            )

    return skills


 def main():
    local = extract_local_skills()
-    external = extract_cached_index_skills()
+
+    unified = extract_unified_index_skills()
+    if unified is not None:
+        external = unified
+        external_source = "unified index"
+    else:
+        external = extract_legacy_cache_skills()
+        external_source = "legacy index-cache"
+        print(
+            f"[extract-skills] WARNING: unified index not found at "
+            f"{UNIFIED_INDEX_PATH}; falling back to {external_source}. "
+            f"Run `python3 scripts/build_skills_index.py` to refresh."
+        )

    all_skills = _consolidate_small_categories(local + external)

@ -345,7 +520,13 @@ def main():
    print(f"Extracted {len(all_skills)} skills to {OUTPUT}")
    print(f"  {len(local)} local ({sum(1 for s in local if s['source'] == 'built-in')} built-in, "
          f"{sum(1 for s in local if s['source'] == 'optional')} optional)")
-    print(f"  {len(external)} from external indexes")
+    print(f"  {len(external)} from {external_source}")
+
+    # Breakdown by source
+    by_source = Counter(s["source"] for s in all_skills)
+    print("By source:")
+    for src, count in by_source.most_common():
+        print(f"  {src}: {count}")


 if __name__ == "__main__":
--- a/website/scripts/prebuild.mjs
+++ b/website/scripts/prebuild.mjs
@ -8,6 +8,13 @@
 // CI workflows still run the extraction explicitly, which is a no-op duplicate
 // but matches their historical behaviour.
 //
+// We also try to pull a fresh copy of skills-index.json (the unified
+// multi-source catalog) from the live docs site if it's missing on disk or
+// older than 24h. That way local `npm run build` doesn't have to wait on
+// scripts/build_skills_index.py crawling every skill source — which takes
+// several minutes and burns GitHub API quota — but still gets the same
+// 2000+ external skills the deployed site has.
+//
 // If python3 or its deps (pyyaml) aren't available on the local machine, we
 // fall back to writing an empty skills.json so `npm run build` still
 // succeeds — the Skills Hub page just shows an empty state, and llms.txt
@ -15,7 +22,7 @@
 // deploys get real data.

 import { spawnSync } from "node:child_process";
-import { mkdirSync, writeFileSync, existsSync } from "node:fs";
+import { mkdirSync, writeFileSync, existsSync, statSync } from "node:fs";
 import { dirname, join, resolve } from "node:path";
 import { fileURLToPath } from "node:url";

@ -24,6 +31,10 @@ const websiteDir = resolve(scriptDir, "..");
 const extractScript = join(scriptDir, "extract-skills.py");
 const llmsScript = join(scriptDir, "generate-llms-txt.py");
 const outputFile = join(websiteDir, "src", "data", "skills.json");
+const unifiedIndexFile = join(websiteDir, "static", "api", "skills-index.json");
+const UNIFIED_INDEX_URL =
+  "https://hermes-agent.nousresearch.com/docs/api/skills-index.json";
+const UNIFIED_INDEX_MAX_AGE_MS = 24 * 60 * 60 * 1000; // 24h

 function writeEmptyFallback(reason) {
  mkdirSync(dirname(outputFile), { recursive: true });
@ -51,6 +62,64 @@ function runPython(script, label) {
  return true;
 }

+async function ensureUnifiedIndex() {
+  // If we have a recent copy on disk, trust it.
+  if (existsSync(unifiedIndexFile)) {
+    try {
+      const age = Date.now() - statSync(unifiedIndexFile).mtimeMs;
+      if (age < UNIFIED_INDEX_MAX_AGE_MS) {
+        return true;
+      }
+      console.log(
+        `[prebuild] skills-index.json is ${(age / 3600000).toFixed(1)}h old; ` +
+          `refreshing from ${UNIFIED_INDEX_URL}`,
+      );
+    } catch {
+      // fall through to re-fetch
+    }
+  }
+
+  try {
+    const resp = await fetch(UNIFIED_INDEX_URL, {
+      headers: { accept: "application/json" },
+    });
+    if (!resp.ok) {
+      console.warn(
+        `[prebuild] skills-index.json fetch returned HTTP ${resp.status}; ` +
+          `using local copy if any`,
+      );
+      return existsSync(unifiedIndexFile);
+    }
+    const text = await resp.text();
+    // Sanity check: must be valid JSON with a skills array
+    try {
+      const parsed = JSON.parse(text);
+      if (!parsed || !Array.isArray(parsed.skills)) {
+        console.warn(
+          "[prebuild] skills-index.json from live site has no skills array; ignoring",
+        );
+        return existsSync(unifiedIndexFile);
+      }
+    } catch (e) {
+      console.warn(`[prebuild] skills-index.json from live site is not valid JSON: ${e}`);
+      return existsSync(unifiedIndexFile);
+    }
+    mkdirSync(dirname(unifiedIndexFile), { recursive: true });
+    writeFileSync(unifiedIndexFile, text);
+    console.log(
+      `[prebuild] downloaded skills-index.json from ${UNIFIED_INDEX_URL} ` +
+        `(${(text.length / 1024).toFixed(0)} KB)`,
+    );
+    return true;
+  } catch (e) {
+    console.warn(`[prebuild] skills-index.json fetch failed: ${e}`);
+    return existsSync(unifiedIndexFile);
+  }
+}
+
+// 0) Pull unified index if we don't have a fresh one.
+await ensureUnifiedIndex();
+
 // 1) skills.json — required for the Skills Hub page.
 if (!existsSync(extractScript)) {
  writeEmptyFallback("extract script missing");