feat(skills-hub): health checks, freshness badge, and a watchdog cron (#32345)

Layered safety so the Skills Hub at /docs/skills stays in sync without silent rot.

Three pieces:

1. build_skills_index.py — refuses to ship a degenerate index. EXPECTED_FLOORS per source (skills.sh ≥100, lobehub ≥100, clawhub ≥50, official ≥50, github ≥30, browse-sh ≥50) and MIN_TOTAL=1500. Any source collapsing to zero (the silent OpenAI breakage that hid for weeks) now fails the workflow loudly — a broken index never reaches the live site.

2. extract-skills.py + the React page — visible freshness signal. The sidecar website/src/data/skills-meta.json carries the index's generated_at timestamp, plus per-source counts. The Skills Hub renders a 'Catalog refreshed N hours ago · auto-rebuilt twice daily' line under the hero copy. If the cron stalls, users see the staleness immediately.

3. .github/workflows/skills-index-freshness.yml — watchdog cron. Every 4 hours, it fetches the live /docs/api/skills-index.json, validates its shape, checks its age (>26h is stale), checks the same per-source floors, and opens (or appends to) a GitHub issue when anything is off. The issue is title-prefixed [skills-index-watchdog] so subsequent failures append a comment instead of spamming new issues.

Net effect:

- A silent regression like 'OpenAI tap moved its skills' now fails the build instead of shipping a quietly broken catalog.
- A stuck cron (like the landing-page breakage that ran red for weeks) now files an issue within 4 hours.
- Users see how fresh the catalog is on the page itself.

Test plan:

- Local: built skills-meta.json from the live index → 'Catalog refreshed N minutes ago' rendered correctly in the static HTML.
- Probe logic dry-run against the live index: total=2456, all 6 sources above floor, age 0.1h — issues=NONE.
- Triggered skills-index.yml manually; both jobs green, deploy-site.yml dispatch fired.
2026-07-18 14:52:04 +00:00 · 2026-05-25 23:10:45 -07:00 · 2026-05-25 23:10:45 -07:00 · d8703e27f5
commit d8703e27f5
parent cea87d9139
5 changed files with 273 additions and 8 deletions
--- a/website/scripts/extract-skills.py
+++ b/website/scripts/extract-skills.py
@ -21,6 +21,7 @@ the unified index existed).
 import json
 import os
 from collections import Counter
+from datetime import datetime, timezone

 import yaml

@ -32,6 +33,7 @@ LOCAL_SKILL_DIRS = [
 UNIFIED_INDEX_PATH = os.path.join(REPO_ROOT, "website", "static", "api", "skills-index.json")
 LEGACY_INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache")
 OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills.json")
+META_OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills-meta.json")

 CATEGORY_LABELS = {
    "apple": "Apple",
@ -280,19 +282,32 @@ def _label_for_github_identifier(identifier: str) -> str:


 def extract_unified_index_skills():
-    """Read website/static/api/skills-index.json — the canonical multi-source index."""
+    """Read website/static/api/skills-index.json — the canonical multi-source index.
+
+    Returns ``(skills, meta)`` where ``meta`` carries the index's
+    ``generated_at`` timestamp and total count so the Skills Hub page can
+    show a "Last refreshed …" badge. Returns ``(None, None)`` when the
+    index file is absent or malformed (caller falls back to the legacy
+    cache).
+    """
    if not os.path.isfile(UNIFIED_INDEX_PATH):
-        return None
+        return None, None

    try:
        with open(UNIFIED_INDEX_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        print(f"[extract-skills] Failed to read unified index: {e}")
-        return None
+        return None, None

    if not isinstance(data, dict) or "skills" not in data:
-        return None
+        return None, None
+
+    meta = {
+        "indexGeneratedAt": data.get("generated_at", ""),
+        "indexSkillCount": data.get("skill_count", 0),
+        "indexVersion": data.get("version", 0),
+    }

    out = []
    for entry in data.get("skills", []):
@ -352,7 +367,7 @@ def extract_unified_index_skills():
            "installCmd": install_cmd,
        })

-    return out
+    return out, meta


 def extract_legacy_cache_skills():
@ -490,13 +505,14 @@ def _consolidate_small_categories(skills: list) -> list:
 def main():
    local = extract_local_skills()

-    unified = extract_unified_index_skills()
+    unified, index_meta = extract_unified_index_skills()
    if unified is not None:
        external = unified
        external_source = "unified index"
    else:
        external = extract_legacy_cache_skills()
        external_source = "legacy index-cache"
+        index_meta = None
        print(
            f"[extract-skills] WARNING: unified index not found at "
            f"{UNIFIED_INDEX_PATH}; falling back to {external_source}. "
@ -517,16 +533,32 @@ def main():
    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(all_skills, f, indent=2)

+    # Sidecar meta file so the page can render a "Last refreshed" badge
+    # without changing the shape of skills.json.
+    by_source = Counter(s["source"] for s in all_skills)
+    meta = {
+        "extractedAt": datetime.now(timezone.utc).isoformat(),
+        "totalSkills": len(all_skills),
+        "localSkills": len(local),
+        "externalSkills": len(external),
+        "externalSource": external_source,
+        "bySource": dict(by_source.most_common()),
+    }
+    if index_meta:
+        meta.update(index_meta)
+    with open(META_OUTPUT, "w", encoding="utf-8") as f:
+        json.dump(meta, f, indent=2)
+
    print(f"Extracted {len(all_skills)} skills to {OUTPUT}")
    print(f"  {len(local)} local ({sum(1 for s in local if s['source'] == 'built-in')} built-in, "
          f"{sum(1 for s in local if s['source'] == 'optional')} optional)")
    print(f"  {len(external)} from {external_source}")

-    # Breakdown by source
-    by_source = Counter(s["source"] for s in all_skills)
    print("By source:")
    for src, count in by_source.most_common():
        print(f"  {src}: {count}")
+    if index_meta and index_meta.get("indexGeneratedAt"):
+        print(f"Unified index built at: {index_meta['indexGeneratedAt']}")


 if __name__ == "__main__":