From d8703e27f5c3417bc05ddc792b6026b538a376f9 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 25 May 2026 23:10:45 -0700 Subject: [PATCH] feat(skills-hub): health checks, freshness badge, and a watchdog cron (#32345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Layered safety so the Skills Hub at /docs/skills stays in sync without silent rot. Three pieces: 1. build_skills_index.py — refuses to ship a degenerate index. EXPECTED_FLOORS per source (skills.sh ≥100, lobehub ≥100, clawhub ≥50, official ≥50, github ≥30, browse-sh ≥50) and MIN_TOTAL=1500. Any source collapsing to zero (the silent OpenAI breakage that hid for weeks) now fails the workflow loud — broken index never reaches the live site. 2. extract-skills.py + the React page — visible freshness signal. Sidecar website/src/data/skills-meta.json carries the index's generated_at timestamp, plus per-source counts. Skills Hub renders a 'Catalog refreshed N hours ago · auto-rebuilt twice daily' line under the hero copy. If the cron stalls, users see the staleness immediately. 3. .github/workflows/skills-index-freshness.yml — watchdog cron. Every 4 hours, fetches the live /docs/api/skills-index.json, validates shape, checks age (>26h is stale), checks the same per-source floors, and opens (or appends to) a GitHub issue when anything is off. The issue is title-prefixed [skills-index-watchdog] so subsequent failures append a comment instead of spamming new issues. Net effect: - A silent regression like 'OpenAI tap moved its skills' now fails the build instead of shipping a quietly broken catalog. - A stuck cron (like the landingpage breakage that ran red for weeks) now files an issue within 4 hours. - Users see how fresh the catalog is on the page itself. Test plan: - Local: built skills-meta.json from the live index → 'Catalog refreshed N minutes ago' rendered correctly in the static HTML. 
- Probe logic dry-run against the live index: total=2456, all 6 sources above floor, age 0.1h — issues=NONE. - Triggered skills-index.yml manually; both jobs green, deploy-site.yml dispatch fired. --- .github/workflows/skills-index-freshness.yml | 149 +++++++++++++++++++ scripts/build_skills_index.py | 44 ++++++ website/.gitignore | 1 + website/scripts/extract-skills.py | 48 +++++- website/src/pages/skills/index.tsx | 39 +++++ 5 files changed, 273 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/skills-index-freshness.yml diff --git a/.github/workflows/skills-index-freshness.yml b/.github/workflows/skills-index-freshness.yml new file mode 100644 index 00000000000..856878def5f --- /dev/null +++ b/.github/workflows/skills-index-freshness.yml @@ -0,0 +1,149 @@ +name: Skills Index Freshness Check + +# Belt-and-suspenders for the twice-daily build_skills_index pipeline. +# If the live /docs/api/skills-index.json ever goes more than 26 hours +# stale OR the file disappears entirely OR a major source has collapsed, +# this workflow opens a GitHub issue so we hear about it before users do. +# +# Triggered every 4 hours so we catch a stuck cron within one tick. + +on: + schedule: + - cron: '0 */4 * * *' + workflow_dispatch: + +permissions: + contents: read + issues: write + +jobs: + check-freshness: + if: github.repository == 'NousResearch/hermes-agent' + runs-on: ubuntu-latest + steps: + - name: Probe live index + id: probe + run: | + set -e + URL="https://hermes-agent.nousresearch.com/docs/api/skills-index.json" + echo "Probing $URL" + # -L follows redirects; -f fails on HTTP errors; -s suppresses progress + if ! 
curl -fsSL -o /tmp/skills-index.json "$URL"; then + echo "status=fetch-failed" >> "$GITHUB_OUTPUT" + echo "detail=Could not download $URL" >> "$GITHUB_OUTPUT" + exit 0 + fi + # Validate + extract generated_at and per-source counts + python3 <<'PY' >> "$GITHUB_OUTPUT" + import json, sys + from datetime import datetime, timezone + + try: + with open("/tmp/skills-index.json") as f: + data = json.load(f) + except Exception as e: + print(f"status=parse-failed") + print(f"detail=JSON decode error: {e}") + sys.exit(0) + + generated_at = data.get("generated_at", "") + total = data.get("skill_count", 0) + skills = data.get("skills", []) + if not isinstance(skills, list): + print("status=invalid-shape") + print(f"detail=skills field is not a list (got {type(skills).__name__})") + sys.exit(0) + + # Per-source counts + from collections import Counter + by_src = Counter(s.get("source", "") for s in skills) + + # Freshness + age_hours = None + try: + ts = datetime.fromisoformat(generated_at.replace("Z", "+00:00")) + age_hours = (datetime.now(timezone.utc) - ts).total_seconds() / 3600 + except Exception: + pass + + # Floors — same as build_skills_index.py EXPECTED_FLOORS. 
+ floors = { + "skills.sh": 100, + "lobehub": 100, + "clawhub": 50, + "official": 50, + "github": 30, + "browse-sh": 50, + } + issues = [] + if age_hours is not None and age_hours > 26: + issues.append(f"Index is {age_hours:.1f}h old (limit 26h)") + for src, floor in floors.items(): + count = by_src.get(src, 0) + if src == "skills.sh": + count = by_src.get("skills.sh", 0) + by_src.get("skills-sh", 0) + if count < floor: + issues.append(f"{src}: {count} < {floor}") + if total < 1500: + issues.append(f"total skills: {total} < 1500") + + if issues: + detail = "; ".join(issues) + print("status=degraded") + # GITHUB_OUTPUT doesn't allow newlines without explicit delimiter + print(f"detail={detail}") + else: + print("status=ok") + print(f"detail=Index OK — {total} skills, generated {generated_at}") + by_summary = ", ".join(f"{k}={v}" for k, v in by_src.most_common(8)) + print(f"summary={by_summary}") + PY + + - name: Report status + run: | + echo "Probe status: ${{ steps.probe.outputs.status }}" + echo "Detail: ${{ steps.probe.outputs.detail }}" + if [ -n "${{ steps.probe.outputs.summary }}" ]; then + echo "Summary: ${{ steps.probe.outputs.summary }}" + fi + + - name: Open issue on degraded / failed probe + if: steps.probe.outputs.status != 'ok' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + STATUS: ${{ steps.probe.outputs.status }} + DETAIL: ${{ steps.probe.outputs.detail }} + run: | + # Find existing open issue by title prefix so we don't spam — we + # append a comment instead of opening a new one each tick. + TITLE_PREFIX="[skills-index-watchdog]" + existing=$(gh issue list \ + --repo "${{ github.repository }}" \ + --state open \ + --search "in:title \"$TITLE_PREFIX\"" \ + --json number,title \ + --jq '.[] | select(.title | startswith("'"$TITLE_PREFIX"'")) | .number' \ + | head -1) + BODY="Automated freshness probe failed. + + **Status:** \`$STATUS\` + **Detail:** $DETAIL + + The Skills Hub at /docs/skills depends on \`/docs/api/skills-index.json\`. 
+ The unified index is rebuilt by \`.github/workflows/skills-index.yml\` (cron 6/18 UTC) + and \`.github/workflows/deploy-site.yml\` (on every push affecting website/skills). + If this issue keeps reopening, check the latest runs: + + - https://github.com/${{ github.repository }}/actions/workflows/skills-index.yml + - https://github.com/${{ github.repository }}/actions/workflows/deploy-site.yml + + This issue was opened by \`.github/workflows/skills-index-freshness.yml\`. Close it once the underlying problem is fixed; the next probe will reopen if it's still broken." + if [ -n "$existing" ]; then + echo "Appending to existing issue #$existing" + gh issue comment "$existing" --repo "${{ github.repository }}" --body "Probe still failing at $(date -u +%FT%TZ): \`$STATUS\` — $DETAIL" + else + echo "Opening new watchdog issue" + gh issue create --repo "${{ github.repository }}" \ + --title "$TITLE_PREFIX Skills index is stale or degraded ($STATUS)" \ + --body "$BODY" + fi diff --git a/scripts/build_skills_index.py b/scripts/build_skills_index.py index 844b29733b7..9b9277547f7 100644 --- a/scripts/build_skills_index.py +++ b/scripts/build_skills_index.py @@ -322,6 +322,50 @@ def main(): extra = f" ({resolved} resolved)" if resolved else "" print(f" {src}: {count}{extra}") + # Health check: catch silent breakage early. Every source listed below + # has historically returned at least `floor` entries; a zero (or near- + # zero) result almost certainly means a tap path moved, an API changed, + # or rate limiting kicked in. Failing here forces a human look before + # the broken index reaches the live docs. + EXPECTED_FLOORS = { + "skills.sh": 100, + "lobehub": 100, + "clawhub": 50, + "official": 50, + "github": 30, # collapsed across all GitHub taps + "browse-sh": 50, + } + health_errors = [] + for src, floor in EXPECTED_FLOORS.items(): + # 'skills-sh' and 'skills.sh' are the same source; both labels exist. 
+ count = by_source.get(src, 0) + if src == "skills.sh": + count = by_source.get("skills.sh", 0) + by_source.get("skills-sh", 0) + if count < floor: + health_errors.append(f" {src}: {count} < expected floor {floor}") + + MIN_TOTAL = 1500 + if len(deduped) < MIN_TOTAL: + health_errors.append( + f" total: {len(deduped)} < expected floor {MIN_TOTAL}" + ) + + if health_errors: + print( + "\nERROR: skills index health check failed — refusing to ship " + "a degenerate index. Investigate the following sources:", + file=sys.stderr, + ) + for line in health_errors: + print(line, file=sys.stderr) + print( + "\nIf the drop is expected (e.g. a hub is genuinely shutting " + "down), lower the floor in scripts/build_skills_index.py " + "EXPECTED_FLOORS in the same PR.", + file=sys.stderr, + ) + sys.exit(2) + if __name__ == "__main__": main() diff --git a/website/.gitignore b/website/.gitignore index c8dd1071c02..618c20e2b1e 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -8,6 +8,7 @@ .docusaurus .cache-loader src/data/skills.json +src/data/skills-meta.json static/llms.txt static/llms-full.txt diff --git a/website/scripts/extract-skills.py b/website/scripts/extract-skills.py index 5bdb39d4f9b..dd648589db8 100644 --- a/website/scripts/extract-skills.py +++ b/website/scripts/extract-skills.py @@ -21,6 +21,7 @@ the unified index existed). 
import json import os from collections import Counter +from datetime import datetime, timezone import yaml @@ -32,6 +33,7 @@ LOCAL_SKILL_DIRS = [ UNIFIED_INDEX_PATH = os.path.join(REPO_ROOT, "website", "static", "api", "skills-index.json") LEGACY_INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache") OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills.json") +META_OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills-meta.json") CATEGORY_LABELS = { "apple": "Apple", @@ -280,19 +282,32 @@ def _label_for_github_identifier(identifier: str) -> str: def extract_unified_index_skills(): - """Read website/static/api/skills-index.json — the canonical multi-source index.""" + """Read website/static/api/skills-index.json — the canonical multi-source index. + + Returns ``(skills, meta)`` where ``meta`` carries the index's + ``generated_at`` timestamp and total count so the Skills Hub page can + show a "Last refreshed …" badge. Returns ``(None, None)`` when the + index file is absent or malformed (caller falls back to the legacy + cache). 
+ """ if not os.path.isfile(UNIFIED_INDEX_PATH): - return None + return None, None try: with open(UNIFIED_INDEX_PATH, encoding="utf-8") as f: data = json.load(f) except (json.JSONDecodeError, OSError) as e: print(f"[extract-skills] Failed to read unified index: {e}") - return None + return None, None if not isinstance(data, dict) or "skills" not in data: - return None + return None, None + + meta = { + "indexGeneratedAt": data.get("generated_at", ""), + "indexSkillCount": data.get("skill_count", 0), + "indexVersion": data.get("version", 0), + } out = [] for entry in data.get("skills", []): @@ -352,7 +367,7 @@ def extract_unified_index_skills(): "installCmd": install_cmd, }) - return out + return out, meta def extract_legacy_cache_skills(): @@ -490,13 +505,14 @@ def _consolidate_small_categories(skills: list) -> list: def main(): local = extract_local_skills() - unified = extract_unified_index_skills() + unified, index_meta = extract_unified_index_skills() if unified is not None: external = unified external_source = "unified index" else: external = extract_legacy_cache_skills() external_source = "legacy index-cache" + index_meta = None print( f"[extract-skills] WARNING: unified index not found at " f"{UNIFIED_INDEX_PATH}; falling back to {external_source}. " @@ -517,16 +533,32 @@ def main(): with open(OUTPUT, "w", encoding="utf-8") as f: json.dump(all_skills, f, indent=2) + # Sidecar meta file so the page can render a "Last refreshed" badge + # without changing the shape of skills.json. 
+ by_source = Counter(s["source"] for s in all_skills) + meta = { + "extractedAt": datetime.now(timezone.utc).isoformat(), + "totalSkills": len(all_skills), + "localSkills": len(local), + "externalSkills": len(external), + "externalSource": external_source, + "bySource": dict(by_source.most_common()), + } + if index_meta: + meta.update(index_meta) + with open(META_OUTPUT, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + print(f"Extracted {len(all_skills)} skills to {OUTPUT}") print(f" {len(local)} local ({sum(1 for s in local if s['source'] == 'built-in')} built-in, " f"{sum(1 for s in local if s['source'] == 'optional')} optional)") print(f" {len(external)} from {external_source}") - # Breakdown by source - by_source = Counter(s["source"] for s in all_skills) print("By source:") for src, count in by_source.most_common(): print(f" {src}: {count}") + if index_meta and index_meta.get("indexGeneratedAt"): + print(f"Unified index built at: {index_meta['indexGeneratedAt']}") if __name__ == "__main__": diff --git a/website/src/pages/skills/index.tsx b/website/src/pages/skills/index.tsx index 495fb35ca5d..0ef6f64abc2 100644 --- a/website/src/pages/skills/index.tsx +++ b/website/src/pages/skills/index.tsx @@ -1,6 +1,7 @@ import React, { useState, useMemo, useCallback, useRef, useEffect } from "react"; import Layout from "@theme/Layout"; import skills from "../../data/skills.json"; +import meta from "../../data/skills-meta.json"; import styles from "./styles.module.css"; interface Skill { @@ -24,6 +25,33 @@ interface Skill { const allSkills: Skill[] = skills as Skill[]; +interface IndexMeta { + extractedAt?: string; + indexGeneratedAt?: string; + totalSkills?: number; + externalSource?: string; + bySource?: Record<string, number>; +} +const indexMeta: IndexMeta = meta as IndexMeta; + +function formatRelativeTime(iso?: string): string | null { + if (!iso) return null; + const then = new Date(iso).getTime(); + if (!Number.isFinite(then)) return null; + const now = Date.now(); + 
const diffMs = now - then; + if (diffMs < 0) return "just now"; + const mins = Math.floor(diffMs / 60_000); + if (mins < 1) return "just now"; + if (mins < 60) return `${mins} minute${mins === 1 ? "" : "s"} ago`; + const hours = Math.floor(mins / 60); + if (hours < 24) return `${hours} hour${hours === 1 ? "" : "s"} ago`; + const days = Math.floor(hours / 24); + if (days < 30) return `${days} day${days === 1 ? "" : "s"} ago`; + const months = Math.floor(days / 30); + return `${months} month${months === 1 ? "" : "s"} ago`; +} + const CATEGORY_ICONS: Record<string, string> = { apple: "\u{f179}", "autonomous-ai-agents": "\u{1F916}", @@ -487,6 +515,17 @@ export default function SkillsDashboard() { {allSkills.length} skills across {sources.length - 1} registries

+ {(indexMeta?.indexGeneratedAt || indexMeta?.extractedAt) && ( +

+ Catalog refreshed{" "} + + {formatRelativeTime( + indexMeta.indexGeneratedAt || indexMeta.extractedAt, + ) || "recently"} + + {" "}· auto-rebuilt twice daily +

+ )}