mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
Layered safety so the Skills Hub at /docs/skills stays in sync without silent rot. Three pieces: 1. build_skills_index.py — refuses to ship a degenerate index. EXPECTED_FLOORS per source (skills.sh ≥100, lobehub ≥100, clawhub ≥50, official ≥50, github ≥30, browse-sh ≥50) and MIN_TOTAL=1500. Any source collapsing to zero (the silent OpenAI breakage that hid for weeks) now fails the workflow loud — broken index never reaches the live site. 2. extract-skills.py + the React page — visible freshness signal. Sidecar website/src/data/skills-meta.json carries the index's generated_at timestamp, plus per-source counts. Skills Hub renders a 'Catalog refreshed N hours ago · auto-rebuilt twice daily' line under the hero copy. If the cron stalls, users see the staleness immediately. 3. .github/workflows/skills-index-freshness.yml — watchdog cron. Every 4 hours, fetches the live /docs/api/skills-index.json, validates shape, checks age (>26h is stale), checks the same per-source floors, and opens (or appends to) a GitHub issue when anything is off. The issue is title-prefixed [skills-index-watchdog] so subsequent failures append a comment instead of spamming new issues. Net effect: - A silent regression like 'OpenAI tap moved its skills' now fails the build instead of shipping a quietly broken catalog. - A stuck cron (like the landingpage breakage that ran red for weeks) now files an issue within 4 hours. - Users see how fresh the catalog is on the page itself. Test plan: - Local: built skills-meta.json from the live index → 'Catalog refreshed N minutes ago' rendered correctly in the static HTML. - Probe logic dry-run against the live index: total=2456, all 6 sources above floor, age 0.1h — issues=NONE. - Triggered skills-index.yml manually; both jobs green, deploy-site.yml dispatch fired.
565 lines
20 KiB
Python
565 lines
20 KiB
Python
#!/usr/bin/env python3
|
|
"""Extract skill metadata into website/src/data/skills.json for the Skills Hub page.
|
|
|
|
Two data sources:

1. Local SKILL.md files under ``skills/`` (built-in) and ``optional-skills/``
   (official optional). These give us full metadata — overview prose, version,
   license, env vars, commands — that the unified index doesn't carry.

2. The unified Hermes Skills Index at ``website/static/api/skills-index.json``,
   built twice daily by ``scripts/build_skills_index.py`` (workflow
   ``.github/workflows/skills-index.yml``). Covers skills.sh, ClawHub, browse.sh,
   LobeHub, Claude Marketplace, well-known endpoints, and the GitHub taps
   (openai/skills, anthropics/skills, huggingface/skills, VoltAgent, etc.).

Legacy fallback: if the unified index is missing AND ``skills/index-cache/``
contains pre-baked JSON dumps, we read those (preserves behaviour from before
the unified index existed).
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
from collections import Counter
|
|
from datetime import datetime, timezone
|
|
|
|
import yaml
|
|
|
|
# Repository root: the directory two levels above the one containing this script.
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# (directory under REPO_ROOT, source label attached to every skill found there)
LOCAL_SKILL_DIRS = [
    ("skills", "built-in"),
    ("optional-skills", "optional"),
]

# Unified multi-source index; preferred input for non-local skills.
UNIFIED_INDEX_PATH = os.path.join(REPO_ROOT, "website", "static", "api", "skills-index.json")

# Deprecated per-source JSON snapshots; read only when the unified index is unusable.
LEGACY_INDEX_CACHE_DIR = os.path.join(REPO_ROOT, "skills", "index-cache")

# Generated files: the skill list itself and a sidecar with counts/timestamps.
OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills.json")
META_OUTPUT = os.path.join(REPO_ROOT, "website", "src", "data", "skills-meta.json")
|
|
|
|
# Display labels for category slugs. Slugs missing from this table are
# title-cased by the code that looks them up.
CATEGORY_LABELS = {
    "apple": "Apple",
    "autonomous-ai-agents": "AI Agents",
    "blockchain": "Blockchain",
    "communication": "Communication",
    "creative": "Creative",
    "data-science": "Data Science",
    "devops": "DevOps",
    "dogfood": "Dogfood",
    "domain": "Domain",
    "email": "Email",
    "gaming": "Gaming",
    "gifs": "GIFs",
    "github": "GitHub",
    "health": "Health",
    "inference-sh": "Inference",
    "leisure": "Leisure",
    "mcp": "MCP",
    "media": "Media",
    "migration": "Migration",
    "mlops": "MLOps",
    "note-taking": "Note-Taking",
    "productivity": "Productivity",
    "red-teaming": "Red Teaming",
    "research": "Research",
    "security": "Security",
    "smart-home": "Smart Home",
    "social-media": "Social Media",
    "software-development": "Software Dev",
    "translation": "Translation",
    "other": "Other",
}
|
|
|
|
# Map the source ids the unified index emits to the friendly labels the
# Skills Hub UI uses. Keep these in sync with the SOURCE_CONFIG dict in
# website/src/pages/skills/index.tsx.
UNIFIED_SOURCE_LABELS = {
    "official": "official",  # treated as our "optional" tier in the UI
    "skills.sh": "skills.sh",
    "skills-sh": "skills.sh",  # alternate spelling of the same source id
    "clawhub": "ClawHub",
    "browse-sh": "browse.sh",
    "lobehub": "LobeHub",
    "claude-marketplace": "Claude Marketplace",
    "well-known": "Well-Known",
    "github": "GitHub",  # default for non-named GitHub taps
}
|
|
|
|
# Repo-specific labels for the unified index's "github" source. Lets us
# call out the well-known taps with their vendor name instead of a generic
# "GitHub" pill. Match is checked against the leading "owner/repo/" prefix
# of the identifier.
GITHUB_TAP_LABELS = {
    "openai/skills": "OpenAI",
    "anthropics/skills": "Anthropic",
    "huggingface/skills": "HuggingFace",
    "VoltAgent/awesome-agent-skills": "VoltAgent",
    "garrytan/gstack": "gstack",
    "MiniMax-AI/cli": "MiniMax",
}
|
|
|
|
# Legacy filename -> label mapping for the deprecated skills/index-cache/
# fallback. Used only when website/static/api/skills-index.json is absent.
# A key matches when it occurs anywhere in the cache file's stem.
LEGACY_SOURCE_LABELS = {
    "anthropics_skills": "Anthropic",
    "openai_skills": "OpenAI",
    "claude_marketplace": "Claude Marketplace",
    "lobehub": "LobeHub",
}
|
|
|
|
|
|
def _extract_overview(body: str) -> str:
    """Pull the first non-heading paragraph from a SKILL.md body.

    Only the first six blank-line-separated paragraphs are considered.
    Heading lines are dropped from a paragraph that starts with ``#``;
    paragraphs that (then) start with ``:::``, a backtick fence or a tilde
    fence are skipped. The result is capped at roughly 500 characters.

    Returns:
        The overview text, or ``""`` when no suitable paragraph is found.
    """
    if not body:
        return ""

    paragraphs = [chunk.strip() for chunk in body.split("\n\n") if chunk.strip()]
    for para in paragraphs[:6]:
        if para.startswith("#"):
            # A heading may share its paragraph with prose; keep only the prose.
            kept = [
                ln for ln in para.split("\n")
                if ln.strip() and not ln.lstrip().startswith("#")
            ]
            if not kept:
                continue
            para = "\n".join(kept).strip()

        # Admonition blocks and fenced code are not overview prose.
        if para.startswith((":::", "```", "~~~")):
            continue

        if len(para) > 500:
            head = para[:500]
            stop = head.rfind(". ")
            # Prefer ending on a sentence boundary unless that would discard
            # most of the text; otherwise hard-cut and mark the truncation.
            para = head[: stop + 1] if stop > 200 else head.rstrip() + "…"
        return para

    return ""
|
|
|
|
|
|
def _docs_page_path(rel_dir: str, source_label: str) -> str:
    """Compute the per-skill docs-site URL slug for a given SKILL.md location.

    Mirrors the slug logic in website/scripts/generate-skill-docs.py:
      bundled  + skills/<cat>/<slug>/SKILL.md          -> bundled/<cat>/<cat>-<slug>
      bundled  + skills/<cat>/<sub>/<slug>/SKILL.md    -> bundled/<cat>/<cat>-<sub>-<slug>
      optional + optional-skills/<cat>/<slug>/SKILL.md -> optional/<cat>/<cat>-<slug>

    Args:
        rel_dir: Directory of the SKILL.md relative to its base directory,
            using ``os.sep`` as separator.
        source_label: ``"built-in"`` selects the ``bundled`` prefix; any other
            value selects ``optional``.

    Returns:
        The slug, or ``""`` when ``rel_dir`` has no usable component or is
        nested more than three levels deep.
    """
    # ``os.path.relpath`` reports the base directory itself as ".", which is
    # not a path component and must not leak into the slug.
    parts = [p for p in rel_dir.split(os.sep) if p and p != os.curdir]
    if not parts or len(parts) > 3:
        return ""

    source_dir = "bundled" if source_label == "built-in" else "optional"
    category = parts[0]
    # With a single component the category doubles as the slug.
    suffix = "-".join(parts[1:] or [category])
    return f"{source_dir}/{category}/{category}-{suffix}"
|
|
|
|
|
|
def _install_command(source: str, identifier: str, name: str) -> str:
    """Build the ``hermes skills install …`` command for a unified-index entry.

    These show up in the SkillCard panel so users can copy-paste them.

    Args:
        source: Source id of the entry; compared case-insensitively.
        identifier: Identifier of the entry within its source.
        name: Skill name, used only when ``identifier`` is empty.

    Returns:
        The install command. Only ``clawhub`` identifiers get a source prefix
        added here; identifiers from every other source (official, skills.sh,
        browse-sh, lobehub, claude-marketplace, github, well-known, unknown)
        are used verbatim.
    """
    if not identifier:
        return f"hermes skills install {name}"

    if (source or "").lower() == "clawhub":
        return f"hermes skills install clawhub/{identifier}"

    return f"hermes skills install {identifier}"
|
|
|
|
|
|
def _read_skill_frontmatter(skill_path):
    """Return ``(frontmatter_dict, body)`` for a SKILL.md, or ``None`` to skip it."""
    with open(skill_path, encoding="utf-8") as f:
        content = f.read()

    if not content.startswith("---"):
        return None
    parts = content.split("---", 2)
    if len(parts) < 3:
        return None
    try:
        fm = yaml.safe_load(parts[1])
    except yaml.YAMLError:
        return None
    if not fm or not isinstance(fm, dict):
        return None
    return fm, parts[2].strip()


def _frontmatter_tags(fm):
    """Return tags from ``metadata.hermes.tags``, falling back to top-level ``tags``."""
    tags = []
    metadata = fm.get("metadata")
    if isinstance(metadata, dict):
        hermes_meta = metadata.get("hermes", {})
        if isinstance(hermes_meta, dict):
            tags = hermes_meta.get("tags", [])
    if not tags:
        tags = fm.get("tags", [])
    if isinstance(tags, str):
        tags = [tags]
    return tags or []


def _prereq_strings(value):
    """Normalise a prerequisites field (list or single string) to a list of strings."""
    if isinstance(value, list):
        return [str(item) for item in value if item]
    if isinstance(value, str) and value.strip():
        return [value.strip()]
    return []


def extract_local_skills():
    """Collect metadata for every SKILL.md under the local skill directories.

    Walks each directory listed in ``LOCAL_SKILL_DIRS`` (relative to
    ``REPO_ROOT``). A SKILL.md is skipped when it does not start with a
    ``---`` frontmatter block, when the frontmatter is not valid YAML, or
    when it does not parse to a non-empty mapping.

    Returns:
        A list of skill dicts with the keys ``name``, ``description``,
        ``overview``, ``category``, ``categoryLabel``, ``source``, ``tags``,
        ``platforms``, ``author``, ``version``, ``license``, ``envVars``,
        ``commands`` and ``docsPath``.
    """
    skills = []

    for base_dir, source_label in LOCAL_SKILL_DIRS:
        base_path = os.path.join(REPO_ROOT, base_dir)
        if not os.path.isdir(base_path):
            continue

        for root, dirs, files in os.walk(base_path):
            # Sorting in place fixes the traversal order, so the generated
            # JSON does not depend on filesystem enumeration order.
            dirs.sort()
            if "SKILL.md" not in files:
                continue

            parsed = _read_skill_frontmatter(os.path.join(root, "SKILL.md"))
            if parsed is None:
                continue
            fm, body = parsed

            rel = os.path.relpath(root, base_path)
            category = rel.split(os.sep)[0]

            prereq = fm.get("prerequisites")
            if not isinstance(prereq, dict):
                prereq = {}

            # YAML may yield null or a non-string scalar for ``name``; the
            # final sort compares names, so always hand back a non-empty str.
            name = str(fm.get("name") or os.path.basename(root))

            skills.append({
                "name": name,
                "description": fm.get("description") or "",
                "overview": _extract_overview(body),
                "category": category,
                "categoryLabel": CATEGORY_LABELS.get(category, category.replace("-", " ").title()),
                "source": source_label,
                "tags": _frontmatter_tags(fm),
                "platforms": fm.get("platforms", []),
                "author": fm.get("author", ""),
                "version": fm.get("version", ""),
                "license": fm.get("license", ""),
                "envVars": _prereq_strings(prereq.get("env_vars")),
                "commands": _prereq_strings(prereq.get("commands")),
                "docsPath": _docs_page_path(rel, source_label),
            })

    return skills
|
|
|
|
|
|
def _label_for_github_identifier(identifier: str) -> str:
    """Return a friendly source label for a unified-index 'github' entry.

    An identifier matches a ``GITHUB_TAP_LABELS`` key when it equals the key
    or starts with the key followed by ``/``. The first matching key wins;
    anything else (including an empty identifier) is labelled ``"GitHub"``.
    """
    if not identifier:
        return "GitHub"

    for repo, label in GITHUB_TAP_LABELS.items():
        # The trailing slash keeps "openai/skills" from matching "openai/skills-extra".
        if identifier == repo or identifier.startswith(f"{repo}/"):
            return label
    return "GitHub"
|
|
|
|
|
|
def _index_text(value):
    """Return ``value`` when it is a string, otherwise ``""``."""
    return value if isinstance(value, str) else ""


def extract_unified_index_skills():
    """Read website/static/api/skills-index.json — the canonical multi-source index.

    Returns ``(skills, meta)`` where ``meta`` carries the index's
    ``generated_at`` timestamp, skill count and version so the Skills Hub
    page can show a "Last refreshed …" badge. Returns ``(None, None)`` when
    the index file is absent or malformed (caller falls back to the legacy
    cache). "Malformed" covers unreadable or undecodable files, invalid
    JSON, a non-object top level, and a ``skills`` value that is not a list.
    """
    if not os.path.isfile(UNIFIED_INDEX_PATH):
        return None, None

    try:
        with open(UNIFIED_INDEX_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[extract-skills] Failed to read unified index: {e}")
        return None, None

    entries = data.get("skills") if isinstance(data, dict) else None
    if not isinstance(entries, list):
        return None, None

    meta = {
        "indexGeneratedAt": data.get("generated_at", ""),
        "indexSkillCount": data.get("skill_count", 0),
        "indexVersion": data.get("version", 0),
    }

    out = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        source_id = _index_text(entry.get("source")).lower()

        # Skip official entries here — extract_local_skills() already covered
        # those from optional-skills/ with full metadata (overview, version, etc.).
        if source_id == "official":
            continue

        identifier = _index_text(entry.get("identifier"))
        name = _index_text(entry.get("name")) or identifier.split("/")[-1] or "unknown"

        # Cards show a single line; keep the first line and cap its length.
        description = _index_text(entry.get("description")).split("\n")[0]
        if len(description) > 280:
            description = description[:277] + "…"

        tags = entry.get("tags")
        if not isinstance(tags, list):
            tags = []

        # Map source id -> display label
        if source_id == "github":
            source_label = _label_for_github_identifier(identifier)
        else:
            source_label = UNIFIED_SOURCE_LABELS.get(source_id, source_id or "community")

        # Only skills.sh entries yield an author hint: the owner part of the
        # entry's ``repo`` field, when present.
        author = ""
        if source_id in {"skills.sh", "skills-sh"}:
            repo = entry.get("repo")
            if isinstance(repo, str) and repo:
                author = repo.split("/")[0]

        out.append({
            "name": name,
            "description": description,
            "overview": "",
            # Guess a category from tags so the UI's category filter has a chance.
            "category": _guess_category(tags),
            "categoryLabel": "",  # filled in _consolidate_small_categories
            "source": source_label,
            "tags": tags,
            "platforms": [],
            "author": author,
            "version": "",
            "license": "",
            "envVars": [],
            "commands": [],
            "docsPath": "",
            "identifier": identifier,
            "installCmd": _install_command(source_id, identifier, name),
        })

    return out, meta
|
|
|
|
|
|
def _legacy_tags(value):
    """Normalise a legacy ``tags`` field to a list (a bare string becomes one tag)."""
    if isinstance(value, str):
        return [value] if value else []
    return value if isinstance(value, list) else []


def extract_legacy_cache_skills():
    """Read the deprecated skills/index-cache/ snapshots — fallback only.

    Every ``*.json`` file in ``LEGACY_INDEX_CACHE_DIR`` is read in sorted
    filename order; unreadable or invalid files are skipped. Two payload
    shapes are understood:

    * an object with an ``agents`` list, whose items carry ``identifier``,
      ``author`` and a ``meta`` object (``title``, ``description``, ``tags``);
    * a plain list of entries with ``name``, ``description`` and ``tags``.
      Entries that themselves hold a ``skills`` list are skipped.

    Returns:
        A list of skill dicts; empty when the cache directory does not exist.
    """
    skills = []

    if not os.path.isdir(LEGACY_INDEX_CACHE_DIR):
        return skills

    # Sorted so the output does not depend on directory enumeration order.
    for filename in sorted(os.listdir(LEGACY_INDEX_CACHE_DIR)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(LEGACY_INDEX_CACHE_DIR, filename)
        try:
            with open(filepath, encoding="utf-8") as f:
                data = json.load(f)
        except (ValueError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue

        # Strip only the trailing extension, not every ".json" in the name.
        stem = filename[: -len(".json")]
        source_label = next(
            (label for key, label in LEGACY_SOURCE_LABELS.items() if key in stem),
            "community",
        )

        if isinstance(data, dict) and "agents" in data:
            agents = data["agents"]
            if not isinstance(agents, list):
                continue
            for agent in agents:
                if not isinstance(agent, dict):
                    continue
                meta = agent.get("meta")
                if not isinstance(meta, dict):
                    meta = {}
                tags = _legacy_tags(meta.get("tags"))
                description = meta.get("description")
                if not isinstance(description, str):
                    description = ""
                skills.append({
                    "name": str(agent.get("identifier") or meta.get("title") or "unknown"),
                    "description": description.split("\n")[0][:200],
                    "category": _guess_category(tags),
                    "categoryLabel": "",
                    "source": source_label,
                    "tags": tags,
                    "platforms": [],
                    "author": agent.get("author", ""),
                    "version": "",
                })
            continue

        if isinstance(data, list):
            for entry in data:
                if not isinstance(entry, dict) or not entry.get("name"):
                    continue
                if "skills" in entry and isinstance(entry["skills"], list):
                    continue
                skills.append({
                    "name": str(entry["name"]),
                    "description": entry.get("description") or "",
                    "category": "uncategorized",
                    "categoryLabel": "",
                    "source": source_label,
                    "tags": _legacy_tags(entry.get("tags")),
                    "platforms": [],
                    "author": "",
                    "version": "",
                })

    for s in skills:
        if not s["categoryLabel"]:
            s["categoryLabel"] = CATEGORY_LABELS.get(
                s["category"],
                s["category"].replace("-", " ").title() if s["category"] else "Uncategorized",
            )

    return skills
|
|
|
|
|
|
# Category slug -> tags that imply it. Kept in this orientation for
# readability; lookups use the inverted TAG_TO_CATEGORY below.
_CATEGORY_TAGS = {
    "software-development": [
        "programming", "code", "coding", "software-development",
        "frontend-development", "backend-development", "web-development",
        "react", "python", "typescript", "java", "rust",
    ],
    "creative": ["writing", "design", "creative", "art", "image-generation"],
    "research": ["education", "academic", "research"],
    "social-media": ["marketing", "seo", "social-media"],
    "productivity": ["productivity", "business"],
    "data-science": ["data", "data-science"],
    "mlops": ["machine-learning", "deep-learning"],
    "devops": ["devops"],
    "gaming": ["gaming", "game", "game-development"],
    "media": ["music", "media", "video"],
    "health": ["health", "fitness"],
    "translation": ["translation", "language-learning"],
    "security": ["security", "cybersecurity"],
}

# Lower-case tag -> category slug. Built with a comprehension so no loop
# variables are left behind in the module namespace.
TAG_TO_CATEGORY = {
    tag: category
    for category, tags in _CATEGORY_TAGS.items()
    for tag in tags
}
|
|
|
|
|
|
def _guess_category(tags: list) -> str:
    """Derive a category slug from a skill's tags.

    The first string tag found in ``TAG_TO_CATEGORY`` (compared lower-cased)
    decides the category. When none matches, the first tag itself is used,
    lower-cased with spaces replaced by hyphens.

    Args:
        tags: List (or tuple) of tags. A bare string is treated as a single
            tag; any other non-sequence value is treated as "no tags".

    Returns:
        A category slug, or ``"uncategorized"`` when there are no tags or the
        first tag is not a string.
    """
    if isinstance(tags, str):
        # Iterating a bare string would test its characters one by one.
        tags = [tags] if tags else []
    if not tags or not isinstance(tags, (list, tuple)):
        return "uncategorized"

    for tag in tags:
        if not isinstance(tag, str):
            continue
        cat = TAG_TO_CATEGORY.get(tag.lower())
        if cat:
            return cat

    first = tags[0] if isinstance(tags[0], str) else ""
    return first.lower().replace(" ", "-") if first else "uncategorized"
|
|
|
|
|
|
# Categories with fewer skills than this are folded into "other".
MIN_CATEGORY_SIZE = 4


def _consolidate_small_categories(skills: list, min_size: int = MIN_CATEGORY_SIZE) -> list:
    """Fold uncategorised and under-populated categories into ``other``.

    The skill dicts are modified in place.

    Args:
        skills: Skill dicts carrying ``category`` and ``categoryLabel`` keys.
        min_size: Categories with fewer than this many skills are merged
            into ``other``. Defaults to ``MIN_CATEGORY_SIZE``.

    Returns:
        The same ``skills`` list, for call chaining.
    """
    for s in skills:
        # Empty, missing-valued and "uncategorized" all mean "no category".
        if not s["category"] or s["category"] == "uncategorized":
            s["category"] = "other"
            s["categoryLabel"] = "Other"

    # Count after the normalisation above so every uncategorised skill is
    # tallied under "other".
    counts = Counter(s["category"] for s in skills)
    small_cats = {cat for cat, n in counts.items() if n < min_size}

    for s in skills:
        if s["category"] in small_cats:
            s["category"] = "other"
            s["categoryLabel"] = "Other"
        elif not s["categoryLabel"]:
            # Category is guaranteed non-empty here, so no empty-slug fallback
            # is needed.
            s["categoryLabel"] = CATEGORY_LABELS.get(
                s["category"],
                s["category"].replace("-", " ").title(),
            )

    return skills
|
|
|
|
|
|
def main():
    """Build skills.json and its skills-meta.json sidecar for the Skills Hub.

    Combines the local skills with external ones — from the unified index
    when it is usable, otherwise from the legacy index cache — consolidates
    small categories, sorts the result, writes both JSON files and prints a
    summary.
    """
    local = extract_local_skills()

    unified, index_meta = extract_unified_index_skills()
    if unified is not None:
        external = unified
        external_source = "unified index"
    else:
        external = extract_legacy_cache_skills()
        external_source = "legacy index-cache"
        index_meta = None
        # The index may exist but be malformed, so do not claim "not found".
        print(
            f"[extract-skills] WARNING: unified index missing or unusable at "
            f"{UNIFIED_INDEX_PATH}; falling back to {external_source}. "
            f"Run `python3 scripts/build_skills_index.py` to refresh."
        )

    all_skills = _consolidate_small_categories(local + external)

    source_order = {"built-in": 0, "optional": 1}

    def _sort_key(skill):
        # Built-in first, then optional, then everything else; "other" is
        # pushed to the end of each group. str() keeps the comparison safe
        # if a producer emitted a non-string name or category.
        return (
            source_order.get(skill["source"], 2),
            1 if skill["category"] == "other" else 0,
            str(skill["category"]),
            str(skill["name"]),
        )

    all_skills.sort(key=_sort_key)

    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(all_skills, f, indent=2)

    # Sidecar meta file so the page can render a "Last refreshed" badge
    # without changing the shape of skills.json.
    by_source = Counter(s["source"] for s in all_skills)
    meta = {
        "extractedAt": datetime.now(timezone.utc).isoformat(),
        "totalSkills": len(all_skills),
        "localSkills": len(local),
        "externalSkills": len(external),
        "externalSource": external_source,
        "bySource": dict(by_source.most_common()),
    }
    if index_meta:
        meta.update(index_meta)
    os.makedirs(os.path.dirname(META_OUTPUT), exist_ok=True)
    with open(META_OUTPUT, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    local_by_source = Counter(s["source"] for s in local)
    print(f"Extracted {len(all_skills)} skills to {OUTPUT}")
    print(f"  {len(local)} local ({local_by_source['built-in']} built-in, "
          f"{local_by_source['optional']} optional)")
    print(f"  {len(external)} from {external_source}")

    print("By source:")
    for src, count in by_source.most_common():
        print(f"  {src}: {count}")
    if index_meta and index_meta.get("indexGeneratedAt"):
        print(f"Unified index built at: {index_meta['indexGeneratedAt']}")
|
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|