diff --git a/.github/workflows/deploy-site.yml b/.github/workflows/deploy-site.yml
index 3c471f376d..c55a62908d 100644
--- a/.github/workflows/deploy-site.yml
+++ b/.github/workflows/deploy-site.yml
@@ -41,11 +41,19 @@ jobs:
           python-version: '3.11'
 
       - name: Install PyYAML for skill extraction
-        run: pip install pyyaml
+        run: pip install pyyaml httpx
 
       - name: Extract skill metadata for dashboard
         run: python3 website/scripts/extract-skills.py
 
+      - name: Build skills index (if not already present)
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if [ ! -f website/static/api/skills-index.json ]; then
+            python3 scripts/build_skills_index.py || echo "Skills index build failed (non-fatal)"
+          fi
+
       - name: Install dependencies
         run: npm ci
         working-directory: website
diff --git a/.github/workflows/skills-index.yml b/.github/workflows/skills-index.yml
new file mode 100644
index 0000000000..6c03e40746
--- /dev/null
+++ b/.github/workflows/skills-index.yml
@@ -0,0 +1,101 @@
+name: Build Skills Index
+
+on:
+  schedule:
+    # Run twice daily: 6 AM and 6 PM UTC
+    - cron: '0 6,18 * * *'
+  workflow_dispatch:  # Manual trigger
+  push:
+    branches: [main]
+    paths:
+      - 'scripts/build_skills_index.py'
+      - '.github/workflows/skills-index.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  build-index:
+    # Only run on the upstream repository, not on forks
+    if: github.repository == 'NousResearch/hermes-agent'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: pip install httpx pyyaml
+
+      - name: Build skills index
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: python scripts/build_skills_index.py
+
+      - name: Upload index artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: skills-index
+          path: website/static/api/skills-index.json
+          retention-days: 7
+
+  deploy-with-index:
+    needs: build-index
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deploy.outputs.page_url }}
+    # Only deploy on schedule or manual trigger (not on every push to the script)
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: skills-index
+          path: website/static/api/
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 20
+          cache: npm
+          cache-dependency-path: website/package-lock.json
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install PyYAML for skill extraction
+        run: pip install pyyaml
+
+      - name: Extract skill metadata for dashboard
+        run: python3 website/scripts/extract-skills.py
+
+      - name: Install dependencies
+        run: npm ci
+        working-directory: website
+
+      - name: Build Docusaurus
+        run: npm run build
+        working-directory: website
+
+      - name: Stage deployment
+        run: |
+          mkdir -p _site/docs
+          cp -r landingpage/* _site/
+          cp -r website/build/* _site/docs/
+          echo "hermes-agent.nousresearch.com" > _site/CNAME
+
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: _site
+
+      - name: Deploy to GitHub Pages
+        id: deploy
+        uses: actions/deploy-pages@v4
diff --git a/.gitignore b/.gitignore
index baa31a543c..73132e4f4a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,3 +58,4 @@ mini-swe-agent/
 # Nix
 .direnv/
 result
+website/static/api/skills-index.json
diff --git a/scripts/build_skills_index.py b/scripts/build_skills_index.py
new file mode 100644
index 0000000000..efa1ba76ed
--- /dev/null
+++ b/scripts/build_skills_index.py
@@ -0,0 +1,325 @@
+#!/usr/bin/env python3
+"""Build the Hermes Skills Index — a centralized JSON catalog of all skills.
+
+This script crawls every skill source (skills.sh, GitHub taps, official,
+clawhub, lobehub, claude-marketplace) and writes a JSON index with resolved
+GitHub paths. The index is served as a static file on the docs site so that
+`hermes skills search/install` can use it without hitting the GitHub API.
+
+Usage:
+    # Local (uses gh CLI or GITHUB_TOKEN for auth)
+    python scripts/build_skills_index.py
+
+    # CI (set GITHUB_TOKEN as secret)
+    GITHUB_TOKEN=ghp_... python scripts/build_skills_index.py
+
+Output: website/static/api/skills-index.json
+"""
+
+import json
+import os
+import sys
+import time
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime, timezone
+
+# Allow importing from repo root
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, REPO_ROOT)
+
+# Ensure HERMES_HOME is set (needed by tools/skills_hub.py imports)
+os.environ.setdefault("HERMES_HOME", os.path.join(os.path.expanduser("~"), ".hermes"))
+
+from tools.skills_hub import (
+    GitHubAuth,
+    GitHubSource,
+    SkillsShSource,
+    OptionalSkillSource,
+    WellKnownSkillSource,
+    ClawHubSource,
+    ClaudeMarketplaceSource,
+    LobeHubSource,
+    SkillMeta,
+)
+import httpx
+
+OUTPUT_PATH = os.path.join(REPO_ROOT, "website", "static", "api", "skills-index.json")
+INDEX_VERSION = 1
+
+
+def _meta_to_dict(meta: SkillMeta) -> dict:
+    """Convert a SkillMeta to a serializable dict."""
+    return {
+        "name": meta.name,
+        "description": meta.description,
+        "source": meta.source,
+        "identifier": meta.identifier,
+        "trust_level": meta.trust_level,
+        "repo": meta.repo or "",
+        "path": meta.path or "",
+        "tags": meta.tags or [],
+        "extra": meta.extra or {},
+    }
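+
+
+# For reference, a serialized entry looks roughly like this (values are
+# illustrative; "resolved_github_id" is attached later by
+# batch_resolve_paths() only when a skill's exact GitHub path is found):
+#   {"name": "scrapling", "description": "...", "source": "skills-sh",
+#    "identifier": "skills-sh/d4vinci/scrapling/scrapling",
+#    "repo": "d4vinci/scrapling", "path": "", "trust_level": "community",
+#    "tags": [], "extra": {}}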
+
+
+def crawl_source(source, source_name: str, limit: int) -> list:
+    """Crawl a single source and return skill dicts."""
+    print(f"  Crawling {source_name}...", flush=True)
+    start = time.time()
+    try:
+        results = source.search("", limit=limit)
+    except Exception as e:
+        print(f"  Error crawling {source_name}: {e}", file=sys.stderr)
+        return []
+    skills = [_meta_to_dict(m) for m in results]
+    elapsed = time.time() - start
+    print(f"  {source_name}: {len(skills)} skills ({elapsed:.1f}s)", flush=True)
+    return skills
+
+
+def crawl_skills_sh(source: SkillsShSource) -> list:
+    """Crawl skills.sh using popular queries for broad coverage."""
+    print("  Crawling skills.sh (popular queries)...", flush=True)
+    start = time.time()
+
+    queries = [
+        "",  # featured
+        "react", "python", "web", "api", "database", "docker",
+        "testing", "scraping", "design", "typescript", "git",
+        "aws", "security", "data", "ml", "ai", "devops",
+        "frontend", "backend", "mobile", "cli", "documentation",
+        "kubernetes", "terraform", "rust", "go", "java",
+    ]
+
+    all_skills: dict[str, dict] = {}
+    for query in queries:
+        try:
+            results = source.search(query, limit=50)
+            for meta in results:
+                entry = _meta_to_dict(meta)
+                if entry["identifier"] not in all_skills:
+                    all_skills[entry["identifier"]] = entry
+        except Exception as e:
+            print(f"  Warning: skills.sh search '{query}' failed: {e}",
+                  file=sys.stderr)
+
+    elapsed = time.time() - start
+    print(f"  skills.sh: {len(all_skills)} unique skills ({elapsed:.1f}s)",
+          flush=True)
+    return list(all_skills.values())
+
+
+def _fetch_repo_tree(repo: str, auth: GitHubAuth) -> list:
+    """Fetch the recursive tree for a repo. Returns list of tree entries."""
+    headers = auth.get_headers()
+    try:
+        resp = httpx.get(
+            f"https://api.github.com/repos/{repo}",
+            headers=headers, timeout=15, follow_redirects=True,
+        )
+        if resp.status_code != 200:
+            return []
+        branch = resp.json().get("default_branch", "main")
+
+        resp = httpx.get(
+            f"https://api.github.com/repos/{repo}/git/trees/{branch}",
+            params={"recursive": "1"},
+            headers=headers, timeout=30, follow_redirects=True,
+        )
+        if resp.status_code != 200:
+            return []
+        data = resp.json()
+        if data.get("truncated"):
+            return []
+        return data.get("tree", [])
+    except Exception:
+        return []
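+
+
+# Note: _fetch_repo_tree costs exactly two API requests per repo (default
+# branch lookup + recursive tree). Each returned tree entry is a dict like
+# {"path": "skills/foo/SKILL.md", "type": "blob", "sha": "..."}; the
+# resolver below matches skills against these paths.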
+
+
+def batch_resolve_paths(skills: list, auth: GitHubAuth) -> list:
+    """Resolve GitHub paths for skills.sh entries using batch tree lookups.
+
+    Instead of resolving each skill individually (N×M API calls), we:
+      1. Group skills by repo
+      2. Fetch one tree per repo (2 API calls per repo)
+      3. Find all SKILL.md files in the tree
+      4. Match skills to their resolved paths
+    """
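+    # Cost sketch (illustrative numbers): 500 skills spread over 100 repos
+    # is about 200 requests in total, versus at least one request per skill
+    # when each entry is resolved individually.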
"skills-sh/d4vinci/scrapling/scrapling-official" -> "scrapling-official" + parts = identifier.replace("skills-sh/", "").replace("skills.sh/", "") + skill_token = parts.split("/")[-1].lower() if "/" in parts else "" + + # Try matching in order of likelihood + for candidate in [skill_token, skill_name, skill_path]: + if not candidate: + continue + matched = skill_paths.get(candidate) + if matched: + entry["resolved_github_id"] = matched + count += 1 + break + else: + # Try fuzzy: skill_token with common transformations + for tree_name, tree_path in skill_paths.items(): + if (skill_token and ( + tree_name.replace("-", "") == skill_token.replace("-", "") + or skill_token in tree_name + or tree_name in skill_token + )): + entry["resolved_github_id"] = tree_path + count += 1 + break + + return count + + with ThreadPoolExecutor(max_workers=6) as pool: + futures = { + pool.submit(_resolve_repo, repo, entries): repo + for repo, entries in by_repo.items() + } + for future in as_completed(futures): + try: + resolved_count += future.result() + except Exception as e: + repo = futures[future] + print(f" Warning: {repo}: {e}", file=sys.stderr) + + elapsed = time.time() - start + print(f" Resolved {resolved_count}/{len(skills_sh)} paths ({elapsed:.1f}s)", + flush=True) + return skills + + +def main(): + print("Building Hermes Skills Index...", flush=True) + overall_start = time.time() + + auth = GitHubAuth() + print(f"GitHub auth: {auth.auth_method()}") + if auth.auth_method() == "anonymous": + print("WARNING: No GitHub authentication — rate limit is 60/hr. " + "Set GITHUB_TOKEN for better results.", file=sys.stderr) + + skills_sh_source = SkillsShSource(auth=auth) + sources = { + "official": OptionalSkillSource(), + "well-known": WellKnownSkillSource(), + "github": GitHubSource(auth=auth), + "clawhub": ClawHubSource(), + "claude-marketplace": ClaudeMarketplaceSource(auth=auth), + "lobehub": LobeHubSource(), + } + + all_skills: list[dict] = [] + + # Crawl skills.sh + all_skills.extend(crawl_skills_sh(skills_sh_source)) + + # Crawl other sources in parallel + with ThreadPoolExecutor(max_workers=4) as pool: + futures = {} + for name, source in sources.items(): + futures[pool.submit(crawl_source, source, name, 500)] = name + for future in as_completed(futures): + try: + all_skills.extend(future.result()) + except Exception as e: + print(f" Error: {e}", file=sys.stderr) + + # Batch resolve GitHub paths for skills.sh entries + all_skills = batch_resolve_paths(all_skills, auth) + + # Deduplicate by identifier + seen: dict[str, dict] = {} + for skill in all_skills: + key = skill["identifier"] + if key not in seen: + seen[key] = skill + deduped = list(seen.values()) + + # Sort + source_order = {"official": 0, "skills-sh": 1, "skills.sh": 1, + "github": 2, "well-known": 3, "clawhub": 4, + "claude-marketplace": 5, "lobehub": 6} + deduped.sort(key=lambda s: (source_order.get(s["source"], 99), s["name"])) + + # Build index + index = { + "version": INDEX_VERSION, + "generated_at": datetime.now(timezone.utc).isoformat(), + "skill_count": len(deduped), + "skills": deduped, + } + + os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True) + with open(OUTPUT_PATH, "w") as f: + json.dump(index, f, separators=(",", ":"), ensure_ascii=False) + + elapsed = time.time() - overall_start + file_size = os.path.getsize(OUTPUT_PATH) + print(f"\nDone! 
+
+    # Sort
+    source_order = {"official": 0, "skills-sh": 1, "skills.sh": 1,
+                    "github": 2, "well-known": 3, "clawhub": 4,
+                    "claude-marketplace": 5, "lobehub": 6}
+    deduped.sort(key=lambda s: (source_order.get(s["source"], 99), s["name"]))
+
+    # Build index
+    index = {
+        "version": INDEX_VERSION,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "skill_count": len(deduped),
+        "skills": deduped,
+    }
+
+    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
+    with open(OUTPUT_PATH, "w") as f:
+        json.dump(index, f, separators=(",", ":"), ensure_ascii=False)
+
+    elapsed = time.time() - overall_start
+    file_size = os.path.getsize(OUTPUT_PATH)
+    print(f"\nDone! {len(deduped)} skills indexed in {elapsed:.0f}s")
+    print(f"Output: {OUTPUT_PATH} ({file_size / 1024:.0f} KB)")
+
+    from collections import Counter
+    by_source = Counter(s["source"] for s in deduped)
+    for src, count in sorted(by_source.items(), key=lambda x: -x[1]):
+        resolved = sum(1 for s in deduped
+                       if s["source"] == src and s.get("resolved_github_id"))
+        extra = f" ({resolved} resolved)" if resolved else ""
+        print(f"  {src}: {count}{extra}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/skills_hub.py b/tools/skills_hub.py
index 8c7b7a23fd..47aef8075b 100644
--- a/tools/skills_hub.py
+++ b/tools/skills_hub.py
@@ -2698,6 +2698,222 @@ def check_for_skill_updates(
     return results
 
 
+# ---------------------------------------------------------------------------
+# Hermes centralized index source
+# ---------------------------------------------------------------------------
+
+HERMES_INDEX_URL = "https://hermes-agent.nousresearch.com/docs/api/skills-index.json"
+HERMES_INDEX_CACHE_FILE = INDEX_CACHE_DIR / "hermes-index.json"
+HERMES_INDEX_TTL = 6 * 3600  # 6 hours
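+# (With the index rebuilt every 12 hours and a 6-hour client TTL, a cached
+# copy is refreshed at least once per published rebuild.)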
+
+
+def _load_hermes_index() -> Optional[dict]:
+    """Fetch the centralized skills index, with local cache.
+
+    The index is a JSON file hosted on the docs site, rebuilt twice daily
+    by CI. We cache it locally for HERMES_INDEX_TTL seconds to avoid
+    repeated downloads within a session.
+    """
+    # Check local cache
+    if HERMES_INDEX_CACHE_FILE.exists():
+        try:
+            age = time.time() - HERMES_INDEX_CACHE_FILE.stat().st_mtime
+            if age < HERMES_INDEX_TTL:
+                return json.loads(HERMES_INDEX_CACHE_FILE.read_text())
+        except (OSError, json.JSONDecodeError):
+            pass
+
+    # Fetch from the docs site
+    try:
+        resp = httpx.get(HERMES_INDEX_URL, timeout=15, follow_redirects=True)
+        if resp.status_code != 200:
+            logger.debug("Hermes index fetch returned %d", resp.status_code)
+            return _load_stale_index_cache()
+        data = resp.json()
+    except (httpx.HTTPError, json.JSONDecodeError) as e:
+        logger.debug("Hermes index fetch failed: %s", e)
+        return _load_stale_index_cache()
+
+    # Validate structure
+    if not isinstance(data, dict) or "skills" not in data:
+        return _load_stale_index_cache()
+
+    # Cache locally
+    try:
+        HERMES_INDEX_CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
+        HERMES_INDEX_CACHE_FILE.write_text(json.dumps(data))
+    except OSError:
+        pass
+
+    return data
+
+
+def _load_stale_index_cache() -> Optional[dict]:
+    """Fall back to the stale cache when the network fetch fails."""
+    if HERMES_INDEX_CACHE_FILE.exists():
+        try:
+            return json.loads(HERMES_INDEX_CACHE_FILE.read_text())
+        except (OSError, json.JSONDecodeError):
+            pass
+    return None
+
+
+class HermesIndexSource(SkillSource):
+    """Skill source backed by the centralized Hermes Skills Index.
+
+    The index is a JSON catalog published to the docs site and rebuilt
+    twice daily by CI. It contains metadata plus resolved GitHub paths
+    for every skill, eliminating the need for users to hit the GitHub
+    API for search or path discovery.
+
+    When the index is unavailable, all methods return empty / None so
+    downstream sources take over transparently.
+    """
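+
+    # Typical flow (sketch):
+    #     src = HermesIndexSource(auth=GitHubAuth())
+    #     hits = src.search("docker", limit=5)    # served from the cached index
+    #     bundle = src.fetch(hits[0].identifier)  # GitHub is contacted only here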
"skills-sh/" stripped) + normalized = identifier + for prefix in ("skills-sh/", "skills.sh/", "official/", "github/", "clawhub/"): + if identifier.startswith(prefix): + normalized = identifier[len(prefix):] + break + + # Match on normalized identifier or name + for s in skills: + sid = s.get("identifier", "") + # Strip prefix from stored identifier too + stored_normalized = sid + for prefix in ("skills-sh/", "skills.sh/", "official/", "github/", "clawhub/"): + if sid.startswith(prefix): + stored_normalized = sid[len(prefix):] + break + if stored_normalized == normalized: + return s + + return None + + @staticmethod + def _to_meta(entry: dict) -> SkillMeta: + return SkillMeta( + name=entry.get("name", ""), + description=entry.get("description", ""), + source=entry.get("source", "hermes-index"), + identifier=entry.get("identifier", ""), + trust_level=entry.get("trust_level", "community"), + repo=entry.get("repo"), + path=entry.get("path"), + tags=entry.get("tags", []), + extra=entry.get("extra", {}), + ) + + def create_source_router(auth: Optional[GitHubAuth] = None) -> List[SkillSource]: """ Create all configured source adapters. @@ -2711,6 +2927,7 @@ def create_source_router(auth: Optional[GitHubAuth] = None) -> List[SkillSource] sources: List[SkillSource] = [ OptionalSkillSource(), # Official optional skills (highest priority) + HermesIndexSource(auth=auth), # Centralized index (search + resolved install paths) SkillsShSource(auth=auth), WellKnownSkillSource(), GitHubSource(auth=auth, extra_taps=extra_taps), @@ -2753,10 +2970,27 @@ def parallel_search_sources( per_source_limits = per_source_limits or {} active: List[SkillSource] = [] + # When the centralized index is available and the user hasn't filtered + # to a specific source, skip external API sources (github, skills-sh, + # clawhub, etc.) — the index already has their data. This avoids + # ~70 GitHub API calls per search for unauthenticated users. + _index_available = False + _api_source_ids = frozenset({"github", "skills-sh", "clawhub", + "claude-marketplace", "lobehub", "well-known"}) + if source_filter == "all": + for src in sources: + if (src.source_id() == "hermes-index" + and getattr(src, "is_available", False)): + _index_available = True + break + for src in sources: sid = src.source_id() if source_filter != "all" and sid != source_filter and sid != "official": continue + # Skip external API sources when the index covers them + if _index_available and sid in _api_source_ids: + continue active.append(src) all_results: List[SkillMeta] = []