fix(skills-hub): stop shipping a degenerate index when GitHub taps collapse (#42347)

The Skills Hub lost every api.github.com-backed source — the OpenAI, Anthropic, HuggingFace, NVIDIA, gstack, Claude Marketplace and Well-Known tabs all vanished — while ClawHub/skills.sh/LobeHub/browse.sh survived. A GitHub API rate limit during the docs-deploy crawl zeroed all three api.github.com sources (github / claude-marketplace / well-known) at once. Two compounding bugs let the broken index reach the live site: 1. build_skills_index.py wrote the output file BEFORE the health check, so even when the github floor (30) tripped and the script exited 2, the degenerate file was already on disk. deploy-site.yml then swallowed the exit code with `|| echo non-fatal` and extract-skills.py read the partial index. Fix: run the health check first, write the file only when healthy, exit without writing on failure. Removed the non-fatal swallow in deploy-site.yml so a collapse fails the deploy and the last good site stays live (Pages serves the previous build). 2. The build-time GitHub listing path returned [] on a 403 rate-limit without retrying or flagging it, so a rate-limited crawl looked identical to an empty source. Fix: a shared _github_get() helper on GitHubSource with retry/backoff (honors Retry-After / X-RateLimit-Reset on 403/429, backs off on 5xx + transport errors) and flags is_rate_limited. Routed _list_skills_in_repo and _fetch_file_content through it; gave ClaudeMarketplaceSource a persistent GitHubSource + is_rate_limited so the builder can name the rate limit as the cause instead of '0 results'. Added tests/scripts/test_build_skills_index_health.py pinning both contracts: a degenerate crawl exits non-zero and writes no file; a healthy crawl writes the index with github/claude-marketplace/well-known all present.
2026-07-28 18:19:28 +00:00 · 2026-06-08 15:21:28 -07:00 · 2026-06-08 15:21:28 -07:00 · 5e9d7a7661
commit 5e9d7a7661
parent 639c1e3636
4 changed files with 274 additions and 53 deletions
--- a/.github/workflows/deploy-site.yml
+++ b/.github/workflows/deploy-site.yml
@ -59,12 +59,22 @@ jobs:
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          # Always rebuild — the file isn't committed (gitignored), so a
-          # fresh checkout starts without it and we want the freshest crawl
-          # in every deploy. Failure is non-fatal: extract-skills.py will
-          # fall back to the legacy snapshot cache and the Skills Hub page
-          # still renders, just without the latest community catalog.
-          python3 scripts/build_skills_index.py || echo "Skills index build failed (non-fatal)"
+          # Rebuild the unified catalog. The file is gitignored, so a fresh
+          # checkout starts without it and we want the freshest crawl in
+          # every deploy.
+          #
+          # This MUST be fatal. build_skills_index.py runs a health check and
+          # exits non-zero WITHOUT writing the output file when a source
+          # collapses (e.g. a GitHub API rate limit zeroes the github /
+          # claude-marketplace / well-known taps all at once). Letting the
+          # deploy continue would either (a) ship a degenerate index missing
+          # whole hubs — the June 2026 regression where OpenAI/Anthropic/
+          # HuggingFace/NVIDIA tabs vanished — or (b) fall through to a
+          # local-only catalog. Failing here keeps the last good deployment
+          # live (GitHub Pages serves the previous build) instead of
+          # publishing a broken catalog. Re-run the workflow once the
+          # transient rate limit clears.
+          python3 scripts/build_skills_index.py

      - name: Extract skill metadata for dashboard
        run: python3 website/scripts/extract-skills.py
--- a/scripts/build_skills_index.py
+++ b/scripts/build_skills_index.py
@ -297,6 +297,21 @@ def main():
    # Batch resolve GitHub paths for skills.sh entries
    all_skills = batch_resolve_paths(all_skills, auth)

+    # Collect which sources hit a GitHub API rate limit during the crawl.
+    # github / claude-marketplace / well-known all read api.github.com, so a
+    # rate-limited token zeroes all three at once — surfaced below so the
+    # failure message names the real cause instead of "source returned 0".
+    rate_limited_sources = {
+        name for name, source in sources.items()
+        if getattr(source, "is_rate_limited", False)
+    }
+    if rate_limited_sources:
+        print(
+            "  WARNING: GitHub API rate limit hit for: "
+            + ", ".join(sorted(rate_limited_sources)),
+            file=sys.stderr,
+        )
+
    # Deduplicate by identifier
    seen: dict[str, dict] = {}
    for skill in all_skills:
@ -311,25 +326,9 @@ def main():
                    "browse-sh": 5, "claude-marketplace": 6, "lobehub": 7}
    deduped.sort(key=lambda s: (source_order.get(s["source"], 99), s["name"]))

-    # Build index
-    index = {
-        "version": INDEX_VERSION,
-        "generated_at": datetime.now(timezone.utc).isoformat(),
-        "skill_count": len(deduped),
-        "skills": deduped,
-    }
-
-    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
-    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
-        json.dump(index, f, separators=(",", ":"), ensure_ascii=False)
-
-    elapsed = time.time() - overall_start
-    file_size = os.path.getsize(OUTPUT_PATH)
-    print(f"\nDone! {len(deduped)} skills indexed in {elapsed:.0f}s")
-    print(f"Output: {OUTPUT_PATH} ({file_size / 1024:.0f} KB)")
-
    from collections import Counter
    by_source = Counter(s["source"] for s in deduped)
+    print(f"\nCrawled {len(deduped)} skills in {time.time() - overall_start:.0f}s")
    for src, count in sorted(by_source.items(), key=lambda x: -x[1]):
        resolved = sum(1 for s in deduped
                       if s["source"] == src and s.get("resolved_github_id"))
@ -380,14 +379,46 @@ def main():
        )
        for line in health_errors:
            print(line, file=sys.stderr)
+        if rate_limited_sources:
+            print(
+                "\nGitHub API rate limit was hit during this crawl for: "
+                + ", ".join(sorted(rate_limited_sources))
+                + ". This is the usual cause of an all-GitHub-tap collapse "
+                "(github / claude-marketplace / well-known dropping to zero "
+                "together). Re-run with a higher-quota GITHUB_TOKEN.",
+                file=sys.stderr,
+            )
        print(
            "\nIf the drop is expected (e.g. a hub is genuinely shutting "
            "down), lower the floor in scripts/build_skills_index.py "
            "EXPECTED_FLOORS in the same PR.",
            file=sys.stderr,
        )
+        # IMPORTANT: do NOT write OUTPUT_PATH on failure. The index file is
+        # gitignored, so a fresh deploy checkout has no copy on disk — leaving
+        # it absent lets website/scripts/extract-skills.py fall back to the
+        # legacy snapshot cache (or skip the unified index) instead of reading
+        # a degenerate file. Writing-then-exiting-2 was the bug that shipped an
+        # index with every GitHub-API source dropped to zero: deploy-site.yml
+        # used to swallow the exit code with `|| echo non-fatal`, so the
+        # partial file was already on disk for extract-skills to pick up.
        sys.exit(2)

+    # Healthy — only now write the index out for the docs build to consume.
+    index = {
+        "version": INDEX_VERSION,
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "skill_count": len(deduped),
+        "skills": deduped,
+    }
+    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
+    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
+        json.dump(index, f, separators=(",", ":"), ensure_ascii=False)
+    file_size = os.path.getsize(OUTPUT_PATH)
+    print(f"\nDone! {len(deduped)} skills indexed in "
+          f"{time.time() - overall_start:.0f}s")
+    print(f"Output: {OUTPUT_PATH} ({file_size / 1024:.0f} KB)")
+

 if __name__ == "__main__":
    main()
--- a/tests/scripts/test_build_skills_index_health.py
+++ b/tests/scripts/test_build_skills_index_health.py
@ -0,0 +1,99 @@
+"""Invariants for scripts/build_skills_index.py's health-check guard.
+
+Regression context (June 2026): a GitHub API rate limit zeroed every
+api.github.com-backed source (github / claude-marketplace / well-known) at
+once during the docs deploy crawl. The build's health check fired and exited
+non-zero — but it had ALREADY written the degenerate index to disk, and
+deploy-site.yml swallowed the exit code with ``|| echo non-fatal``. The
+partial index (missing the OpenAI/Anthropic/HuggingFace/NVIDIA tabs) shipped
+to the live Skills Hub.
+
+These tests pin the two contracts that prevent a recurrence:
+  1. A degenerate crawl exits non-zero AND does NOT write the output file
+     (so extract-skills.py falls back instead of reading a broken index).
+  2. A healthy crawl exits zero AND writes the file with all GitHub-API sources.
+"""
+
+import os
+import sys
+import types
+
+import pytest
+
+import scripts.build_skills_index as build_mod
+
+
+def _meta(name, src):
+    return build_mod.SkillMeta(
+        name=name, description="d", source=src,
+        identifier=f"{src}/{name}", trust_level="community",
+    )
+
+
+class _FakeSource:
+    def __init__(self, src, n, rate_limited=False):
+        self._src = src
+        self._n = n
+        self.is_rate_limited = rate_limited
+
+    def search(self, query, limit=10):
+        return [_meta(f"{self._src}-{i}", self._src) for i in range(self._n)]
+
+
+def _install_fake_sources(monkeypatch, *, github_count, claude_count=40,
+                          well_known_count=10, github_rate_limited=False):
+    monkeypatch.setattr(build_mod, "SkillsShSource", lambda auth: _FakeSource("skills.sh", 15000))
+    monkeypatch.setattr(build_mod, "OptionalSkillSource", lambda: _FakeSource("official", 95))
+    monkeypatch.setattr(build_mod, "WellKnownSkillSource", lambda: _FakeSource("well-known", well_known_count))
+    monkeypatch.setattr(
+        build_mod, "GitHubSource",
+        lambda auth: _FakeSource("github", github_count, rate_limited=github_rate_limited),
+    )
+    monkeypatch.setattr(build_mod, "ClawHubSource", lambda: _FakeSource("clawhub", 69000))
+    monkeypatch.setattr(
+        build_mod, "ClaudeMarketplaceSource",
+        lambda auth: _FakeSource("claude-marketplace", claude_count, rate_limited=github_rate_limited),
+    )
+    monkeypatch.setattr(build_mod, "LobeHubSource", lambda: _FakeSource("lobehub", 500))
+    monkeypatch.setattr(build_mod, "BrowseShSource", lambda: _FakeSource("browse-sh", 380))
+    monkeypatch.setattr(
+        build_mod, "crawl_skills_sh",
+        lambda source: [build_mod._meta_to_dict(m) for m in source.search("", 0)],
+    )
+    monkeypatch.setattr(build_mod, "batch_resolve_paths", lambda skills, auth: skills)
+    monkeypatch.setattr(
+        build_mod, "GitHubAuth",
+        lambda: types.SimpleNamespace(auth_method=lambda: "token"),
+    )
+
+
+def test_degenerate_crawl_exits_nonzero_and_writes_no_file(tmp_path, monkeypatch):
+    """A collapsed GitHub crawl must fail loud and leave OUTPUT_PATH unwritten."""
+    out = tmp_path / "skills-index.json"
+    monkeypatch.setattr(build_mod, "OUTPUT_PATH", str(out))
+    _install_fake_sources(monkeypatch, github_count=0, claude_count=0,
+                          well_known_count=0, github_rate_limited=True)
+
+    with pytest.raises(SystemExit) as exc:
+        build_mod.main()
+
+    assert exc.value.code != 0
+    # The degenerate index must NOT have been written — extract-skills.py
+    # relies on the file's absence to fall back instead of reading garbage.
+    assert not out.exists()
+
+
+def test_healthy_crawl_writes_index_with_all_sources(tmp_path, monkeypatch):
+    out = tmp_path / "skills-index.json"
+    monkeypatch.setattr(build_mod, "OUTPUT_PATH", str(out))
+    _install_fake_sources(monkeypatch, github_count=200)
+
+    build_mod.main()  # exit 0 (no SystemExit)
+
+    assert out.exists()
+    import json
+    data = json.loads(out.read_text())
+    sources = {s["source"] for s in data["skills"]}
+    # Every GitHub-API-backed source that vanished in the regression is present.
+    assert {"github", "claude-marketplace", "well-known"} <= sources
+    assert data["skill_count"] == len(data["skills"])
--- a/tools/skills_hub.py
+++ b/tools/skills_hub.py
@ -550,11 +550,8 @@ class GitHubSource(SkillSource):
            return [SkillMeta(**s) for s in cached]

        url = f"https://api.github.com/repos/{repo}/contents/{path.rstrip('/')}"
-        try:
-            resp = httpx.get(url, headers=self.auth.get_headers(), timeout=15, follow_redirects=True)
-            if resp.status_code != 200:
-                return []
-        except httpx.HTTPError:
+        resp = self._github_get(url)
+        if resp is None or resp.status_code != 200:
            return []

        entries = resp.json()
@ -639,15 +636,98 @@ class GitHubSource(SkillSource):

    def _check_rate_limit_response(self, resp: "httpx.Response") -> None:
        """Flag the instance as rate-limited when GitHub returns 403 + exhausted quota."""
-        if resp.status_code == 403:
+        if resp.status_code in (403, 429):
            remaining = resp.headers.get("X-RateLimit-Remaining", "")
-            if remaining == "0":
+            if remaining == "0" or resp.status_code == 429:
                self._rate_limited = True
                logger.warning(
                    "GitHub API rate limit exhausted (unauthenticated: 60 req/hr). "
                    "Set GITHUB_TOKEN or install the gh CLI to raise the limit to 5,000/hr."
                )

+    def _github_get(
+        self,
+        url: str,
+        *,
+        params: Optional[Dict] = None,
+        headers: Optional[Dict] = None,
+        timeout: float = 15.0,
+        max_retries: int = 3,
+    ) -> Optional["httpx.Response"]:
+        """GET against the GitHub API with retry/backoff on transient failures.
+
+        Returns the final ``httpx.Response`` (caller inspects status) or
+        ``None`` when the last attempt made raised a transport error.
+
+        Retries on:
+          - 429, or 403 with ``X-RateLimit-Remaining: 0`` — waits for
+            ``Retry-After`` (capped at 60s) or until ``X-RateLimit-Reset``
+            when that is at most 60s away, else exponential backoff. This is
+            the all-GitHub-tap-collapse case: one shared rate limit zeroes
+            github + claude-marketplace + well-known at once in the index build.
+          - 5xx and connection/timeout errors — exponential backoff.
+
+        On terminal rate-limit exhaustion the instance is flagged via
+        ``_check_rate_limit_response`` so the build can fail loud instead of
+        silently shipping an index with the GitHub sources dropped to zero.
+        """
+        hdrs = headers if headers is not None else self.auth.get_headers()
+        backoff = 1.0
+        last_resp: Optional["httpx.Response"] = None
+        for attempt in range(max_retries):
+            try:
+                resp = httpx.get(
+                    url, params=params, headers=hdrs,
+                    timeout=timeout, follow_redirects=True,
+                )
+            except httpx.HTTPError as e:
+                logger.debug("GitHub GET %s failed (attempt %d/%d): %s",
+                             url, attempt + 1, max_retries, e)
+                if attempt < max_retries - 1:
+                    time.sleep(backoff)
+                    backoff = min(backoff * 2, 30.0)
+                    continue
+                return None
+
+            last_resp = resp
+            if resp.status_code == 200:
+                return resp
+
+            # Rate-limited: honor the reset header when present, else back off.
+            if resp.status_code in (403, 429):
+                remaining = resp.headers.get("X-RateLimit-Remaining", "")
+                is_rl = remaining == "0" or resp.status_code == 429
+                if is_rl and attempt < max_retries - 1:
+                    wait = backoff
+                    reset = resp.headers.get("X-RateLimit-Reset", "")
+                    retry_after = resp.headers.get("Retry-After", "")
+                    if retry_after.isdigit():
+                        wait = min(float(retry_after), 60.0)
+                    elif reset.isdigit():
+                        delta = float(reset) - time.time()
+                        if 0 < delta <= 60.0:
+                            wait = delta
+                    logger.debug(
+                        "GitHub rate limited on %s, waiting %.1fs (attempt %d/%d)",
+                        url, wait, attempt + 1, max_retries,
+                    )
+                    time.sleep(wait)
+                    backoff = min(backoff * 2, 30.0)
+                    continue
+                # Out of retries (or not a rate-limit 403) — flag and return.
+                self._check_rate_limit_response(resp)
+                return resp
+
+            # 5xx — retry; 4xx (other than rate limit) — return immediately.
+            if 500 <= resp.status_code < 600 and attempt < max_retries - 1:
+                time.sleep(backoff)
+                backoff = min(backoff * 2, 30.0)
+                continue
+            return resp
+
+        return last_resp
+
+
    def _download_directory(self, repo: str, path: str) -> Dict[str, str]:
        """Recursively download all text files from a GitHub directory.

@ -768,17 +848,12 @@ class GitHubSource(SkillSource):
    def _fetch_file_content(self, repo: str, path: str) -> Optional[str]:
        """Fetch a single file's content from GitHub."""
        url = f"https://api.github.com/repos/{repo}/contents/{path}"
-        try:
-            resp = httpx.get(
-                url,
-                headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"},
-                timeout=15, follow_redirects=True,
-            )
-            if resp.status_code == 200:
-                return resp.text
-            self._check_rate_limit_response(resp)
-        except httpx.HTTPError as e:
-            logger.debug("GitHub contents API fetch failed: %s", e)
+        resp = self._github_get(
+            url,
+            headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"},
+        )
+        if resp is not None and resp.status_code == 200:
+            return resp.text
        return None

    def _get_skillsh_groupings(self, repo: str) -> Optional[Dict[str, str]]:
@ -2373,10 +2448,19 @@ class ClaudeMarketplaceSource(SkillSource):

    def __init__(self, auth: GitHubAuth):
        self.auth = auth
+        # Persistent GitHubSource so rate-limit state survives across the
+        # marketplace-index fetch + per-skill inspect calls and can be
+        # surfaced to the index builder (see is_rate_limited).
+        self.github = GitHubSource(auth=auth)

    def source_id(self) -> str:
        return "claude-marketplace"

+    @property
+    def is_rate_limited(self) -> bool:
+        """Whether the underlying GitHub API hit a rate limit during the crawl."""
+        return self.github.is_rate_limited
+
    def trust_level_for(self, identifier: str) -> str:
        parts = identifier.split("/", 2)
        if len(parts) >= 2:
@ -2415,15 +2499,13 @@ class ClaudeMarketplaceSource(SkillSource):

    def fetch(self, identifier: str) -> Optional[SkillBundle]:
        # Delegate to GitHub Contents API since marketplace skills live in GitHub repos
-        gh = GitHubSource(auth=self.auth)
-        bundle = gh.fetch(identifier)
+        bundle = self.github.fetch(identifier)
        if bundle:
            bundle.source = "claude-marketplace"
        return bundle

    def inspect(self, identifier: str) -> Optional[SkillMeta]:
-        gh = GitHubSource(auth=self.auth)
-        meta = gh.inspect(identifier)
+        meta = self.github.inspect(identifier)
        if meta:
            meta.source = "claude-marketplace"
            meta.trust_level = self.trust_level_for(identifier)
@ -2437,16 +2519,15 @@ class ClaudeMarketplaceSource(SkillSource):
            return cached

        url = f"https://api.github.com/repos/{repo}/contents/.claude-plugin/marketplace.json"
+        resp = self.github._github_get(
+            url,
+            headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"},
+        )
+        if resp is None or resp.status_code != 200:
+            return []
        try:
-            resp = httpx.get(
-                url,
-                headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"},
-                timeout=15,
-            )
-            if resp.status_code != 200:
-                return []
            data = json.loads(resp.text)
-        except (httpx.HTTPError, json.JSONDecodeError):
+        except json.JSONDecodeError:
            return []

        plugins = data.get("plugins", [])