From 5e9d7a766107d1aaee3f347a587ca653abd96e0a Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 8 Jun 2026 15:21:28 -0700 Subject: [PATCH] fix(skills-hub): stop shipping a degenerate index when GitHub taps collapse (#42347) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Skills Hub lost every api.github.com-backed source — the OpenAI, Anthropic, HuggingFace, NVIDIA, gstack, Claude Marketplace and Well-Known tabs all vanished — while ClawHub/skills.sh/LobeHub/browse.sh survived. A GitHub API rate limit during the docs-deploy crawl zeroed all three api.github.com sources (github / claude-marketplace / well-known) at once. Two compounding bugs let the broken index reach the live site: 1. build_skills_index.py wrote the output file BEFORE the health check, so even when the github floor (30) tripped and the script exited 2, the degenerate file was already on disk. deploy-site.yml then swallowed the exit code with `|| echo non-fatal` and extract-skills.py read the partial index. Fix: run the health check first, write the file only when healthy, exit without writing on failure. Removed the non-fatal swallow in deploy-site.yml so a collapse fails the deploy and the last good site stays live (Pages serves the previous build). 2. The build-time GitHub listing path returned [] on a 403 rate-limit without retrying or flagging it, so a rate-limited crawl looked identical to an empty source. Fix: a shared _github_get() helper on GitHubSource with retry/backoff (honors Retry-After / X-RateLimit-Reset on 403/429, backs off on 5xx + transport errors) and flags is_rate_limited. Routed _list_skills_in_repo and _fetch_file_content through it; gave ClaudeMarketplaceSource a persistent GitHubSource + is_rate_limited so the builder can name the rate limit as the cause instead of '0 results'. 
Added tests/scripts/test_build_skills_index_health.py pinning both contracts: a degenerate crawl exits non-zero and writes no file; a healthy crawl writes the index with github/claude-marketplace/well-known all present. --- .github/workflows/deploy-site.yml | 22 ++- scripts/build_skills_index.py | 65 +++++--- .../scripts/test_build_skills_index_health.py | 99 ++++++++++++ tools/skills_hub.py | 141 ++++++++++++++---- 4 files changed, 274 insertions(+), 53 deletions(-) create mode 100644 tests/scripts/test_build_skills_index_health.py diff --git a/.github/workflows/deploy-site.yml b/.github/workflows/deploy-site.yml index 82acaa6667d..9b3e6426652 100644 --- a/.github/workflows/deploy-site.yml +++ b/.github/workflows/deploy-site.yml @@ -59,12 +59,22 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - # Always rebuild — the file isn't committed (gitignored), so a - # fresh checkout starts without it and we want the freshest crawl - # in every deploy. Failure is non-fatal: extract-skills.py will - # fall back to the legacy snapshot cache and the Skills Hub page - # still renders, just without the latest community catalog. - python3 scripts/build_skills_index.py || echo "Skills index build failed (non-fatal)" + # Rebuild the unified catalog. The file is gitignored, so a fresh + # checkout starts without it and we want the freshest crawl in + # every deploy. + # + # This MUST be fatal. build_skills_index.py runs a health check and + # exits non-zero WITHOUT writing the output file when a source + # collapses (e.g. a GitHub API rate limit zeroes the github / + # claude-marketplace / well-known taps all at once). Letting the + # deploy continue would either (a) ship a degenerate index missing + # whole hubs — the June 2026 regression where OpenAI/Anthropic/ + # HuggingFace/NVIDIA tabs vanished — or (b) fall through to a + # local-only catalog. 
Failing here keeps the last good deployment + # live (GitHub Pages serves the previous build) instead of + # publishing a broken catalog. Re-run the workflow once the + # transient rate limit clears. + python3 scripts/build_skills_index.py - name: Extract skill metadata for dashboard run: python3 website/scripts/extract-skills.py diff --git a/scripts/build_skills_index.py b/scripts/build_skills_index.py index 2712ae5403a..a5bf900d831 100644 --- a/scripts/build_skills_index.py +++ b/scripts/build_skills_index.py @@ -297,6 +297,21 @@ def main(): # Batch resolve GitHub paths for skills.sh entries all_skills = batch_resolve_paths(all_skills, auth) + # Collect which sources hit a GitHub API rate limit during the crawl. + # github / claude-marketplace / well-known all read api.github.com, so a + # rate-limited token zeroes all three at once — surfaced below so the + # failure message names the real cause instead of "source returned 0". + rate_limited_sources = { + name for name, source in sources.items() + if getattr(source, "is_rate_limited", False) + } + if rate_limited_sources: + print( + " WARNING: GitHub API rate limit hit for: " + + ", ".join(sorted(rate_limited_sources)), + file=sys.stderr, + ) + # Deduplicate by identifier seen: dict[str, dict] = {} for skill in all_skills: @@ -311,25 +326,9 @@ def main(): "browse-sh": 5, "claude-marketplace": 6, "lobehub": 7} deduped.sort(key=lambda s: (source_order.get(s["source"], 99), s["name"])) - # Build index - index = { - "version": INDEX_VERSION, - "generated_at": datetime.now(timezone.utc).isoformat(), - "skill_count": len(deduped), - "skills": deduped, - } - - os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True) - with open(OUTPUT_PATH, "w", encoding="utf-8") as f: - json.dump(index, f, separators=(",", ":"), ensure_ascii=False) - - elapsed = time.time() - overall_start - file_size = os.path.getsize(OUTPUT_PATH) - print(f"\nDone! 
{len(deduped)} skills indexed in {elapsed:.0f}s") - print(f"Output: {OUTPUT_PATH} ({file_size / 1024:.0f} KB)") - from collections import Counter by_source = Counter(s["source"] for s in deduped) + print(f"\nCrawled {len(deduped)} skills in {time.time() - overall_start:.0f}s") for src, count in sorted(by_source.items(), key=lambda x: -x[1]): resolved = sum(1 for s in deduped if s["source"] == src and s.get("resolved_github_id")) @@ -380,14 +379,46 @@ def main(): ) for line in health_errors: print(line, file=sys.stderr) + if rate_limited_sources: + print( + "\nGitHub API rate limit was hit during this crawl for: " + + ", ".join(sorted(rate_limited_sources)) + + ". This is the usual cause of an all-GitHub-tap collapse " + "(github / claude-marketplace / well-known dropping to zero " + "together). Re-run with a higher-quota GITHUB_TOKEN.", + file=sys.stderr, + ) print( "\nIf the drop is expected (e.g. a hub is genuinely shutting " "down), lower the floor in scripts/build_skills_index.py " "EXPECTED_FLOORS in the same PR.", file=sys.stderr, ) + # IMPORTANT: do NOT write OUTPUT_PATH on failure. The index file is + # gitignored, so a fresh deploy checkout has no copy on disk — leaving + # it absent lets website/scripts/extract-skills.py fall back to the + # legacy snapshot cache (or skip the unified index) instead of reading + # a degenerate file. Writing-then-exiting-2 was the bug that shipped an + # index with every GitHub-API source dropped to zero: deploy-site.yml + # swallowed the exit code with `|| echo non-fatal`, and the partial file + # was already on disk for extract-skills to pick up. sys.exit(2) + # Healthy — only now write the index out for the docs build to consume. 
+ index = { + "version": INDEX_VERSION, + "generated_at": datetime.now(timezone.utc).isoformat(), + "skill_count": len(deduped), + "skills": deduped, + } + os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True) + with open(OUTPUT_PATH, "w", encoding="utf-8") as f: + json.dump(index, f, separators=(",", ":"), ensure_ascii=False) + file_size = os.path.getsize(OUTPUT_PATH) + print(f"\nDone! {len(deduped)} skills indexed in " + f"{time.time() - overall_start:.0f}s") + print(f"Output: {OUTPUT_PATH} ({file_size / 1024:.0f} KB)") + if __name__ == "__main__": main() diff --git a/tests/scripts/test_build_skills_index_health.py b/tests/scripts/test_build_skills_index_health.py new file mode 100644 index 00000000000..c5116917d1b --- /dev/null +++ b/tests/scripts/test_build_skills_index_health.py @@ -0,0 +1,99 @@ +"""Invariants for scripts/build_skills_index.py's health-check guard. + +Regression context (June 2026): a GitHub API rate limit zeroed every +api.github.com-backed source (github / claude-marketplace / well-known) at +once during the docs deploy crawl. The build's health check fired and exited +non-zero — but it had ALREADY written the degenerate index to disk, and +deploy-site.yml swallowed the exit code with ``|| echo non-fatal``. The +partial index (missing the OpenAI/Anthropic/HuggingFace/NVIDIA tabs) shipped +to the live Skills Hub. + +These tests pin the two contracts that prevent a recurrence: + 1. A degenerate crawl exits non-zero AND does NOT write the output file + (so extract-skills.py falls back instead of reading a broken index). + 2. A healthy crawl exits zero AND writes the file with every source present. 
+""" + +import os +import sys +import types + +import pytest + +import scripts.build_skills_index as build_mod + + +def _meta(name, src): + return build_mod.SkillMeta( + name=name, description="d", source=src, + identifier=f"{src}/{name}", trust_level="community", + ) + + +class _FakeSource: + def __init__(self, src, n, rate_limited=False): + self._src = src + self._n = n + self.is_rate_limited = rate_limited + + def search(self, query, limit=10): + return [_meta(f"{self._src}-{i}", self._src) for i in range(self._n)] + + +def _install_fake_sources(monkeypatch, *, github_count, claude_count=40, + well_known_count=10, github_rate_limited=False): + monkeypatch.setattr(build_mod, "SkillsShSource", lambda auth: _FakeSource("skills.sh", 15000)) + monkeypatch.setattr(build_mod, "OptionalSkillSource", lambda: _FakeSource("official", 95)) + monkeypatch.setattr(build_mod, "WellKnownSkillSource", lambda: _FakeSource("well-known", well_known_count)) + monkeypatch.setattr( + build_mod, "GitHubSource", + lambda auth: _FakeSource("github", github_count, rate_limited=github_rate_limited), + ) + monkeypatch.setattr(build_mod, "ClawHubSource", lambda: _FakeSource("clawhub", 69000)) + monkeypatch.setattr( + build_mod, "ClaudeMarketplaceSource", + lambda auth: _FakeSource("claude-marketplace", claude_count, rate_limited=github_rate_limited), + ) + monkeypatch.setattr(build_mod, "LobeHubSource", lambda: _FakeSource("lobehub", 500)) + monkeypatch.setattr(build_mod, "BrowseShSource", lambda: _FakeSource("browse-sh", 380)) + monkeypatch.setattr( + build_mod, "crawl_skills_sh", + lambda source: [build_mod._meta_to_dict(m) for m in source.search("", 0)], + ) + monkeypatch.setattr(build_mod, "batch_resolve_paths", lambda skills, auth: skills) + monkeypatch.setattr( + build_mod, "GitHubAuth", + lambda: types.SimpleNamespace(auth_method=lambda: "token"), + ) + + +def test_degenerate_crawl_exits_nonzero_and_writes_no_file(tmp_path, monkeypatch): + """A collapsed GitHub crawl must fail loud and 
leave OUTPUT_PATH unwritten.""" + out = tmp_path / "skills-index.json" + monkeypatch.setattr(build_mod, "OUTPUT_PATH", str(out)) + _install_fake_sources(monkeypatch, github_count=0, claude_count=0, + well_known_count=0, github_rate_limited=True) + + with pytest.raises(SystemExit) as exc: + build_mod.main() + + assert exc.value.code != 0 + # The degenerate index must NOT have been written — extract-skills.py + # relies on the file's absence to fall back instead of reading garbage. + assert not out.exists() + + +def test_healthy_crawl_writes_index_with_all_sources(tmp_path, monkeypatch): + out = tmp_path / "skills-index.json" + monkeypatch.setattr(build_mod, "OUTPUT_PATH", str(out)) + _install_fake_sources(monkeypatch, github_count=200) + + build_mod.main() # exit 0 (no SystemExit) + + assert out.exists() + import json + data = json.loads(out.read_text()) + sources = {s["source"] for s in data["skills"]} + # Every GitHub-API-backed source that vanished in the regression is present. + assert {"github", "claude-marketplace", "well-known"} <= sources + assert data["skill_count"] == len(data["skills"]) diff --git a/tools/skills_hub.py b/tools/skills_hub.py index c2a22eef3fe..ca40bb55c05 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -550,11 +550,8 @@ class GitHubSource(SkillSource): return [SkillMeta(**s) for s in cached] url = f"https://api.github.com/repos/{repo}/contents/{path.rstrip('/')}" - try: - resp = httpx.get(url, headers=self.auth.get_headers(), timeout=15, follow_redirects=True) - if resp.status_code != 200: - return [] - except httpx.HTTPError: + resp = self._github_get(url) + if resp is None or resp.status_code != 200: return [] entries = resp.json() @@ -639,15 +636,98 @@ class GitHubSource(SkillSource): def _check_rate_limit_response(self, resp: "httpx.Response") -> None: """Flag the instance as rate-limited when GitHub returns 403 + exhausted quota.""" - if resp.status_code == 403: + if resp.status_code in (403, 429): remaining = 
resp.headers.get("X-RateLimit-Remaining", "") - if remaining == "0": + if remaining == "0" or resp.status_code == 429: self._rate_limited = True logger.warning( "GitHub API rate limit exhausted (unauthenticated: 60 req/hr). " "Set GITHUB_TOKEN or install the gh CLI to raise the limit to 5,000/hr." ) + def _github_get( + self, + url: str, + *, + params: Optional[Dict] = None, + headers: Optional[Dict] = None, + timeout: float = 15.0, + max_retries: int = 3, + ) -> Optional["httpx.Response"]: + """GET against the GitHub API with retry/backoff on transient failures. + + Returns the final ``httpx.Response`` (caller inspects status) or + ``None`` when the final attempt raised a transport error. + + Retries on: + - 403 with ``X-RateLimit-Remaining: 0``, or any 429 — waits for + ``Retry-After`` (capped at 60s) or until ``X-RateLimit-Reset`` when + that is at most 60s away, else exponential backoff. This is the + all-GitHub-tap-collapse case: a single shared rate limit zeroes + github + claude-marketplace + well-known at once during the index build. + - 5xx and connection/timeout errors — exponential backoff. + + On terminal rate-limit exhaustion the instance is flagged via + ``_check_rate_limit_response`` so the build can fail loud instead of + silently shipping an index with the GitHub sources dropped to zero. + """ + hdrs = headers if headers is not None else self.auth.get_headers() + backoff = 1.0 + last_resp: Optional["httpx.Response"] = None + for attempt in range(max_retries): + try: + resp = httpx.get( + url, params=params, headers=hdrs, + timeout=timeout, follow_redirects=True, + ) + except httpx.HTTPError as e: + logger.debug("GitHub GET %s failed (attempt %d/%d): %s", + url, attempt + 1, max_retries, e) + if attempt < max_retries - 1: + time.sleep(backoff) + backoff = min(backoff * 2, 30.0) + continue + return None + + last_resp = resp + if resp.status_code == 200: + return resp + + # Rate-limited: honor the reset header when present, else back off. 
+ if resp.status_code in (403, 429): + remaining = resp.headers.get("X-RateLimit-Remaining", "") + is_rl = remaining == "0" or resp.status_code == 429 + if is_rl and attempt < max_retries - 1: + wait = backoff + reset = resp.headers.get("X-RateLimit-Reset", "") + retry_after = resp.headers.get("Retry-After", "") + if retry_after.isdigit(): + wait = min(float(retry_after), 60.0) + elif reset.isdigit(): + delta = float(reset) - time.time() + if 0 < delta <= 60.0: + wait = delta + logger.debug( + "GitHub rate limited on %s, waiting %.1fs (attempt %d/%d)", + url, wait, attempt + 1, max_retries, + ) + time.sleep(wait) + backoff = min(backoff * 2, 30.0) + continue + # Out of retries (or not a rate-limit 403) — flag and return. + self._check_rate_limit_response(resp) + return resp + + # 5xx — retry; 4xx (other than rate limit) — return immediately. + if 500 <= resp.status_code < 600 and attempt < max_retries - 1: + time.sleep(backoff) + backoff = min(backoff * 2, 30.0) + continue + return resp + + return last_resp + + def _download_directory(self, repo: str, path: str) -> Dict[str, str]: """Recursively download all text files from a GitHub directory. 
@@ -768,17 +848,12 @@ class GitHubSource(SkillSource): def _fetch_file_content(self, repo: str, path: str) -> Optional[str]: """Fetch a single file's content from GitHub.""" url = f"https://api.github.com/repos/{repo}/contents/{path}" - try: - resp = httpx.get( - url, - headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"}, - timeout=15, follow_redirects=True, - ) - if resp.status_code == 200: - return resp.text - self._check_rate_limit_response(resp) - except httpx.HTTPError as e: - logger.debug("GitHub contents API fetch failed: %s", e) + resp = self._github_get( + url, + headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"}, + ) + if resp is not None and resp.status_code == 200: + return resp.text return None def _get_skillsh_groupings(self, repo: str) -> Optional[Dict[str, str]]: @@ -2373,10 +2448,19 @@ class ClaudeMarketplaceSource(SkillSource): def __init__(self, auth: GitHubAuth): self.auth = auth + # Persistent GitHubSource so rate-limit state survives across the + # marketplace-index fetch + per-skill inspect calls and can be + # surfaced to the index builder (see is_rate_limited). 
+ self.github = GitHubSource(auth=auth) def source_id(self) -> str: return "claude-marketplace" + @property + def is_rate_limited(self) -> bool: + """Whether the underlying GitHub API hit a rate limit during the crawl.""" + return self.github.is_rate_limited + def trust_level_for(self, identifier: str) -> str: parts = identifier.split("/", 2) if len(parts) >= 2: @@ -2415,15 +2499,13 @@ class ClaudeMarketplaceSource(SkillSource): def fetch(self, identifier: str) -> Optional[SkillBundle]: # Delegate to GitHub Contents API since marketplace skills live in GitHub repos - gh = GitHubSource(auth=self.auth) - bundle = gh.fetch(identifier) + bundle = self.github.fetch(identifier) if bundle: bundle.source = "claude-marketplace" return bundle def inspect(self, identifier: str) -> Optional[SkillMeta]: - gh = GitHubSource(auth=self.auth) - meta = gh.inspect(identifier) + meta = self.github.inspect(identifier) if meta: meta.source = "claude-marketplace" meta.trust_level = self.trust_level_for(identifier) @@ -2437,16 +2519,15 @@ class ClaudeMarketplaceSource(SkillSource): return cached url = f"https://api.github.com/repos/{repo}/contents/.claude-plugin/marketplace.json" + resp = self.github._github_get( + url, + headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"}, + ) + if resp is None or resp.status_code != 200: + return [] try: - resp = httpx.get( - url, - headers={**self.auth.get_headers(), "Accept": "application/vnd.github.v3.raw"}, - timeout=15, - ) - if resp.status_code != 200: - return [] data = json.loads(resp.text) - except (httpx.HTTPError, json.JSONDecodeError): + except json.JSONDecodeError: return [] plugins = data.get("plugins", [])