From 7050c052e38b1ea6f8785e8f38b29127d6d7b283 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 28 May 2026 11:28:12 -0700 Subject: [PATCH] =?UTF-8?q?fix(skills):=20pull=20full=20skills.sh=20catalo?= =?UTF-8?q?g=20via=20sitemap=20(858=20=E2=86=92=2019,932)=20(#34025)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skills.sh source was returning ~858 unique skills from a hardcoded list of 28 popular keyword searches (each capped at 50 results). The real catalog is ~20k — exposed via sitemap-skills-{1,2}.xml linked from the site's sitemap index. Switch the empty-query path in SkillsShSource.search() to walk the sitemap instead of scraping the homepage's curated featured strip. Falls back to the homepage scrape if the sitemap is unreachable. build_skills_index.crawl_skills_sh() now just calls search("", limit=0) instead of running 28 keyword searches — same result in one HTTP round instead of 28. Also handle a httpx + brotlicffi interaction: the per-skill sitemaps are ~900 KB brotli-compressed and the cffi backend's streaming decode chokes on them. Forcing Accept-Encoding to gzip dodges the bug without requiring a brotli library upgrade. E2E against live skills.sh: 19,932 unique skills walked in 0.7s. Tests: 137 pass (+1 new regression test exercising the sitemap path). Floor for skills.sh raised 100 → 10,000 in EXPECTED_FLOORS so a future regression hard-fails the build. --- scripts/build_skills_index.py | 43 ++++++------- tests/tools/test_skills_hub.py | 62 +++++++++++++++++++ tools/skills_hub.py | 106 ++++++++++++++++++++++++++++++++- 3 files changed, 189 insertions(+), 22 deletions(-) diff --git a/scripts/build_skills_index.py b/scripts/build_skills_index.py index c1490669006..2712ae5403a 100644 --- a/scripts/build_skills_index.py +++ b/scripts/build_skills_index.py @@ -80,30 +80,27 @@ def crawl_source(source, source_name: str, limit: int) -> list: def crawl_skills_sh(source: SkillsShSource) -> list: - """Crawl skills.sh using popular queries for broad coverage.""" - print(" Crawling skills.sh (popular queries)...", flush=True) + """Crawl skills.sh via its sitemap to enumerate the full catalog (~20k entries). + + Previously walked a hardcoded list of ~28 popular keywords (each capped at + 50 results) which yielded ~850 unique skills — about 4% of the real catalog. + The SkillsShSource.search("") path now hits the sitemap directly, returning + the full 20k-entry catalog deduplicated by canonical identifier. + """ + print(" Crawling skills.sh (sitemap)...", flush=True) start = time.time() - queries = [ - "", # featured - "react", "python", "web", "api", "database", "docker", - "testing", "scraping", "design", "typescript", "git", - "aws", "security", "data", "ml", "ai", "devops", - "frontend", "backend", "mobile", "cli", "documentation", - "kubernetes", "terraform", "rust", "go", "java", - ] + try: + results = source.search("", limit=0) # 0 = no cap, return the whole catalog + except Exception as e: + print(f" Warning: skills.sh sitemap walk failed: {e}", file=sys.stderr) + results = [] all_skills: dict[str, dict] = {} - for query in queries: - try: - results = source.search(query, limit=50) - for meta in results: - entry = _meta_to_dict(meta) - if entry["identifier"] not in all_skills: - all_skills[entry["identifier"]] = entry - except Exception as e: - print(f" Warning: skills.sh search '{query}' failed: {e}", - file=sys.stderr) + for meta in results: + entry = _meta_to_dict(meta) + if entry["identifier"] not in all_skills: + all_skills[entry["identifier"]] = entry elapsed = time.time() - start print(f" skills.sh: {len(all_skills)} unique skills ({elapsed:.1f}s)", @@ -345,7 +342,11 @@ def main(): # or rate limiting kicked in. Failing here forces a human look before # the broken index reaches the live docs. EXPECTED_FLOORS = { - "skills.sh": 100, + # skills.sh now uses the sitemap walker (~20k catalog as of May 2026). + # Anything under 10k means the sitemap shape changed or fetches failed + # — better to fail loudly than ship a regression to the 858-skill + # popular-queries era. + "skills.sh": 10000, "lobehub": 100, # ClawHub had 49,698+ skills as of May 2026 — anything under 20k means # pagination broke or the API surface changed. Fail loudly rather diff --git a/tests/tools/test_skills_hub.py b/tests/tools/test_skills_hub.py index 22406a8bacd..85bd4c5e17c 100644 --- a/tests/tools/test_skills_hub.py +++ b/tests/tools/test_skills_hub.py @@ -472,6 +472,68 @@ class TestSkillsShSource: requested_urls = [call.args[0] for call in mock_get.call_args_list] assert root_url not in requested_urls + @patch("tools.skills_hub._write_index_cache") + @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch("tools.skills_hub.httpx.get") + def test_empty_query_walks_sitemap_not_homepage( + self, mock_get, _mock_read_cache, _mock_write_cache, + ): + """Empty query must walk the full sitemap. + + Regression for skills.sh shipping ~858/20000 skills: the previous + empty-query path scraped the homepage's featured strip (~200 entries), + and build_skills_index.py supplemented it with 28 popular keyword + searches to drag the count to ~850. The sitemap walker hits the + full ~20k catalog in one pass. + """ + index_xml = """ + + https://www.skills.sh/sitemap-misc.xml + https://www.skills.sh/sitemap-skills-1.xml + https://www.skills.sh/sitemap-skills-2.xml +""" + skills_1_xml = """ + + https://www.skills.sh/anthropics/skills/frontend-design + https://www.skills.sh/anthropics/skills/pdf + https://www.skills.sh/vercel-labs/agent-skills/react-best-practices +""" + skills_2_xml = """ + + https://www.skills.sh/microsoft/azure-skills/azure-ai + https://www.skills.sh/anthropics/skills/frontend-design +""" + + def side_effect(url, *args, **kwargs): + resp = MagicMock(status_code=200) + if url.endswith("/sitemap.xml"): + resp.text = index_xml + elif "sitemap-skills-1" in url: + resp.text = skills_1_xml + elif "sitemap-skills-2" in url: + resp.text = skills_2_xml + else: + resp.status_code = 404 + resp.text = "" + return resp + + mock_get.side_effect = side_effect + + results = self._source().search("", limit=0) + + # 4 unique skills (the frontend-design dup across sitemaps collapsed). + assert len(results) == 4 + identifiers = {r.identifier for r in results} + assert identifiers == { + "skills-sh/anthropics/skills/frontend-design", + "skills-sh/anthropics/skills/pdf", + "skills-sh/vercel-labs/agent-skills/react-best-practices", + "skills-sh/microsoft/azure-skills/azure-ai", + } + # Homepage was NOT fetched — the sitemap path is taken on empty query. + urls_called = [call.args[0] for call in mock_get.call_args_list] + assert not any(u == "https://skills.sh" or u == "https://skills.sh/" for u in urls_called) + class TestFindSkillInRepoTree: """Tests for GitHubSource._find_skill_in_repo_tree.""" diff --git a/tools/skills_hub.py b/tools/skills_hub.py index b0d58122b34..084494e6b70 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -1217,6 +1217,16 @@ class SkillsShSource(SkillSource): BASE_URL = "https://skills.sh" SEARCH_URL = f"{BASE_URL}/api/search" + # Sitemap index — the real catalog source. The homepage scrape only + # exposes a curated featured strip (~200 entries); the sitemap covers + # the full ~20k+ catalog. https://www.skills.sh/sitemap.xml points at + # sitemap-skills-1.xml + sitemap-skills-2.xml, each up to 10k URLs. + SITEMAP_INDEX_URL = "https://www.skills.sh/sitemap.xml" + _SITEMAP_LOC_RE = re.compile(r"([^<]+)", re.IGNORECASE) + _SITEMAP_SKILL_RE = re.compile( + r"^https?://(?:www\.)?skills\.sh/(?P[^/]+)/(?P[^/]+)/(?P[^/]+)/?$", + re.IGNORECASE, + ) _SKILL_LINK_RE = re.compile(r'href=["\']/(?P(?!agents/|_next/|api/)[^"\'/]+/[^"\'/]+/[^"\'/]+)["\']') _INSTALL_CMD_RE = re.compile( r'npx\s+skills\s+add\s+(?Phttps?://github\.com/[^\s<]+|[^\s<]+)' @@ -1246,7 +1256,10 @@ class SkillsShSource(SkillSource): def search(self, query: str, limit: int = 10) -> List[SkillMeta]: if not query.strip(): - return self._featured_skills(limit) + # Empty query = bulk catalog dump (what build_skills_index.py + # calls with). The homepage scrape only sees ~200 featured + # entries; the sitemap walks the full ~20k+ catalog. + return self._sitemap_catalog(limit) cache_key = f"skills_sh_search_{hashlib.md5(f'{query}|{limit}'.encode()).hexdigest()}" cached = _read_index_cache(cache_key) @@ -1307,6 +1320,97 @@ class SkillsShSource(SkillSource): return self._finalize_inspect_meta(meta, canonical, detail) return None + def _sitemap_catalog(self, limit: int) -> List[SkillMeta]: + """Walk the skills.sh sitemap to enumerate the full catalog. + + Cached for the standard index TTL so we don't refetch ~2 MB of + sitemap XML per build. Falls back to ``_featured_skills`` if the + sitemap is unreachable or empty (network failure, hostname + change, etc.). + """ + cache_key = "skills_sh_sitemap_v1" + cached = _read_index_cache(cache_key) + if cached is not None: + metas = [SkillMeta(**item) for item in cached] + return metas[:limit] if limit > 0 else metas + + # skills.sh serves the per-skill sitemaps brotli-compressed, and + # httpx's optional brotlicffi backend has a streaming-decode bug + # that fails on these specific payloads. Excluding "br" from + # Accept-Encoding makes the server fall back to gzip (or + # identity), which works on every httpx install. + sitemap_headers = {"Accept-Encoding": "gzip"} + + # Step 1: fetch the sitemap index → list of skill-sitemap URLs. + skill_sitemap_urls: List[str] = [] + try: + resp = httpx.get( + self.SITEMAP_INDEX_URL, + timeout=20, + follow_redirects=True, + headers=sitemap_headers, + ) + if resp.status_code != 200: + return self._featured_skills(limit) + for match in self._SITEMAP_LOC_RE.finditer(resp.text): + loc = match.group(1).strip() + # Sitemap index entries that point at the per-skill maps. + if "sitemap-skills" in loc: + skill_sitemap_urls.append(loc) + except httpx.HTTPError: + return self._featured_skills(limit) + + if not skill_sitemap_urls: + return self._featured_skills(limit) + + # Step 2: fetch each skill sitemap and collect canonical "owner/repo/skill" IDs. + seen: set[str] = set() + results: List[SkillMeta] = [] + for sitemap_url in skill_sitemap_urls: + try: + resp = httpx.get( + sitemap_url, + timeout=30, + follow_redirects=True, + headers=sitemap_headers, + ) + if resp.status_code != 200: + continue + except httpx.HTTPError: + continue + for loc_match in self._SITEMAP_LOC_RE.finditer(resp.text): + url = loc_match.group(1).strip() + m = self._SITEMAP_SKILL_RE.match(url) + if not m: + continue + owner = m.group("owner") + repo_name = m.group("repo") + skill_name = m.group("skill") + canonical = f"{owner}/{repo_name}/{skill_name}" + if canonical in seen: + continue + seen.add(canonical) + repo = f"{owner}/{repo_name}" + results.append(SkillMeta( + name=skill_name, + description=f"Indexed by skills.sh from {repo}", + source="skills.sh", + identifier=self._wrap_identifier(canonical), + trust_level=self.github.trust_level_for(canonical), + repo=repo, + path=skill_name, + extra={ + "detail_url": f"{self.BASE_URL}/{canonical}", + "repo_url": f"https://github.com/{repo}", + }, + )) + + if not results: + return self._featured_skills(limit) + + _write_index_cache(cache_key, [_skill_meta_to_dict(item) for item in results]) + return results[:limit] if limit > 0 else results + def _featured_skills(self, limit: int) -> List[SkillMeta]: cache_key = "skills_sh_featured" cached = _read_index_cache(cache_key)