From 7050c052e38b1ea6f8785e8f38b29127d6d7b283 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 28 May 2026 11:28:12 -0700
Subject: [PATCH] =?UTF-8?q?fix(skills):=20pull=20full=20skills.sh=20catalo?=
=?UTF-8?q?g=20via=20sitemap=20(858=20=E2=86=92=2019,932)=20(#34025)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The skills.sh source was returning ~858 unique skills from a hardcoded
list of 28 popular keyword searches (each capped at 50 results). The
real catalog is ~20k — exposed via sitemap-skills-{1,2}.xml linked from
the site's sitemap index.
Switch the empty-query path in SkillsShSource.search() to walk the
sitemap instead of scraping the homepage's curated featured strip.
Falls back to the homepage scrape if the sitemap is unreachable.
build_skills_index.crawl_skills_sh() now just calls search("", limit=0)
instead of running 28 keyword searches — same result in one HTTP round
instead of 28.
Also handle a httpx + brotlicffi interaction: the per-skill sitemaps
are ~900 KB brotli-compressed and the cffi backend's streaming decode
chokes on them. Forcing Accept-Encoding to gzip dodges the bug without
requiring a brotli library upgrade.
E2E against live skills.sh: 19,932 unique skills walked in 0.7s.
Tests: 137 pass (+1 new regression test exercising the sitemap path).
Floor for skills.sh raised 100 → 10,000 in EXPECTED_FLOORS so a future
regression hard-fails the build.
---
scripts/build_skills_index.py | 43 ++++++-------
tests/tools/test_skills_hub.py | 62 +++++++++++++++++++
tools/skills_hub.py | 106 ++++++++++++++++++++++++++++++++-
3 files changed, 189 insertions(+), 22 deletions(-)
diff --git a/scripts/build_skills_index.py b/scripts/build_skills_index.py
index c1490669006..2712ae5403a 100644
--- a/scripts/build_skills_index.py
+++ b/scripts/build_skills_index.py
@@ -80,30 +80,27 @@ def crawl_source(source, source_name: str, limit: int) -> list:
def crawl_skills_sh(source: SkillsShSource) -> list:
- """Crawl skills.sh using popular queries for broad coverage."""
- print(" Crawling skills.sh (popular queries)...", flush=True)
+ """Crawl skills.sh via its sitemap to enumerate the full catalog (~20k entries).
+
+ Previously walked a hardcoded list of ~28 popular keywords (each capped at
+ 50 results) which yielded ~850 unique skills — about 4% of the real catalog.
+ The SkillsShSource.search("") path now hits the sitemap directly, returning
+ the full 20k-entry catalog deduplicated by canonical identifier.
+ """
+ print(" Crawling skills.sh (sitemap)...", flush=True)
start = time.time()
- queries = [
- "", # featured
- "react", "python", "web", "api", "database", "docker",
- "testing", "scraping", "design", "typescript", "git",
- "aws", "security", "data", "ml", "ai", "devops",
- "frontend", "backend", "mobile", "cli", "documentation",
- "kubernetes", "terraform", "rust", "go", "java",
- ]
+ try:
+ results = source.search("", limit=0) # 0 = no cap, return the whole catalog
+ except Exception as e:
+ print(f" Warning: skills.sh sitemap walk failed: {e}", file=sys.stderr)
+ results = []
all_skills: dict[str, dict] = {}
- for query in queries:
- try:
- results = source.search(query, limit=50)
- for meta in results:
- entry = _meta_to_dict(meta)
- if entry["identifier"] not in all_skills:
- all_skills[entry["identifier"]] = entry
- except Exception as e:
- print(f" Warning: skills.sh search '{query}' failed: {e}",
- file=sys.stderr)
+ for meta in results:
+ entry = _meta_to_dict(meta)
+ if entry["identifier"] not in all_skills:
+ all_skills[entry["identifier"]] = entry
elapsed = time.time() - start
print(f" skills.sh: {len(all_skills)} unique skills ({elapsed:.1f}s)",
@@ -345,7 +342,11 @@ def main():
# or rate limiting kicked in. Failing here forces a human look before
# the broken index reaches the live docs.
EXPECTED_FLOORS = {
- "skills.sh": 100,
+ # skills.sh now uses the sitemap walker (~20k catalog as of May 2026).
+ # Anything under 10k means the sitemap shape changed or fetches failed
+ # — better to fail loudly than ship a regression to the 858-skill
+ # popular-queries era.
+ "skills.sh": 10000,
"lobehub": 100,
# ClawHub had 49,698+ skills as of May 2026 — anything under 20k means
# pagination broke or the API surface changed. Fail loudly rather
diff --git a/tests/tools/test_skills_hub.py b/tests/tools/test_skills_hub.py
index 22406a8bacd..85bd4c5e17c 100644
--- a/tests/tools/test_skills_hub.py
+++ b/tests/tools/test_skills_hub.py
@@ -472,6 +472,68 @@ class TestSkillsShSource:
requested_urls = [call.args[0] for call in mock_get.call_args_list]
assert root_url not in requested_urls
+ @patch("tools.skills_hub._write_index_cache")
+ @patch("tools.skills_hub._read_index_cache", return_value=None)
+ @patch("tools.skills_hub.httpx.get")
+ def test_empty_query_walks_sitemap_not_homepage(
+ self, mock_get, _mock_read_cache, _mock_write_cache,
+ ):
+ """Empty query must walk the full sitemap.
+
+ Regression for skills.sh shipping ~858/20000 skills: the previous
+ empty-query path scraped the homepage's featured strip (~200 entries),
+ and build_skills_index.py supplemented it with 28 popular keyword
+ searches to drag the count to ~850. The sitemap walker hits the
+ full ~20k catalog in one pass.
+ """
+ index_xml = """
+
+ https://www.skills.sh/sitemap-misc.xml
+ https://www.skills.sh/sitemap-skills-1.xml
+ https://www.skills.sh/sitemap-skills-2.xml
+"""
+ skills_1_xml = """
+
+ https://www.skills.sh/anthropics/skills/frontend-design
+ https://www.skills.sh/anthropics/skills/pdf
+ https://www.skills.sh/vercel-labs/agent-skills/react-best-practices
+"""
+ skills_2_xml = """
+
+ https://www.skills.sh/microsoft/azure-skills/azure-ai
+ https://www.skills.sh/anthropics/skills/frontend-design
+"""
+
+ def side_effect(url, *args, **kwargs):
+ resp = MagicMock(status_code=200)
+ if url.endswith("/sitemap.xml"):
+ resp.text = index_xml
+ elif "sitemap-skills-1" in url:
+ resp.text = skills_1_xml
+ elif "sitemap-skills-2" in url:
+ resp.text = skills_2_xml
+ else:
+ resp.status_code = 404
+ resp.text = ""
+ return resp
+
+ mock_get.side_effect = side_effect
+
+ results = self._source().search("", limit=0)
+
+ # 4 unique skills (the frontend-design dup across sitemaps collapsed).
+ assert len(results) == 4
+ identifiers = {r.identifier for r in results}
+ assert identifiers == {
+ "skills-sh/anthropics/skills/frontend-design",
+ "skills-sh/anthropics/skills/pdf",
+ "skills-sh/vercel-labs/agent-skills/react-best-practices",
+ "skills-sh/microsoft/azure-skills/azure-ai",
+ }
+ # Homepage was NOT fetched — the sitemap path is taken on empty query.
+ urls_called = [call.args[0] for call in mock_get.call_args_list]
+ assert not any(u == "https://skills.sh" or u == "https://skills.sh/" for u in urls_called)
+
class TestFindSkillInRepoTree:
"""Tests for GitHubSource._find_skill_in_repo_tree."""
diff --git a/tools/skills_hub.py b/tools/skills_hub.py
index b0d58122b34..084494e6b70 100644
--- a/tools/skills_hub.py
+++ b/tools/skills_hub.py
@@ -1217,6 +1217,16 @@ class SkillsShSource(SkillSource):
BASE_URL = "https://skills.sh"
SEARCH_URL = f"{BASE_URL}/api/search"
+ # Sitemap index — the real catalog source. The homepage scrape only
+ # exposes a curated featured strip (~200 entries); the sitemap covers
+ # the full ~20k+ catalog. https://www.skills.sh/sitemap.xml points at
+ # sitemap-skills-1.xml + sitemap-skills-2.xml, each up to 10k URLs.
+ SITEMAP_INDEX_URL = "https://www.skills.sh/sitemap.xml"
+ _SITEMAP_LOC_RE = re.compile(r"([^<]+)", re.IGNORECASE)
+ _SITEMAP_SKILL_RE = re.compile(
+ r"^https?://(?:www\.)?skills\.sh/(?P[^/]+)/(?P[^/]+)/(?P[^/]+)/?$",
+ re.IGNORECASE,
+ )
_SKILL_LINK_RE = re.compile(r'href=["\']/(?P(?!agents/|_next/|api/)[^"\'/]+/[^"\'/]+/[^"\'/]+)["\']')
_INSTALL_CMD_RE = re.compile(
r'npx\s+skills\s+add\s+(?Phttps?://github\.com/[^\s<]+|[^\s<]+)'
@@ -1246,7 +1256,10 @@ class SkillsShSource(SkillSource):
def search(self, query: str, limit: int = 10) -> List[SkillMeta]:
if not query.strip():
- return self._featured_skills(limit)
+ # Empty query = bulk catalog dump (what build_skills_index.py
+ # calls with). The homepage scrape only sees ~200 featured
+ # entries; the sitemap walks the full ~20k+ catalog.
+ return self._sitemap_catalog(limit)
cache_key = f"skills_sh_search_{hashlib.md5(f'{query}|{limit}'.encode()).hexdigest()}"
cached = _read_index_cache(cache_key)
@@ -1307,6 +1320,97 @@ class SkillsShSource(SkillSource):
return self._finalize_inspect_meta(meta, canonical, detail)
return None
+ def _sitemap_catalog(self, limit: int) -> List[SkillMeta]:
+ """Walk the skills.sh sitemap to enumerate the full catalog.
+
+ Cached for the standard index TTL so we don't refetch ~2 MB of
+ sitemap XML per build. Falls back to ``_featured_skills`` if the
+ sitemap is unreachable or empty (network failure, hostname
+ change, etc.).
+ """
+ cache_key = "skills_sh_sitemap_v1"
+ cached = _read_index_cache(cache_key)
+ if cached is not None:
+ metas = [SkillMeta(**item) for item in cached]
+ return metas[:limit] if limit > 0 else metas
+
+ # skills.sh serves the per-skill sitemaps brotli-compressed, and
+ # httpx's optional brotlicffi backend has a streaming-decode bug
+ # that fails on these specific payloads. Excluding "br" from
+ # Accept-Encoding makes the server fall back to gzip (or
+ # identity), which works on every httpx install.
+ sitemap_headers = {"Accept-Encoding": "gzip"}
+
+ # Step 1: fetch the sitemap index → list of skill-sitemap URLs.
+ skill_sitemap_urls: List[str] = []
+ try:
+ resp = httpx.get(
+ self.SITEMAP_INDEX_URL,
+ timeout=20,
+ follow_redirects=True,
+ headers=sitemap_headers,
+ )
+ if resp.status_code != 200:
+ return self._featured_skills(limit)
+ for match in self._SITEMAP_LOC_RE.finditer(resp.text):
+ loc = match.group(1).strip()
+ # Sitemap index entries that point at the per-skill maps.
+ if "sitemap-skills" in loc:
+ skill_sitemap_urls.append(loc)
+ except httpx.HTTPError:
+ return self._featured_skills(limit)
+
+ if not skill_sitemap_urls:
+ return self._featured_skills(limit)
+
+ # Step 2: fetch each skill sitemap and collect canonical "owner/repo/skill" IDs.
+ seen: set[str] = set()
+ results: List[SkillMeta] = []
+ for sitemap_url in skill_sitemap_urls:
+ try:
+ resp = httpx.get(
+ sitemap_url,
+ timeout=30,
+ follow_redirects=True,
+ headers=sitemap_headers,
+ )
+ if resp.status_code != 200:
+ continue
+ except httpx.HTTPError:
+ continue
+ for loc_match in self._SITEMAP_LOC_RE.finditer(resp.text):
+ url = loc_match.group(1).strip()
+ m = self._SITEMAP_SKILL_RE.match(url)
+ if not m:
+ continue
+ owner = m.group("owner")
+ repo_name = m.group("repo")
+ skill_name = m.group("skill")
+ canonical = f"{owner}/{repo_name}/{skill_name}"
+ if canonical in seen:
+ continue
+ seen.add(canonical)
+ repo = f"{owner}/{repo_name}"
+ results.append(SkillMeta(
+ name=skill_name,
+ description=f"Indexed by skills.sh from {repo}",
+ source="skills.sh",
+ identifier=self._wrap_identifier(canonical),
+ trust_level=self.github.trust_level_for(canonical),
+ repo=repo,
+ path=skill_name,
+ extra={
+ "detail_url": f"{self.BASE_URL}/{canonical}",
+ "repo_url": f"https://github.com/{repo}",
+ },
+ ))
+
+ if not results:
+ return self._featured_skills(limit)
+
+ _write_index_cache(cache_key, [_skill_meta_to_dict(item) for item in results])
+ return results[:limit] if limit > 0 else results
+
def _featured_skills(self, limit: int) -> List[SkillMeta]:
cache_key = "skills_sh_featured"
cached = _read_index_cache(cache_key)