fix(skills): pull full skills.sh catalog via sitemap (858 → 19,932) (#34025)

The skills.sh source was returning ~858 unique skills from a hardcoded
list of 28 popular keyword searches (each capped at 50 results). The
real catalog is ~20k — exposed via sitemap-skills-{1,2}.xml linked from
the site's sitemap index.

Switch the empty-query path in SkillsShSource.search() to walk the
sitemap instead of scraping the homepage's curated featured strip.
Falls back to the homepage scrape if the sitemap is unreachable.

build_skills_index.crawl_skills_sh() now just calls search("", limit=0)
instead of running 28 keyword searches — a single sitemap walk (the index
plus the per-skill sitemaps it links) replaces 28 capped search requests.

Also handle an httpx + brotlicffi interaction: the per-skill sitemaps
are ~900 KB brotli-compressed and the cffi backend's streaming decode
chokes on them. Forcing Accept-Encoding to gzip dodges the bug without
requiring a brotli library upgrade.

E2E against live skills.sh: 19,932 unique skills walked in 0.7s.
Tests: 137 pass (+1 new regression test exercising the sitemap path).

Floor for skills.sh raised 100 → 10,000 in EXPECTED_FLOORS so a future
regression hard-fails the build.
This commit is contained in:
Teknium 2026-05-28 11:28:12 -07:00 committed by GitHub
parent b1d3ead7fb
commit 7050c052e3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 189 additions and 22 deletions

View file

@ -80,30 +80,27 @@ def crawl_source(source, source_name: str, limit: int) -> list:
def crawl_skills_sh(source: SkillsShSource) -> list:
"""Crawl skills.sh using popular queries for broad coverage."""
print(" Crawling skills.sh (popular queries)...", flush=True)
"""Crawl skills.sh via its sitemap to enumerate the full catalog (~20k entries).
Previously walked a hardcoded list of ~28 popular keywords (each capped at
50 results) which yielded ~850 unique skills — about 4% of the real catalog.
The SkillsShSource.search("") path now hits the sitemap directly, returning
the full 20k-entry catalog deduplicated by canonical identifier.
"""
print(" Crawling skills.sh (sitemap)...", flush=True)
start = time.time()
queries = [
"", # featured
"react", "python", "web", "api", "database", "docker",
"testing", "scraping", "design", "typescript", "git",
"aws", "security", "data", "ml", "ai", "devops",
"frontend", "backend", "mobile", "cli", "documentation",
"kubernetes", "terraform", "rust", "go", "java",
]
try:
results = source.search("", limit=0) # 0 = no cap, return the whole catalog
except Exception as e:
print(f" Warning: skills.sh sitemap walk failed: {e}", file=sys.stderr)
results = []
all_skills: dict[str, dict] = {}
for query in queries:
try:
results = source.search(query, limit=50)
for meta in results:
entry = _meta_to_dict(meta)
if entry["identifier"] not in all_skills:
all_skills[entry["identifier"]] = entry
except Exception as e:
print(f" Warning: skills.sh search '{query}' failed: {e}",
file=sys.stderr)
for meta in results:
entry = _meta_to_dict(meta)
if entry["identifier"] not in all_skills:
all_skills[entry["identifier"]] = entry
elapsed = time.time() - start
print(f" skills.sh: {len(all_skills)} unique skills ({elapsed:.1f}s)",
@ -345,7 +342,11 @@ def main():
# or rate limiting kicked in. Failing here forces a human look before
# the broken index reaches the live docs.
EXPECTED_FLOORS = {
"skills.sh": 100,
# skills.sh now uses the sitemap walker (~20k catalog as of May 2026).
# Anything under 10k means the sitemap shape changed or fetches failed
# — better to fail loudly than ship a regression to the 858-skill
# popular-queries era.
"skills.sh": 10000,
"lobehub": 100,
# ClawHub had 49,698+ skills as of May 2026 — anything under 20k means
# pagination broke or the API surface changed. Fail loudly rather

View file

@ -472,6 +472,68 @@ class TestSkillsShSource:
requested_urls = [call.args[0] for call in mock_get.call_args_list]
assert root_url not in requested_urls
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch("tools.skills_hub.httpx.get")
def test_empty_query_walks_sitemap_not_homepage(
    self, mock_get, _mock_read_cache, _mock_write_cache,
):
    """An empty query enumerates skills from the sitemap, not the homepage.

    Guards against the regression where skills.sh contributed only ~858 of
    ~20000 skills because the empty-query path scraped the homepage's
    featured strip. The fake sitemap index links one unrelated sitemap and
    two skill sitemaps that share one duplicate entry.
    """
    index_xml = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://www.skills.sh/sitemap-misc.xml</loc></sitemap>
<sitemap><loc>https://www.skills.sh/sitemap-skills-1.xml</loc></sitemap>
<sitemap><loc>https://www.skills.sh/sitemap-skills-2.xml</loc></sitemap>
</sitemapindex>"""
    skills_1_xml = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://www.skills.sh/anthropics/skills/frontend-design</loc></url>
<url><loc>https://www.skills.sh/anthropics/skills/pdf</loc></url>
<url><loc>https://www.skills.sh/vercel-labs/agent-skills/react-best-practices</loc></url>
</urlset>"""
    skills_2_xml = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://www.skills.sh/microsoft/azure-skills/azure-ai</loc></url>
<url><loc>https://www.skills.sh/anthropics/skills/frontend-design</loc></url>
</urlset>"""

    # Substring marker -> payload for the per-skill sitemaps, checked in
    # order after the sitemap index itself.
    skill_sitemap_bodies = (
        ("sitemap-skills-1", skills_1_xml),
        ("sitemap-skills-2", skills_2_xml),
    )

    def fake_get(url, *args, **kwargs):
        if url.endswith("/sitemap.xml"):
            return MagicMock(status_code=200, text=index_xml)
        for marker, body in skill_sitemap_bodies:
            if marker in url:
                return MagicMock(status_code=200, text=body)
        # Anything else (e.g. the homepage) is treated as missing.
        return MagicMock(status_code=404, text="")

    mock_get.side_effect = fake_get

    results = self._source().search("", limit=0)

    # frontend-design appears in both skill sitemaps and must collapse to
    # one entry, leaving 4 unique skills.
    assert len(results) == 4
    expected_identifiers = {
        "skills-sh/anthropics/skills/frontend-design",
        "skills-sh/anthropics/skills/pdf",
        "skills-sh/vercel-labs/agent-skills/react-best-practices",
        "skills-sh/microsoft/azure-skills/azure-ai",
    }
    assert {meta.identifier for meta in results} == expected_identifiers

    # The homepage must never be requested on the empty-query path.
    homepage_urls = {"https://skills.sh", "https://skills.sh/"}
    fetched_urls = [call.args[0] for call in mock_get.call_args_list]
    assert homepage_urls.isdisjoint(fetched_urls)
class TestFindSkillInRepoTree:
"""Tests for GitHubSource._find_skill_in_repo_tree."""

View file

@ -1217,6 +1217,16 @@ class SkillsShSource(SkillSource):
BASE_URL = "https://skills.sh"
SEARCH_URL = f"{BASE_URL}/api/search"
# Sitemap index — the real catalog source. The homepage scrape only
# exposes a curated featured strip (~200 entries); the sitemap covers
# the full ~20k+ catalog. https://www.skills.sh/sitemap.xml points at
# sitemap-skills-1.xml + sitemap-skills-2.xml, each up to 10k URLs.
SITEMAP_INDEX_URL = "https://www.skills.sh/sitemap.xml"
_SITEMAP_LOC_RE = re.compile(r"<loc>([^<]+)</loc>", re.IGNORECASE)
_SITEMAP_SKILL_RE = re.compile(
r"^https?://(?:www\.)?skills\.sh/(?P<owner>[^/]+)/(?P<repo>[^/]+)/(?P<skill>[^/]+)/?$",
re.IGNORECASE,
)
_SKILL_LINK_RE = re.compile(r'href=["\']/(?P<id>(?!agents/|_next/|api/)[^"\'/]+/[^"\'/]+/[^"\'/]+)["\']')
_INSTALL_CMD_RE = re.compile(
r'npx\s+skills\s+add\s+(?P<repo>https?://github\.com/[^\s<]+|[^\s<]+)'
@ -1246,7 +1256,10 @@ class SkillsShSource(SkillSource):
def search(self, query: str, limit: int = 10) -> List[SkillMeta]:
if not query.strip():
return self._featured_skills(limit)
# Empty query = bulk catalog dump (what build_skills_index.py
# calls with). The homepage scrape only sees ~200 featured
# entries; the sitemap walks the full ~20k+ catalog.
return self._sitemap_catalog(limit)
cache_key = f"skills_sh_search_{hashlib.md5(f'{query}|{limit}'.encode()).hexdigest()}"
cached = _read_index_cache(cache_key)
@ -1307,6 +1320,97 @@ class SkillsShSource(SkillSource):
return self._finalize_inspect_meta(meta, canonical, detail)
return None
def _sitemap_catalog(self, limit: int) -> List[SkillMeta]:
    """Enumerate the skills.sh catalog by walking its sitemap.

    Fetches ``SITEMAP_INDEX_URL``, follows every ``<loc>`` entry whose URL
    contains ``sitemap-skills``, and converts each ``owner/repo/skill``
    URL found there into a ``SkillMeta``, de-duplicated by that canonical
    path.

    Args:
        limit: Maximum number of entries to return. ``0`` or a negative
            value returns everything that was collected.

    Returns:
        The cached catalog when ``_read_index_cache`` has one, otherwise
        the freshly walked catalog. Falls back to
        ``self._featured_skills(limit)`` when the sitemap index cannot be
        fetched, lists no skill sitemaps, or the walk yields no skills.
    """
    cache_key = "skills_sh_sitemap_v1"
    cached = _read_index_cache(cache_key)
    if cached is not None:
        metas = [SkillMeta(**item) for item in cached]
        return metas[:limit] if limit > 0 else metas

    # Original author's note: skills.sh serves the per-skill sitemaps
    # brotli-compressed and httpx's optional brotlicffi backend fails to
    # decode these payloads. Leaving "br" out of Accept-Encoding makes the
    # server answer with gzip (or identity) instead.
    sitemap_headers = {"Accept-Encoding": "gzip"}

    # Step 1: fetch the sitemap index -> list of skill-sitemap URLs.
    try:
        resp = httpx.get(
            self.SITEMAP_INDEX_URL,
            timeout=20,
            follow_redirects=True,
            headers=sitemap_headers,
        )
    except httpx.HTTPError:
        return self._featured_skills(limit)
    if resp.status_code != 200:
        return self._featured_skills(limit)

    # Keep only the index entries that point at the per-skill maps.
    skill_sitemap_urls: List[str] = []
    for match in self._SITEMAP_LOC_RE.finditer(resp.text):
        loc = match.group(1).strip()
        if "sitemap-skills" in loc:
            skill_sitemap_urls.append(loc)
    if not skill_sitemap_urls:
        return self._featured_skills(limit)

    # Step 2: fetch each skill sitemap and collect canonical
    # "owner/repo/skill" IDs.
    seen: set[str] = set()
    results: List[SkillMeta] = []
    walk_complete = True  # flips to False if any skill sitemap is skipped
    for sitemap_url in skill_sitemap_urls:
        try:
            resp = httpx.get(
                sitemap_url,
                timeout=30,
                follow_redirects=True,
                headers=sitemap_headers,
            )
        except httpx.HTTPError:
            walk_complete = False
            continue
        if resp.status_code != 200:
            walk_complete = False
            continue

        for loc_match in self._SITEMAP_LOC_RE.finditer(resp.text):
            url = loc_match.group(1).strip()
            m = self._SITEMAP_SKILL_RE.match(url)
            if not m:
                continue
            owner = m.group("owner")
            repo_name = m.group("repo")
            skill_name = m.group("skill")
            canonical = f"{owner}/{repo_name}/{skill_name}"
            if canonical in seen:
                continue
            seen.add(canonical)
            repo = f"{owner}/{repo_name}"
            results.append(SkillMeta(
                name=skill_name,
                description=f"Indexed by skills.sh from {repo}",
                source="skills.sh",
                identifier=self._wrap_identifier(canonical),
                trust_level=self.github.trust_level_for(canonical),
                repo=repo,
                path=skill_name,
                extra={
                    "detail_url": f"{self.BASE_URL}/{canonical}",
                    "repo_url": f"https://github.com/{repo}",
                },
            ))

    if not results:
        return self._featured_skills(limit)

    # Only a complete walk is cached. A partial result is still returned
    # (best effort), but caching it would make later calls serve a
    # silently truncated catalog for as long as the cache entry is honored.
    if walk_complete:
        _write_index_cache(cache_key, [_skill_meta_to_dict(item) for item in results])
    return results[:limit] if limit > 0 else results
def _featured_skills(self, limit: int) -> List[SkillMeta]:
cache_key = "skills_sh_featured"
cached = _read_index_cache(cache_key)