fix(skills): pull full skills.sh catalog via sitemap (858 → 19,932) (#34025)

The skills.sh source was returning ~858 unique skills from a hardcoded list of 28 popular keyword searches (each capped at 50 results). The real catalog is ~20k — exposed via sitemap-skills-{1,2}.xml linked from the site's sitemap index. Switch the empty-query path in SkillsShSource.search() to walk the sitemap instead of scraping the homepage's curated featured strip. Falls back to the homepage scrape if the sitemap is unreachable. build_skills_index.crawl_skills_sh() now just calls search("", limit=0) instead of running 28 keyword searches — same result in one HTTP round instead of 28. Also handle a httpx + brotlicffi interaction: the per-skill sitemaps are ~900 KB brotli-compressed and the cffi backend's streaming decode chokes on them. Forcing Accept-Encoding to gzip dodges the bug without requiring a brotli library upgrade. E2E against live skills.sh: 19,932 unique skills walked in 0.7s. Tests: 137 pass (+1 new regression test exercising the sitemap path). Floor for skills.sh raised 100 → 10,000 in EXPECTED_FLOORS so a future regression hard-fails the build.
2026-07-17 14:42:06 +00:00 · 2026-05-28 11:28:12 -07:00 · 2026-05-28 11:28:12 -07:00 · 7050c052e3
commit 7050c052e3
parent b1d3ead7fb
3 changed files with 189 additions and 22 deletions
--- a/scripts/build_skills_index.py
+++ b/scripts/build_skills_index.py
@ -80,30 +80,27 @@ def crawl_source(source, source_name: str, limit: int) -> list:


 def crawl_skills_sh(source: SkillsShSource) -> list:
-    """Crawl skills.sh using popular queries for broad coverage."""
-    print("  Crawling skills.sh (popular queries)...", flush=True)
+    """Crawl skills.sh via its sitemap to enumerate the full catalog (~20k entries).
+
+    Previously walked a hardcoded list of ~28 popular keywords (each capped at
+    50 results) which yielded ~850 unique skills — about 4% of the real catalog.
+    The SkillsShSource.search("") path now hits the sitemap directly, returning
+    the full 20k-entry catalog deduplicated by canonical identifier.
+    """
+    print("  Crawling skills.sh (sitemap)...", flush=True)
    start = time.time()

-    queries = [
-        "",  # featured
-        "react", "python", "web", "api", "database", "docker",
-        "testing", "scraping", "design", "typescript", "git",
-        "aws", "security", "data", "ml", "ai", "devops",
-        "frontend", "backend", "mobile", "cli", "documentation",
-        "kubernetes", "terraform", "rust", "go", "java",
-    ]
+    try:
+        results = source.search("", limit=0)  # 0 = no cap, return the whole catalog
+    except Exception as e:
+        print(f"    Warning: skills.sh sitemap walk failed: {e}", file=sys.stderr)
+        results = []

    all_skills: dict[str, dict] = {}
-    for query in queries:
-        try:
-            results = source.search(query, limit=50)
-            for meta in results:
-                entry = _meta_to_dict(meta)
-                if entry["identifier"] not in all_skills:
-                    all_skills[entry["identifier"]] = entry
-        except Exception as e:
-            print(f"    Warning: skills.sh search '{query}' failed: {e}",
-                  file=sys.stderr)
+    for meta in results:
+        entry = _meta_to_dict(meta)
+        if entry["identifier"] not in all_skills:
+            all_skills[entry["identifier"]] = entry

    elapsed = time.time() - start
    print(f"  skills.sh: {len(all_skills)} unique skills ({elapsed:.1f}s)",
@ -345,7 +342,11 @@ def main():
    # or rate limiting kicked in.  Failing here forces a human look before
    # the broken index reaches the live docs.
    EXPECTED_FLOORS = {
-        "skills.sh": 100,
+        # skills.sh now uses the sitemap walker (~20k catalog as of May 2026).
+        # Anything under 10k means the sitemap shape changed or fetches failed
+        # — better to fail loudly than ship a regression to the 858-skill
+        # popular-queries era.
+        "skills.sh": 10000,
        "lobehub": 100,
        # ClawHub had 49,698+ skills as of May 2026 — anything under 20k means
        # pagination broke or the API surface changed.  Fail loudly rather
--- a/tests/tools/test_skills_hub.py
+++ b/tests/tools/test_skills_hub.py
@ -472,6 +472,68 @@ class TestSkillsShSource:
        requested_urls = [call.args[0] for call in mock_get.call_args_list]
        assert root_url not in requested_urls

+    @patch("tools.skills_hub._write_index_cache")
+    @patch("tools.skills_hub._read_index_cache", return_value=None)
+    @patch("tools.skills_hub.httpx.get")
+    def test_empty_query_walks_sitemap_not_homepage(
+        self, mock_get, _mock_read_cache, _mock_write_cache,
+    ):
+        """Empty query must walk the full sitemap.
+
+        Regression for skills.sh shipping ~858/20000 skills: the previous
+        empty-query path scraped the homepage's featured strip (~200 entries),
+        and build_skills_index.py supplemented it with 28 popular keyword
+        searches to drag the count to ~850. The sitemap walker hits the
+        full ~20k catalog in one pass.
+        """
+        index_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <sitemap><loc>https://www.skills.sh/sitemap-misc.xml</loc></sitemap>
+  <sitemap><loc>https://www.skills.sh/sitemap-skills-1.xml</loc></sitemap>
+  <sitemap><loc>https://www.skills.sh/sitemap-skills-2.xml</loc></sitemap>
+</sitemapindex>"""
+        skills_1_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://www.skills.sh/anthropics/skills/frontend-design</loc></url>
+  <url><loc>https://www.skills.sh/anthropics/skills/pdf</loc></url>
+  <url><loc>https://www.skills.sh/vercel-labs/agent-skills/react-best-practices</loc></url>
+</urlset>"""
+        skills_2_xml = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+  <url><loc>https://www.skills.sh/microsoft/azure-skills/azure-ai</loc></url>
+  <url><loc>https://www.skills.sh/anthropics/skills/frontend-design</loc></url>
+</urlset>"""
+
+        def side_effect(url, *args, **kwargs):
+            resp = MagicMock(status_code=200)
+            if url.endswith("/sitemap.xml"):
+                resp.text = index_xml
+            elif "sitemap-skills-1" in url:
+                resp.text = skills_1_xml
+            elif "sitemap-skills-2" in url:
+                resp.text = skills_2_xml
+            else:
+                resp.status_code = 404
+                resp.text = ""
+            return resp
+
+        mock_get.side_effect = side_effect
+
+        results = self._source().search("", limit=0)
+
+        # 4 unique skills (the frontend-design dup across sitemaps collapsed).
+        assert len(results) == 4
+        identifiers = {r.identifier for r in results}
+        assert identifiers == {
+            "skills-sh/anthropics/skills/frontend-design",
+            "skills-sh/anthropics/skills/pdf",
+            "skills-sh/vercel-labs/agent-skills/react-best-practices",
+            "skills-sh/microsoft/azure-skills/azure-ai",
+        }
+        # Homepage was NOT fetched — the sitemap path is taken on empty query.
+        urls_called = [call.args[0] for call in mock_get.call_args_list]
+        assert not any(u == "https://skills.sh" or u == "https://skills.sh/" for u in urls_called)
+

 class TestFindSkillInRepoTree:
    """Tests for GitHubSource._find_skill_in_repo_tree."""
--- a/tools/skills_hub.py
+++ b/tools/skills_hub.py
@ -1217,6 +1217,16 @@ class SkillsShSource(SkillSource):

    BASE_URL = "https://skills.sh"
    SEARCH_URL = f"{BASE_URL}/api/search"
+    # Sitemap index — the real catalog source. The homepage scrape only
+    # exposes a curated featured strip (~200 entries); the sitemap covers
+    # the full ~20k+ catalog. https://www.skills.sh/sitemap.xml points at
+    # sitemap-skills-1.xml + sitemap-skills-2.xml, each up to 10k URLs.
+    SITEMAP_INDEX_URL = "https://www.skills.sh/sitemap.xml"
+    _SITEMAP_LOC_RE = re.compile(r"<loc>([^<]+)</loc>", re.IGNORECASE)
+    _SITEMAP_SKILL_RE = re.compile(
+        r"^https?://(?:www\.)?skills\.sh/(?P<owner>[^/]+)/(?P<repo>[^/]+)/(?P<skill>[^/]+)/?$",
+        re.IGNORECASE,
+    )
    _SKILL_LINK_RE = re.compile(r'href=["\']/(?P<id>(?!agents/|_next/|api/)[^"\'/]+/[^"\'/]+/[^"\'/]+)["\']')
    _INSTALL_CMD_RE = re.compile(
        r'npx\s+skills\s+add\s+(?P<repo>https?://github\.com/[^\s<]+|[^\s<]+)'
@ -1246,7 +1256,10 @@ class SkillsShSource(SkillSource):

    def search(self, query: str, limit: int = 10) -> List[SkillMeta]:
        if not query.strip():
-            return self._featured_skills(limit)
+            # Empty query = bulk catalog dump (what build_skills_index.py
+            # calls with). The homepage scrape only sees ~200 featured
+            # entries; the sitemap walks the full ~20k+ catalog.
+            return self._sitemap_catalog(limit)

        cache_key = f"skills_sh_search_{hashlib.md5(f'{query}|{limit}'.encode()).hexdigest()}"
        cached = _read_index_cache(cache_key)
@ -1307,6 +1320,97 @@ class SkillsShSource(SkillSource):
            return self._finalize_inspect_meta(meta, canonical, detail)
        return None

+    def _sitemap_catalog(self, limit: int) -> List[SkillMeta]:
+        """Walk the skills.sh sitemap to enumerate the full catalog.
+
+        Cached for the standard index TTL so we don't refetch ~2 MB of
+        sitemap XML per build. Falls back to ``_featured_skills`` if the
+        sitemap is unreachable or empty (network failure, hostname
+        change, etc.).
+        """
+        cache_key = "skills_sh_sitemap_v1"
+        cached = _read_index_cache(cache_key)
+        if cached is not None:
+            metas = [SkillMeta(**item) for item in cached]
+            return metas[:limit] if limit > 0 else metas
+
+        # skills.sh serves the per-skill sitemaps brotli-compressed, and
+        # httpx's optional brotlicffi backend has a streaming-decode bug
+        # that fails on these specific payloads. Excluding "br" from
+        # Accept-Encoding makes the server fall back to gzip (or
+        # identity), which works on every httpx install.
+        sitemap_headers = {"Accept-Encoding": "gzip"}
+
+        # Step 1: fetch the sitemap index → list of skill-sitemap URLs.
+        skill_sitemap_urls: List[str] = []
+        try:
+            resp = httpx.get(
+                self.SITEMAP_INDEX_URL,
+                timeout=20,
+                follow_redirects=True,
+                headers=sitemap_headers,
+            )
+            if resp.status_code != 200:
+                return self._featured_skills(limit)
+            for match in self._SITEMAP_LOC_RE.finditer(resp.text):
+                loc = match.group(1).strip()
+                # Sitemap index entries that point at the per-skill maps.
+                if "sitemap-skills" in loc:
+                    skill_sitemap_urls.append(loc)
+        except httpx.HTTPError:
+            return self._featured_skills(limit)
+
+        if not skill_sitemap_urls:
+            return self._featured_skills(limit)
+
+        # Step 2: fetch each skill sitemap and collect canonical "owner/repo/skill" IDs.
+        seen: set[str] = set()
+        results: List[SkillMeta] = []
+        for sitemap_url in skill_sitemap_urls:
+            try:
+                resp = httpx.get(
+                    sitemap_url,
+                    timeout=30,
+                    follow_redirects=True,
+                    headers=sitemap_headers,
+                )
+                if resp.status_code != 200:
+                    continue
+            except httpx.HTTPError:
+                continue
+            for loc_match in self._SITEMAP_LOC_RE.finditer(resp.text):
+                url = loc_match.group(1).strip()
+                m = self._SITEMAP_SKILL_RE.match(url)
+                if not m:
+                    continue
+                owner = m.group("owner")
+                repo_name = m.group("repo")
+                skill_name = m.group("skill")
+                canonical = f"{owner}/{repo_name}/{skill_name}"
+                if canonical in seen:
+                    continue
+                seen.add(canonical)
+                repo = f"{owner}/{repo_name}"
+                results.append(SkillMeta(
+                    name=skill_name,
+                    description=f"Indexed by skills.sh from {repo}",
+                    source="skills.sh",
+                    identifier=self._wrap_identifier(canonical),
+                    trust_level=self.github.trust_level_for(canonical),
+                    repo=repo,
+                    path=skill_name,
+                    extra={
+                        "detail_url": f"{self.BASE_URL}/{canonical}",
+                        "repo_url": f"https://github.com/{repo}",
+                    },
+                ))
+
+        if not results:
+            return self._featured_skills(limit)
+
+        _write_index_cache(cache_key, [_skill_meta_to_dict(item) for item in results])
+        return results[:limit] if limit > 0 else results
+
    def _featured_skills(self, limit: int) -> List[SkillMeta]:
        cache_key = "skills_sh_featured"
        cached = _read_index_cache(cache_key)