diff --git a/scripts/build_skills_index.py b/scripts/build_skills_index.py index 9b9277547f7..c1490669006 100644 --- a/scripts/build_skills_index.py +++ b/scripts/build_skills_index.py @@ -269,11 +269,28 @@ def main(): # Crawl skills.sh all_skills.extend(crawl_skills_sh(skills_sh_source)) - # Crawl other sources in parallel + # Crawl other sources in parallel. + # Per-source soft caps — sources stop returning when they run out, so these + # are ceilings, not targets. ClawHub has ~50k skills; a 200k cap + # (well above current catalog size) lets the full catalog land in the + # index instead of being truncated at an arbitrary build-time limit. + SOURCE_LIMITS = { + # ClawHub had 49,698+ skills as of May 2026; 200k leaves headroom. + "clawhub": 200_000, + "lobehub": 100_000, + "browse-sh": 5_000, + "claude-marketplace": 5_000, + "github": 5_000, + "well-known": 5_000, + "official": 5_000, + } + DEFAULT_SOURCE_LIMIT = 500 + with ThreadPoolExecutor(max_workers=4) as pool: futures = {} for name, source in sources.items(): - futures[pool.submit(crawl_source, source, name, 500)] = name + limit = SOURCE_LIMITS.get(name, DEFAULT_SOURCE_LIMIT) + futures[pool.submit(crawl_source, source, name, limit)] = name for future in as_completed(futures): try: all_skills.extend(future.result()) @@ -330,7 +347,11 @@ def main(): EXPECTED_FLOORS = { "skills.sh": 100, "lobehub": 100, - "clawhub": 50, + # ClawHub had 49,698+ skills as of May 2026 — anything under 20k means + # pagination broke or the API surface changed. Fail loudly rather + # than ship a degenerate index (we shipped 200/50000 silently for + # weeks because the floor was 50). 
+ "clawhub": 20000, "official": 50, "github": 30, # collapsed across all GitHub taps "browse-sh": 50, diff --git a/tests/tools/test_skills_hub_clawhub.py b/tests/tools/test_skills_hub_clawhub.py index 2b2863498a3..6b45d081d09 100644 --- a/tests/tools/test_skills_hub_clawhub.py +++ b/tests/tools/test_skills_hub_clawhub.py @@ -298,6 +298,58 @@ class TestClawHubSource(unittest.TestCase): self.assertIsNone(bundle) self.assertEqual(mock_get.call_count, 3) + @patch("tools.skills_hub._write_index_cache") + @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch("tools.skills_hub.httpx.get") + def test_search_empty_query_paginates_full_catalog( + self, mock_get, _mock_read_cache, _mock_write_cache + ): + """Empty query must walk the cursor-paginated catalog. + + Regression for the silent 200-skill truncation: ClawHub's listing + endpoint caps any single page at 200 items + returns a `nextCursor`. + The build_skills_index.py crawler calls `search("", limit=N)` with a + large N to dump the full catalog. Before the fix, that hit a single + unpaginated request and silently dropped 99% of the catalog. + """ + # Three pages: 200 + 200 + 50 items, then no cursor → stop. 
+ page_calls = {"n": 0} + pages = [ + { + "items": [{"slug": f"a-skill-{i}", "displayName": f"A {i}"} for i in range(200)], + "nextCursor": "cursor-page-2", + }, + { + "items": [{"slug": f"b-skill-{i}", "displayName": f"B {i}"} for i in range(200)], + "nextCursor": "cursor-page-3", + }, + { + "items": [{"slug": f"c-skill-{i}", "displayName": f"C {i}"} for i in range(50)], + "nextCursor": None, + }, + ] + + def side_effect(url, *args, **kwargs): + if url.endswith("/skills"): + idx = page_calls["n"] + page_calls["n"] += 1 + if idx < len(pages): + return _MockResponse(status_code=200, json_data=pages[idx]) + return _MockResponse(status_code=200, json_data={"items": []}) + return _MockResponse(status_code=404, json_data={}) + + mock_get.side_effect = side_effect + + results = self.src.search("", limit=10_000) + + # 200 + 200 + 50 = 450 unique skills, all retrieved via cursor pagination. + self.assertEqual(len(results), 450) + self.assertEqual(page_calls["n"], 3, "expected exactly 3 cursor-paginated pages") + identifiers = {meta.identifier for meta in results} + self.assertIn("a-skill-0", identifiers) + self.assertIn("b-skill-199", identifiers) + self.assertIn("c-skill-49", identifiers) + if __name__ == "__main__": unittest.main() diff --git a/tools/skills_hub.py b/tools/skills_hub.py index 01b53b68691..b0d58122b34 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -1859,8 +1859,18 @@ class ClawHubSource(SkillSource): results = self._search_catalog(query, limit=limit) if results: return results + else: + # Empty query: route through the paginating catalog walker so the + # full ClawHub catalog (20k+ skills) lands in the index. The + # single-request listing path below caps at one page (200 items) + # regardless of `limit`, which silently truncates the public + # skills index. The catalog walker follows `nextCursor`. 
+ catalog = self._load_catalog_index() + if catalog: + return self._dedupe_results(catalog)[:limit] if limit > 0 else self._dedupe_results(catalog) - # Empty query or catalog fallback failure: use the lightweight listing API. + # Non-empty query catalog miss, or catalog walker failure: fall back to + # the lightweight listing API for a best-effort response. cache_key = f"clawhub_search_listing_v1_{hashlib.md5(query.encode()).hexdigest()}_{limit}" cached = _read_index_cache(cache_key) if cached is not None: @@ -1989,7 +1999,12 @@ class ClawHubSource(SkillSource): cursor: Optional[str] = None results: List[SkillMeta] = [] seen: set[str] = set() - max_pages = 50 + # ClawHub has 50k+ skills as of May 2026 (live E2E walked 49,698 with + # an active cursor still pending); 750 pages * 200/page = 150k ceiling + # leaves room for catalog growth. Walk-to-exhaustion typically + # terminates well before this on `nextCursor` going None — the cap is + # a safety rail against an infinite-cursor loop. + max_pages = 750 for _ in range(max_pages): params: Dict[str, Any] = {"limit": 200}