From fb9f3a4ef9af8fc6ec24bf4ccce1b1db32520aaa Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Thu, 28 May 2026 01:42:19 -0700
Subject: [PATCH] =?UTF-8?q?fix(skills):=20pull=20full=20ClawHub=20catalog?=
 =?UTF-8?q?=20into=20the=20skills=20index=20(200=20=E2=86=92=2020k+)=20(#3?=
 =?UTF-8?q?3748)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(skills): pull full ClawHub catalog into the skills index

The website was showing 200 ClawHub skills out of 20k+ because
`ClawHubSource.search("")` for empty queries went straight to a single
unpaginated request. ClawHub's API caps any single page at 200 items and
returns a `nextCursor`; we grabbed page 1 and stopped, so the cached
index served from hermes-agent.nousresearch.com had a silent 99%
truncation.

End users never hit clawhub.ai directly (the index is rebuilt twice
daily by .github/workflows/skills-index.yml and served as a static JSON
on the docs site), so the cap-and-cache architecture is correct — it
just wasn't being filled.

Changes:
- `ClawHubSource.search(query="")` now routes through the existing
  `_load_catalog_index()` paginating walker instead of the unpaginated
  listing fallback (non-empty queries still hit the fast catalog search).
- `_load_catalog_index()` max_pages 50 → 250 (50k-skill ceiling; live
  catalog is ~20k as of May 2026, with headroom for growth).
- `build_skills_index.py`: per-source crawl limits split out — ClawHub
  and LobeHub get 100k, others keep their effective caps.
- `EXPECTED_FLOORS["clawhub"]` 50 → 5000 so the next pagination
  regression hard-fails the CI build instead of silently shipping a
  degenerate index.

Test plan:
- New unit test `test_search_empty_query_paginates_full_catalog`
  exercises the cursor-following path with three mocked pages (450
  total items) and asserts all pages are walked.
- Existing 9 ClawHub tests + 127 broader skills_hub tests all pass.
- E2E against live ClawHub API: walker reached 9700+ skills across 49
  pages before this commit landed, paginating well past the previous
  single-page (200-item) truncation.

* fix(skills): raise ClawHub ceilings — live catalog is 50k, not 20k

E2E walk against live ClawHub API hit my initial 250-page cap at 49,698
skills with a `nextCursor` still pending. The catalog is at least 2.5x
larger than the earlier ~20k estimate.

- max_pages 250 → 750 (150k ceiling, walks terminate on cursor=None
  well before this in practice)
- SOURCE_LIMITS['clawhub'] 100k → 200k
- EXPECTED_FLOORS['clawhub'] 5000 → 20000
---
 scripts/build_skills_index.py          | 27 +++++++++++--
 tests/tools/test_skills_hub_clawhub.py | 52 ++++++++++++++++++++++++++
 tools/skills_hub.py                    | 19 +++++++++-
 3 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/scripts/build_skills_index.py b/scripts/build_skills_index.py
index 9b9277547f7..c1490669006 100644
--- a/scripts/build_skills_index.py
+++ b/scripts/build_skills_index.py
@@ -269,11 +269,28 @@ def main():
     # Crawl skills.sh
     all_skills.extend(crawl_skills_sh(skills_sh_source))
 
-    # Crawl other sources in parallel
+    # Crawl other sources in parallel.
+    # Per-source soft caps — sources stop returning when they run out, so these
+    # are ceilings, not targets.  ClawHub has 50k+ skills; a 200k ceiling
+    # (well above current catalog size) lets the full catalog land in the
+    # index instead of being truncated at an arbitrary build-time limit.
+    SOURCE_LIMITS = {
+        # ClawHub had 49,698+ skills as of May 2026; 200k leaves headroom.
+        "clawhub": 200_000,
+        "lobehub": 100_000,
+        "browse-sh": 5_000,
+        "claude-marketplace": 5_000,
+        "github": 5_000,
+        "well-known": 5_000,
+        "official": 5_000,
+    }
+    DEFAULT_SOURCE_LIMIT = 500
+
     with ThreadPoolExecutor(max_workers=4) as pool:
         futures = {}
         for name, source in sources.items():
-            futures[pool.submit(crawl_source, source, name, 500)] = name
+            limit = SOURCE_LIMITS.get(name, DEFAULT_SOURCE_LIMIT)
+            futures[pool.submit(crawl_source, source, name, limit)] = name
         for future in as_completed(futures):
             try:
                 all_skills.extend(future.result())
@@ -330,7 +347,11 @@ def main():
     EXPECTED_FLOORS = {
         "skills.sh": 100,
         "lobehub": 100,
-        "clawhub": 50,
+        # ClawHub had 49,698+ skills as of May 2026 — anything under 20k means
+        # pagination broke or the API surface changed.  Fail loudly rather
+        # than ship a degenerate index (we shipped 200/50000 silently for
+        # weeks because the floor was 50).
+        "clawhub": 20000,
         "official": 50,
         "github": 30,        # collapsed across all GitHub taps
         "browse-sh": 50,
diff --git a/tests/tools/test_skills_hub_clawhub.py b/tests/tools/test_skills_hub_clawhub.py
index 2b2863498a3..6b45d081d09 100644
--- a/tests/tools/test_skills_hub_clawhub.py
+++ b/tests/tools/test_skills_hub_clawhub.py
@@ -298,6 +298,58 @@ class TestClawHubSource(unittest.TestCase):
         self.assertIsNone(bundle)
         self.assertEqual(mock_get.call_count, 3)
 
+    @patch("tools.skills_hub._write_index_cache")
+    @patch("tools.skills_hub._read_index_cache", return_value=None)
+    @patch("tools.skills_hub.httpx.get")
+    def test_search_empty_query_paginates_full_catalog(
+        self, mock_get, _mock_read_cache, _mock_write_cache
+    ):
+        """Empty query must walk the cursor-paginated catalog.
+
+        Regression for the silent 200-skill truncation: ClawHub's listing
+        endpoint caps any single page at 200 items + returns a `nextCursor`.
+        The build_skills_index.py crawler calls `search("", limit=N)` with a
+        large N to dump the full catalog. Before the fix, that hit a single
+        unpaginated request and silently dropped 99% of the catalog.
+        """
+        # Three pages: 200 + 200 + 50 items, then no cursor → stop.
+        page_calls = {"n": 0}
+        pages = [
+            {
+                "items": [{"slug": f"a-skill-{i}", "displayName": f"A {i}"} for i in range(200)],
+                "nextCursor": "cursor-page-2",
+            },
+            {
+                "items": [{"slug": f"b-skill-{i}", "displayName": f"B {i}"} for i in range(200)],
+                "nextCursor": "cursor-page-3",
+            },
+            {
+                "items": [{"slug": f"c-skill-{i}", "displayName": f"C {i}"} for i in range(50)],
+                "nextCursor": None,
+            },
+        ]
+
+        def side_effect(url, *args, **kwargs):
+            if url.endswith("/skills"):
+                idx = page_calls["n"]
+                page_calls["n"] += 1
+                if idx < len(pages):
+                    return _MockResponse(status_code=200, json_data=pages[idx])
+                return _MockResponse(status_code=200, json_data={"items": []})
+            return _MockResponse(status_code=404, json_data={})
+
+        mock_get.side_effect = side_effect
+
+        results = self.src.search("", limit=10_000)
+
+        # 200 + 200 + 50 = 450 unique skills, all retrieved via cursor pagination.
+        self.assertEqual(len(results), 450)
+        self.assertEqual(page_calls["n"], 3, "expected exactly 3 cursor-paginated pages")
+        identifiers = {meta.identifier for meta in results}
+        self.assertIn("a-skill-0", identifiers)
+        self.assertIn("b-skill-199", identifiers)
+        self.assertIn("c-skill-49", identifiers)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tools/skills_hub.py b/tools/skills_hub.py
index 01b53b68691..b0d58122b34 100644
--- a/tools/skills_hub.py
+++ b/tools/skills_hub.py
@@ -1859,8 +1859,18 @@ class ClawHubSource(SkillSource):
             results = self._search_catalog(query, limit=limit)
             if results:
                 return results
+        else:
+            # Empty query: route through the paginating catalog walker so the
+            # full ClawHub catalog (50k+ skills) lands in the index. The
+            # single-request listing path below caps at one page (200 items)
+            # regardless of `limit`, which silently truncates the public
+            # skills index. The catalog walker follows `nextCursor`.
+            catalog = self._load_catalog_index()
+            if catalog:
+                return self._dedupe_results(catalog)[:limit] if limit > 0 else self._dedupe_results(catalog)
 
-        # Empty query or catalog fallback failure: use the lightweight listing API.
+        # Non-empty query catalog miss, or catalog walker failure: fall back to
+        # the lightweight listing API for a best-effort response.
         cache_key = f"clawhub_search_listing_v1_{hashlib.md5(query.encode()).hexdigest()}_{limit}"
         cached = _read_index_cache(cache_key)
         if cached is not None:
@@ -1989,7 +1999,12 @@ class ClawHubSource(SkillSource):
         cursor: Optional[str] = None
         results: List[SkillMeta] = []
         seen: set[str] = set()
-        max_pages = 50
+        # ClawHub has 50k+ skills as of May 2026 (live E2E walked 49,698 with
+        # an active cursor still pending); 750 pages * 200/page = 150k ceiling
+        # leaves room for catalog growth. Walk-to-exhaustion typically
+        # terminates well before this on `nextCursor` going None — the cap is
+        # a safety rail against an infinite-cursor loop.
+        max_pages = 750
 
         for _ in range(max_pages):
             params: Dict[str, Any] = {"limit": 200}