From eee1da45f07496fbaa028977195875df2709661f Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 10 Jun 2026 01:01:53 -0700 Subject: [PATCH] fix(skills): bound ClawHub catalog walk to requested page on cold start (#43395) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Browse renders one page but the cold-cache fallback walked the entire 50k+ ClawHub catalog, then sliced off the first N — pure waste behind the 12s budget band-aid. _load_catalog_index now takes max_items: browse's empty-query path bounds the walk to its limit and stops early; the offline index builder still passes limit=0 (unbounded) and walks to exhaustion. A bounded walk is partial, so it is not written to the shared full-catalog cache (same poison-guard as the budget-truncated case). --- tests/tools/test_skills_hub_clawhub.py | 106 +++++++++++++++++++++++++ tools/skills_hub.py | 44 +++++++--- 2 files changed, 140 insertions(+), 10 deletions(-) diff --git a/tests/tools/test_skills_hub_clawhub.py b/tests/tools/test_skills_hub_clawhub.py index 9af917a678..972175999f 100644 --- a/tests/tools/test_skills_hub_clawhub.py +++ b/tests/tools/test_skills_hub_clawhub.py @@ -423,5 +423,111 @@ class TestClawHubSource(unittest.TestCase): mock_write_cache.assert_called_once() +class TestClawHubCatalogWalkBounded(unittest.TestCase): + """max_items bounds the walk so browse's cold-start fallback renders one + page without walking the entire 50k+ catalog. The offline index builder + keeps max_items=0 (unbounded) and walks to exhaustion.""" + + def setUp(self): + self.src = ClawHubSource() + self._safe_patcher = patch("tools.skills_hub.is_safe_url", return_value=True) + self._policy_patcher = patch("tools.skills_hub.check_website_access", return_value=None) + self._safe_patcher.start() + self._policy_patcher.start() + + def tearDown(self): + self._policy_patcher.stop() + self._safe_patcher.stop() + + def _infinite_pages(self, page_calls): + """A side_effect that always advertises another cursor — the walk would + never stop on its own, so only max_items / budget can break it.""" + + def side_effect(url, *args, **kwargs): + if url.endswith("/skills"): + idx = page_calls["n"] + page_calls["n"] += 1 + return _MockResponse( + status_code=200, + json_data={ + "items": [ + {"slug": f"skill-{idx}", "displayName": f"Skill {idx}"} + ], + "nextCursor": f"cursor-{idx + 1}", + }, + ) + return _MockResponse(status_code=404, json_data={}) + + return side_effect + + @patch("tools.skills_hub._write_index_cache") + @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch("tools.skills_hub.httpx.get") + def test_max_items_stops_walk_early_and_does_not_cache( + self, mock_get, _mock_read_cache, mock_write_cache + ): + """A bounded walk stops as soon as it has >= max_items skills and must + NOT poison the shared full-catalog cache with the partial slice.""" + page_calls = {"n": 0} + mock_get.side_effect = self._infinite_pages(page_calls) + + results = self.src._load_catalog_index(max_items=5) + + # Each mocked page yields exactly 1 item, so ~5 pages cover the bound. + self.assertGreaterEqual(len(results), 5) + self.assertLess(page_calls["n"], 750, "bounded walk should stop well before the cap") + self.assertLess(page_calls["n"], 20, "should stop within a few pages of the bound") + # Partial (bounded) walk must not be cached. + mock_write_cache.assert_not_called() + + @patch("tools.skills_hub._write_index_cache") + @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch("tools.skills_hub.httpx.get") + def test_max_items_zero_is_unbounded_and_caches( + self, mock_get, _mock_read_cache, mock_write_cache + ): + """max_items=0 (the index builder's path) walks to natural termination + and DOES cache the complete catalog.""" + + def side_effect(url, *args, **kwargs): + if url.endswith("/skills"): + return _MockResponse( + status_code=200, + json_data={ + "items": [ + {"slug": "a", "displayName": "A"}, + {"slug": "b", "displayName": "B"}, + {"slug": "c", "displayName": "C"}, + ], + # No nextCursor -> natural termination. + }, + ) + return _MockResponse(status_code=404, json_data={}) + + mock_get.side_effect = side_effect + + results = self.src._load_catalog_index(max_items=0) + + self.assertEqual(len(results), 3) + mock_write_cache.assert_called_once() + + @patch("tools.skills_hub._write_index_cache") + @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch("tools.skills_hub.httpx.get") + def test_empty_query_browse_bounds_walk_to_limit( + self, mock_get, _mock_read_cache, _mock_write_cache + ): + """search("", limit=N) is the browse cold-start path — it must bound the + catalog walk to N rather than walking the whole 50k+ catalog.""" + page_calls = {"n": 0} + mock_get.side_effect = self._infinite_pages(page_calls) + + results = self.src.search("", limit=10) + + self.assertEqual(len(results), 10, "browse page should be exactly `limit` items") + # Walk stopped near the bound, not at the 750-page cap. + self.assertLess(page_calls["n"], 30) + + if __name__ == "__main__": unittest.main() diff --git a/tools/skills_hub.py b/tools/skills_hub.py index ec00c33aa3..7750eb1b96 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -2119,12 +2119,13 @@ class ClawHubSource(SkillSource): if results: return results else: - # Empty query: route through the paginating catalog walker so the - # full ClawHub catalog (20k+ skills) lands in the index. The - # single-request listing path below caps at one page (200 items) - # regardless of `limit`, which silently truncates the public - # skills index. The catalog walker follows `nextCursor`. - catalog = self._load_catalog_index() + # Empty query: route through the paginating catalog walker. When + # the full catalog is already disk-cached this returns it whole and + # the caller paginates client-side. On a cold cache, bound the walk + # to `limit` so a browse command renders its first page without + # walking the entire 50k+ catalog (max_items=0 → unbounded, used + # only by the offline index builder via search("", limit=0)). + catalog = self._load_catalog_index(max_items=limit if limit > 0 else 0) if catalog: return self._dedupe_results(catalog)[:limit] if limit > 0 else self._dedupe_results(catalog) @@ -2249,7 +2250,21 @@ class ClawHubSource(SkillSource): _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results]) return results - def _load_catalog_index(self) -> List[SkillMeta]: + def _load_catalog_index(self, max_items: int = 0) -> List[SkillMeta]: + """Walk the ClawHub catalog via cursor pagination. + + ``max_items`` bounds the walk: once at least that many distinct skills + have been gathered the walk stops early. This is what browse's + cold-start fallback wants — it only renders one page, so walking the + entire 50k+ catalog just to slice off the first N is pure waste. + ``max_items=0`` (the default, used by the offline index builder) means + walk to exhaustion. + + Caching: only a *complete* catalog (cursor exhausted or page cap) is + written to the shared ``clawhub_catalog_v1`` cache. A walk truncated by + ``max_items`` OR the wall-clock budget is partial, so caching it would + poison the full-catalog cache with an incomplete slice. + """ cache_key = "clawhub_catalog_v1" cached = _read_index_cache(cache_key) if cached is not None: @@ -2266,6 +2281,7 @@ class ClawHubSource(SkillSource): max_pages = 750 deadline = time.monotonic() + self.CATALOG_WALK_BUDGET_SECONDS hit_deadline = False + hit_max_items = False for _ in range(max_pages): if time.monotonic() > deadline: @@ -2308,10 +2324,18 @@ class ClawHubSource(SkillSource): if not isinstance(cursor, str) or not cursor: break + # Browse's cold-start fallback only renders one page, so stop as + # soon as we have enough to satisfy the caller's bound. The index + # builder passes max_items=0 (unbounded) and walks to exhaustion. + if max_items > 0 and len(results) >= max_items: + hit_max_items = True + break + # Only cache a walk that reached a natural stop (cursor exhausted or - # page cap). A walk truncated by the wall-clock budget is partial, so - # writing it would poison the cache with incomplete catalog data. - if not hit_deadline: + # page cap). A walk truncated by the wall-clock budget OR by max_items + # is partial, so writing it would poison the shared full-catalog cache + # with incomplete data. + if not hit_deadline and not hit_max_items: _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results]) return results