fix(skills): bound ClawHub catalog walk to requested page on cold start (#43395)

Browse renders one page but the cold-cache fallback walked the entire
50k+ ClawHub catalog, then sliced off the first N — pure waste behind the
12s budget band-aid. _load_catalog_index now takes max_items: browse's
empty-query path bounds the walk to its limit and stops early; the offline
index builder still passes limit=0 (unbounded) and walks to exhaustion.
A bounded walk is partial, so it is not written to the shared full-catalog
cache (same poison-guard as the budget-truncated case).
This commit is contained in:
Teknium 2026-06-10 01:01:53 -07:00 committed by GitHub
parent 6a30cfca82
commit eee1da45f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 140 additions and 10 deletions

View file

@ -423,5 +423,111 @@ class TestClawHubSource(unittest.TestCase):
mock_write_cache.assert_called_once()
class TestClawHubCatalogWalkBounded(unittest.TestCase):
"""max_items bounds the walk so browse's cold-start fallback renders one
page without walking the entire 50k+ catalog. The offline index builder
keeps max_items=0 (unbounded) and walks to exhaustion."""
def setUp(self):
self.src = ClawHubSource()
self._safe_patcher = patch("tools.skills_hub.is_safe_url", return_value=True)
self._policy_patcher = patch("tools.skills_hub.check_website_access", return_value=None)
self._safe_patcher.start()
self._policy_patcher.start()
def tearDown(self):
self._policy_patcher.stop()
self._safe_patcher.stop()
def _infinite_pages(self, page_calls):
"""A side_effect that always advertises another cursor — the walk would
never stop on its own, so only max_items / budget can break it."""
def side_effect(url, *args, **kwargs):
if url.endswith("/skills"):
idx = page_calls["n"]
page_calls["n"] += 1
return _MockResponse(
status_code=200,
json_data={
"items": [
{"slug": f"skill-{idx}", "displayName": f"Skill {idx}"}
],
"nextCursor": f"cursor-{idx + 1}",
},
)
return _MockResponse(status_code=404, json_data={})
return side_effect
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch("tools.skills_hub.httpx.get")
def test_max_items_stops_walk_early_and_does_not_cache(
self, mock_get, _mock_read_cache, mock_write_cache
):
"""A bounded walk stops as soon as it has >= max_items skills and must
NOT poison the shared full-catalog cache with the partial slice."""
page_calls = {"n": 0}
mock_get.side_effect = self._infinite_pages(page_calls)
results = self.src._load_catalog_index(max_items=5)
# Each mocked page yields exactly 1 item, so ~5 pages cover the bound.
self.assertGreaterEqual(len(results), 5)
self.assertLess(page_calls["n"], 750, "bounded walk should stop well before the cap")
self.assertLess(page_calls["n"], 20, "should stop within a few pages of the bound")
# Partial (bounded) walk must not be cached.
mock_write_cache.assert_not_called()
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch("tools.skills_hub.httpx.get")
def test_max_items_zero_is_unbounded_and_caches(
self, mock_get, _mock_read_cache, mock_write_cache
):
"""max_items=0 (the index builder's path) walks to natural termination
and DOES cache the complete catalog."""
def side_effect(url, *args, **kwargs):
if url.endswith("/skills"):
return _MockResponse(
status_code=200,
json_data={
"items": [
{"slug": "a", "displayName": "A"},
{"slug": "b", "displayName": "B"},
{"slug": "c", "displayName": "C"},
],
# No nextCursor -> natural termination.
},
)
return _MockResponse(status_code=404, json_data={})
mock_get.side_effect = side_effect
results = self.src._load_catalog_index(max_items=0)
self.assertEqual(len(results), 3)
mock_write_cache.assert_called_once()
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch("tools.skills_hub.httpx.get")
def test_empty_query_browse_bounds_walk_to_limit(
self, mock_get, _mock_read_cache, _mock_write_cache
):
"""search("", limit=N) is the browse cold-start path — it must bound the
catalog walk to N rather than walking the whole 50k+ catalog."""
page_calls = {"n": 0}
mock_get.side_effect = self._infinite_pages(page_calls)
results = self.src.search("", limit=10)
self.assertEqual(len(results), 10, "browse page should be exactly `limit` items")
# Walk stopped near the bound, not at the 750-page cap.
self.assertLess(page_calls["n"], 30)
if __name__ == "__main__":
unittest.main()

View file

@ -2119,12 +2119,13 @@ class ClawHubSource(SkillSource):
if results:
return results
else:
# Empty query: route through the paginating catalog walker so the
# full ClawHub catalog (20k+ skills) lands in the index. The
# single-request listing path below caps at one page (200 items)
# regardless of `limit`, which silently truncates the public
# skills index. The catalog walker follows `nextCursor`.
catalog = self._load_catalog_index()
# Empty query: route through the paginating catalog walker. When
# the full catalog is already disk-cached this returns it whole and
# the caller paginates client-side. On a cold cache, bound the walk
# to `limit` so a browse command renders its first page without
# walking the entire 50k+ catalog (max_items=0 → unbounded, used
# only by the offline index builder via search("", limit=0)).
catalog = self._load_catalog_index(max_items=limit if limit > 0 else 0)
if catalog:
return self._dedupe_results(catalog)[:limit] if limit > 0 else self._dedupe_results(catalog)
@ -2249,7 +2250,21 @@ class ClawHubSource(SkillSource):
_write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results])
return results
def _load_catalog_index(self) -> List[SkillMeta]:
def _load_catalog_index(self, max_items: int = 0) -> List[SkillMeta]:
"""Walk the ClawHub catalog via cursor pagination.
``max_items`` bounds the walk: once at least that many distinct skills
have been gathered the walk stops early. This is what browse's
cold-start fallback wants it only renders one page, so walking the
entire 50k+ catalog just to slice off the first N is pure waste.
``max_items=0`` (the default, used by the offline index builder) means
walk to exhaustion.
Caching: only a *complete* catalog (cursor exhausted or page cap) is
written to the shared ``clawhub_catalog_v1`` cache. A walk truncated by
``max_items`` OR the wall-clock budget is partial, so caching it would
poison the full-catalog cache with an incomplete slice.
"""
cache_key = "clawhub_catalog_v1"
cached = _read_index_cache(cache_key)
if cached is not None:
@ -2266,6 +2281,7 @@ class ClawHubSource(SkillSource):
max_pages = 750
deadline = time.monotonic() + self.CATALOG_WALK_BUDGET_SECONDS
hit_deadline = False
hit_max_items = False
for _ in range(max_pages):
if time.monotonic() > deadline:
@ -2308,10 +2324,18 @@ class ClawHubSource(SkillSource):
if not isinstance(cursor, str) or not cursor:
break
# Browse's cold-start fallback only renders one page, so stop as
# soon as we have enough to satisfy the caller's bound. The index
# builder passes max_items=0 (unbounded) and walks to exhaustion.
if max_items > 0 and len(results) >= max_items:
hit_max_items = True
break
# Only cache a walk that reached a natural stop (cursor exhausted or
# page cap). A walk truncated by the wall-clock budget is partial, so
# writing it would poison the cache with incomplete catalog data.
if not hit_deadline:
# page cap). A walk truncated by the wall-clock budget OR by max_items
# is partial, so writing it would poison the shared full-catalog cache
# with incomplete data.
if not hit_deadline and not hit_max_items:
_write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results])
return results