mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(skills): pull full ClawHub catalog into the skills index (200 → 20k+) (#33748)
* fix(skills): pull full ClawHub catalog into the skills index
The website was showing 200 ClawHub skills out of 20k+ because
`ClawHubSource.search("")` for empty queries went straight to a single
unpaginated request. ClawHub's API caps any single page at 200 items and
returns a `nextCursor`; we grabbed page 1 and stopped, so the cached
index served from hermes-agent.nousresearch.com had a silent 99%
truncation.
End users never hit clawhub.ai directly (the index is rebuilt twice
daily by .github/workflows/skills-index.yml and served as a static JSON
on the docs site), so the cap-and-cache architecture is correct — it
just wasn't being filled.
Changes:
- `ClawHubSource.search(query="")` now routes through the existing
`_load_catalog_index()` paginating walker instead of the unpaginated
listing fallback (non-empty queries still hit the fast catalog search).
- `_load_catalog_index()` max_pages 50 → 250 (50k-skill ceiling; live
catalog is ~20k as of May 2026, with headroom for growth).
- `build_skills_index.py`: per-source crawl limits split out — ClawHub
and LobeHub get 100k, others keep their effective caps.
- `EXPECTED_FLOORS["clawhub"]` 50 → 5000 so the next pagination
regression hard-fails the CI build instead of silently shipping a
degenerate index.
Test plan:
- New unit test `test_search_empty_query_paginates_full_catalog`
exercises the cursor-following path with three mocked pages (450
total items) and asserts all pages are walked.
- Existing 9 ClawHub tests + 127 broader skills_hub tests all pass.
- E2E against live ClawHub API: walker reached 9700+ skills across 49
pages before this commit landed, paginating well past the previous
50-page cap.
* fix(skills): raise ClawHub ceilings — live catalog is 50k, not 20k
An E2E walk against the live ClawHub API hit my initial 250-page cap at 49,698
skills with a `nextCursor` still pending. The catalog is roughly 2.5x larger
than the docstring estimate.
- max_pages 250 → 750 (150k ceiling, walks terminate on cursor=None
well before this in practice)
- SOURCE_LIMITS['clawhub'] 100k → 200k
- EXPECTED_FLOORS['clawhub'] 5000 → 20000
This commit is contained in:
parent
09a5cd8084
commit
fb9f3a4ef9
3 changed files with 93 additions and 5 deletions
|
|
@@ -298,6 +298,58 @@ class TestClawHubSource(unittest.TestCase):
|
|||
self.assertIsNone(bundle)
|
||||
self.assertEqual(mock_get.call_count, 3)
|
||||
|
||||
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch("tools.skills_hub.httpx.get")
def test_search_empty_query_paginates_full_catalog(
    self, mock_get, _mock_read_cache, _mock_write_cache
):
    """An empty-query search must follow ``nextCursor`` through every page.

    Guards against the silent 200-skill truncation: ClawHub's listing
    endpoint returns at most 200 items per page together with a
    ``nextCursor``. The build_skills_index.py crawler dumps the catalog by
    calling ``search("", limit=N)`` with a large N; before the fix that
    issued one unpaginated request and lost 99% of the catalog.
    """

    def make_page(letter, count, next_cursor):
        # One listing response: ``count`` skills plus the follow-up cursor.
        return {
            "items": [
                {"slug": f"{letter}-skill-{i}", "displayName": f"{letter.upper()} {i}"}
                for i in range(count)
            ],
            "nextCursor": next_cursor,
        }

    # 200 + 200 + 50 items; the last page carries no cursor, so the walk ends.
    catalog_pages = [
        make_page("a", 200, "cursor-page-2"),
        make_page("b", 200, "cursor-page-3"),
        make_page("c", 50, None),
    ]
    listing_requests = 0

    def fake_get(url, *args, **kwargs):
        nonlocal listing_requests
        # Anything other than the listing endpoint is reported as missing.
        if not url.endswith("/skills"):
            return _MockResponse(status_code=404, json_data={})
        position = listing_requests
        listing_requests += 1
        # Requests beyond the scripted pages get an empty listing.
        if position < len(catalog_pages):
            payload = catalog_pages[position]
        else:
            payload = {"items": []}
        return _MockResponse(status_code=200, json_data=payload)

    mock_get.side_effect = fake_get

    results = self.src.search("", limit=10_000)

    # All 450 unique skills must come back, using exactly three listing calls.
    self.assertEqual(len(results), 450)
    self.assertEqual(listing_requests, 3, "expected exactly 3 cursor-paginated pages")
    found = {meta.identifier for meta in results}
    # Spot-check the first, middle and last page.
    for expected in ("a-skill-0", "b-skill-199", "c-skill-49"):
        self.assertIn(expected, found)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue