mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-02 07:11:49 +00:00
fix(skills): pull full skills.sh catalog via sitemap (858 → 19,932) (#34025)
The skills.sh source was returning ~858 unique skills from a hardcoded
list of 28 popular keyword searches (each capped at 50 results). The
real catalog is ~20k — exposed via sitemap-skills-{1,2}.xml linked from
the site's sitemap index.
Switch the empty-query path in SkillsShSource.search() to walk the
sitemap instead of scraping the homepage's curated featured strip.
Falls back to the homepage scrape if the sitemap is unreachable or yields no skill URLs.
build_skills_index.crawl_skills_sh() now just calls search("", limit=0)
instead of running 28 keyword searches — the full catalog from one sitemap
walk (the index plus each per-skill sitemap) instead of 28 capped searches.
Also work around an httpx + brotlicffi interaction: the per-skill sitemaps
are ~900 KB brotli-compressed and the cffi backend's streaming decode
chokes on them. Forcing Accept-Encoding to gzip (i.e. not advertising "br")
dodges the bug without requiring a brotli library upgrade.
E2E against live skills.sh: 19,932 unique skills walked in 0.7s.
Tests: 137 pass (+1 new regression test exercising the sitemap path).
Floor for skills.sh raised 100 → 10,000 in EXPECTED_FLOORS so a future
regression hard-fails the build.
This commit is contained in:
parent
b1d3ead7fb
commit
7050c052e3
3 changed files with 189 additions and 22 deletions
|
|
@ -80,30 +80,27 @@ def crawl_source(source, source_name: str, limit: int) -> list:
|
|||
|
||||
|
||||
def crawl_skills_sh(source: SkillsShSource) -> list:
|
||||
"""Crawl skills.sh using popular queries for broad coverage."""
|
||||
print(" Crawling skills.sh (popular queries)...", flush=True)
|
||||
"""Crawl skills.sh via its sitemap to enumerate the full catalog (~20k entries).
|
||||
|
||||
Previously walked a hardcoded list of ~28 popular keywords (each capped at
|
||||
50 results) which yielded ~850 unique skills — about 4% of the real catalog.
|
||||
The SkillsShSource.search("") path now hits the sitemap directly, returning
|
||||
the full 20k-entry catalog deduplicated by canonical identifier.
|
||||
"""
|
||||
print(" Crawling skills.sh (sitemap)...", flush=True)
|
||||
start = time.time()
|
||||
|
||||
queries = [
|
||||
"", # featured
|
||||
"react", "python", "web", "api", "database", "docker",
|
||||
"testing", "scraping", "design", "typescript", "git",
|
||||
"aws", "security", "data", "ml", "ai", "devops",
|
||||
"frontend", "backend", "mobile", "cli", "documentation",
|
||||
"kubernetes", "terraform", "rust", "go", "java",
|
||||
]
|
||||
try:
|
||||
results = source.search("", limit=0) # 0 = no cap, return the whole catalog
|
||||
except Exception as e:
|
||||
print(f" Warning: skills.sh sitemap walk failed: {e}", file=sys.stderr)
|
||||
results = []
|
||||
|
||||
all_skills: dict[str, dict] = {}
|
||||
for query in queries:
|
||||
try:
|
||||
results = source.search(query, limit=50)
|
||||
for meta in results:
|
||||
entry = _meta_to_dict(meta)
|
||||
if entry["identifier"] not in all_skills:
|
||||
all_skills[entry["identifier"]] = entry
|
||||
except Exception as e:
|
||||
print(f" Warning: skills.sh search '{query}' failed: {e}",
|
||||
file=sys.stderr)
|
||||
for meta in results:
|
||||
entry = _meta_to_dict(meta)
|
||||
if entry["identifier"] not in all_skills:
|
||||
all_skills[entry["identifier"]] = entry
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f" skills.sh: {len(all_skills)} unique skills ({elapsed:.1f}s)",
|
||||
|
|
@ -345,7 +342,11 @@ def main():
|
|||
# or rate limiting kicked in. Failing here forces a human look before
|
||||
# the broken index reaches the live docs.
|
||||
EXPECTED_FLOORS = {
|
||||
"skills.sh": 100,
|
||||
# skills.sh now uses the sitemap walker (~20k catalog as of May 2026).
|
||||
# Anything under 10k means the sitemap shape changed or fetches failed
|
||||
# — better to fail loudly than ship a regression to the 858-skill
|
||||
# popular-queries era.
|
||||
"skills.sh": 10000,
|
||||
"lobehub": 100,
|
||||
# ClawHub had 49,698+ skills as of May 2026 — anything under 20k means
|
||||
# pagination broke or the API surface changed. Fail loudly rather
|
||||
|
|
|
|||
|
|
@ -472,6 +472,68 @@ class TestSkillsShSource:
|
|||
requested_urls = [call.args[0] for call in mock_get.call_args_list]
|
||||
assert root_url not in requested_urls
|
||||
|
||||
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch("tools.skills_hub.httpx.get")
def test_empty_query_walks_sitemap_not_homepage(
    self, mock_get, _mock_read_cache, _mock_write_cache,
):
    """Empty query must walk the full sitemap.

    Regression for skills.sh shipping ~858/20000 skills: the previous
    empty-query path scraped the homepage's featured strip (~200 entries),
    and build_skills_index.py supplemented it with 28 popular keyword
    searches to drag the count to ~850. The sitemap walker hits the
    full ~20k catalog in one pass.

    Besides the de-duplicated result set, this also pins three details of
    the walk: non-skill sitemaps listed in the index are not fetched,
    no request advertises brotli, and the catalog is written to the
    index cache.
    """
    index_xml = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://www.skills.sh/sitemap-misc.xml</loc></sitemap>
<sitemap><loc>https://www.skills.sh/sitemap-skills-1.xml</loc></sitemap>
<sitemap><loc>https://www.skills.sh/sitemap-skills-2.xml</loc></sitemap>
</sitemapindex>"""
    skills_1_xml = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://www.skills.sh/anthropics/skills/frontend-design</loc></url>
<url><loc>https://www.skills.sh/anthropics/skills/pdf</loc></url>
<url><loc>https://www.skills.sh/vercel-labs/agent-skills/react-best-practices</loc></url>
</urlset>"""
    skills_2_xml = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://www.skills.sh/microsoft/azure-skills/azure-ai</loc></url>
<url><loc>https://www.skills.sh/anthropics/skills/frontend-design</loc></url>
</urlset>"""

    def side_effect(url, *args, **kwargs):
        # Serve the three sitemap documents; anything else is a 404.
        resp = MagicMock(status_code=200)
        if url.endswith("/sitemap.xml"):
            resp.text = index_xml
        elif "sitemap-skills-1" in url:
            resp.text = skills_1_xml
        elif "sitemap-skills-2" in url:
            resp.text = skills_2_xml
        else:
            resp.status_code = 404
            resp.text = ""
        return resp

    mock_get.side_effect = side_effect

    results = self._source().search("", limit=0)

    # 4 unique skills (the frontend-design dup across sitemaps collapsed).
    assert len(results) == 4
    identifiers = {r.identifier for r in results}
    assert identifiers == {
        "skills-sh/anthropics/skills/frontend-design",
        "skills-sh/anthropics/skills/pdf",
        "skills-sh/vercel-labs/agent-skills/react-best-practices",
        "skills-sh/microsoft/azure-skills/azure-ai",
    }
    # Homepage was NOT fetched — the sitemap path is taken on empty query.
    urls_called = [call.args[0] for call in mock_get.call_args_list]
    assert not any(u == "https://skills.sh" or u == "https://skills.sh/" for u in urls_called)
    # Only the per-skill sitemaps are followed; sitemap-misc.xml is skipped.
    assert not any("sitemap-misc" in u for u in urls_called)
    # Every request must opt out of brotli. A missing header counts as a
    # failure because httpx would then advertise "br" by default.
    for call in mock_get.call_args_list:
        accept = (call.kwargs.get("headers") or {}).get("Accept-Encoding", "br")
        assert "br" not in [token.strip() for token in accept.split(",")]
    # A fully successful walk is persisted under the sitemap cache key.
    assert any(
        call.args and call.args[0] == "skills_sh_sitemap_v1"
        for call in _mock_write_cache.call_args_list
    )
|
||||
|
||||
|
||||
class TestFindSkillInRepoTree:
|
||||
"""Tests for GitHubSource._find_skill_in_repo_tree."""
|
||||
|
|
|
|||
|
|
@ -1217,6 +1217,16 @@ class SkillsShSource(SkillSource):
|
|||
|
||||
BASE_URL = "https://skills.sh"
|
||||
SEARCH_URL = f"{BASE_URL}/api/search"
|
||||
# Sitemap index — the real catalog source. The homepage scrape only
|
||||
# exposes a curated featured strip (~200 entries); the sitemap covers
|
||||
# the full ~20k+ catalog. https://www.skills.sh/sitemap.xml points at
|
||||
# sitemap-skills-1.xml + sitemap-skills-2.xml, each up to 10k URLs.
|
||||
SITEMAP_INDEX_URL = "https://www.skills.sh/sitemap.xml"
|
||||
_SITEMAP_LOC_RE = re.compile(r"<loc>([^<]+)</loc>", re.IGNORECASE)
|
||||
_SITEMAP_SKILL_RE = re.compile(
|
||||
r"^https?://(?:www\.)?skills\.sh/(?P<owner>[^/]+)/(?P<repo>[^/]+)/(?P<skill>[^/]+)/?$",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_SKILL_LINK_RE = re.compile(r'href=["\']/(?P<id>(?!agents/|_next/|api/)[^"\'/]+/[^"\'/]+/[^"\'/]+)["\']')
|
||||
_INSTALL_CMD_RE = re.compile(
|
||||
r'npx\s+skills\s+add\s+(?P<repo>https?://github\.com/[^\s<]+|[^\s<]+)'
|
||||
|
|
@ -1246,7 +1256,10 @@ class SkillsShSource(SkillSource):
|
|||
|
||||
def search(self, query: str, limit: int = 10) -> List[SkillMeta]:
|
||||
if not query.strip():
|
||||
return self._featured_skills(limit)
|
||||
# Empty query = bulk catalog dump (what build_skills_index.py
|
||||
# calls with). The homepage scrape only sees ~200 featured
|
||||
# entries; the sitemap walks the full ~20k+ catalog.
|
||||
return self._sitemap_catalog(limit)
|
||||
|
||||
cache_key = f"skills_sh_search_{hashlib.md5(f'{query}|{limit}'.encode()).hexdigest()}"
|
||||
cached = _read_index_cache(cache_key)
|
||||
|
|
@ -1307,6 +1320,97 @@ class SkillsShSource(SkillSource):
|
|||
return self._finalize_inspect_meta(meta, canonical, detail)
|
||||
return None
|
||||
|
||||
def _sitemap_catalog(self, limit: int) -> List[SkillMeta]:
    """Walk the skills.sh sitemap to enumerate the full catalog.

    Fetches ``SITEMAP_INDEX_URL``, follows every ``<loc>`` whose URL
    contains ``sitemap-skills``, and converts each
    ``skills.sh/<owner>/<repo>/<skill>`` URL into a ``SkillMeta``.
    Entries are de-duplicated by their ``owner/repo/skill`` path.

    The catalog is read from and written to the index cache under the key
    ``skills_sh_sitemap_v1``. It is only written when *every* per-skill
    sitemap was fetched successfully, so a transient failure on one
    sitemap cannot pin a partial catalog in the cache.

    Args:
        limit: Maximum number of entries to return. ``0`` (or any
            non-positive value) returns the whole catalog.

    Returns:
        The catalog entries, truncated to ``limit`` when it is positive.
        Falls back to ``self._featured_skills(limit)`` when the sitemap
        index cannot be fetched (non-200 status or ``httpx.HTTPError``),
        lists no per-skill sitemaps, or the walk yields no skill URLs.
    """
    cache_key = "skills_sh_sitemap_v1"
    cached = _read_index_cache(cache_key)
    if cached is not None:
        metas = [SkillMeta(**item) for item in cached]
        return metas[:limit] if limit > 0 else metas

    # skills.sh serves the per-skill sitemaps brotli-compressed, and
    # httpx's optional brotlicffi backend has a streaming-decode bug
    # that fails on these specific payloads. Excluding "br" from
    # Accept-Encoding makes the server fall back to gzip (or
    # identity), which works on every httpx install.
    sitemap_headers = {"Accept-Encoding": "gzip"}

    # Step 1: fetch the sitemap index → list of skill-sitemap URLs.
    skill_sitemap_urls: List[str] = []
    try:
        resp = httpx.get(
            self.SITEMAP_INDEX_URL,
            timeout=20,
            follow_redirects=True,
            headers=sitemap_headers,
        )
        if resp.status_code != 200:
            return self._featured_skills(limit)
        for match in self._SITEMAP_LOC_RE.finditer(resp.text):
            loc = match.group(1).strip()
            # Sitemap index entries that point at the per-skill maps.
            if "sitemap-skills" in loc:
                skill_sitemap_urls.append(loc)
    except httpx.HTTPError:
        return self._featured_skills(limit)

    if not skill_sitemap_urls:
        return self._featured_skills(limit)

    # Step 2: fetch each skill sitemap and collect canonical "owner/repo/skill" IDs.
    seen: set[str] = set()
    results: List[SkillMeta] = []
    # Tracks whether every per-skill sitemap was retrieved; a partial walk
    # is still returned to the caller but must not be cached.
    all_fetched = True
    for sitemap_url in skill_sitemap_urls:
        try:
            resp = httpx.get(
                sitemap_url,
                timeout=30,
                follow_redirects=True,
                headers=sitemap_headers,
            )
        except httpx.HTTPError:
            all_fetched = False
            continue
        if resp.status_code != 200:
            all_fetched = False
            continue
        for loc_match in self._SITEMAP_LOC_RE.finditer(resp.text):
            url = loc_match.group(1).strip()
            m = self._SITEMAP_SKILL_RE.match(url)
            if not m:
                # Not an owner/repo/skill page URL.
                continue
            owner = m.group("owner")
            repo_name = m.group("repo")
            skill_name = m.group("skill")
            canonical = f"{owner}/{repo_name}/{skill_name}"
            if canonical in seen:
                continue
            seen.add(canonical)
            repo = f"{owner}/{repo_name}"
            results.append(SkillMeta(
                name=skill_name,
                description=f"Indexed by skills.sh from {repo}",
                source="skills.sh",
                identifier=self._wrap_identifier(canonical),
                trust_level=self.github.trust_level_for(canonical),
                repo=repo,
                path=skill_name,
                extra={
                    "detail_url": f"{self.BASE_URL}/{canonical}",
                    "repo_url": f"https://github.com/{repo}",
                },
            ))

    if not results:
        return self._featured_skills(limit)

    # Only persist a complete walk: caching a catalog that is missing a
    # whole sitemap would serve the truncated list on every later call
    # until the cache entry expires.
    if all_fetched:
        _write_index_cache(cache_key, [_skill_meta_to_dict(item) for item in results])
    return results[:limit] if limit > 0 else results
|
||||
|
||||
def _featured_skills(self, limit: int) -> List[SkillMeta]:
|
||||
cache_key = "skills_sh_featured"
|
||||
cached = _read_index_cache(cache_key)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue