From fc7e04e9eddb6e50204e2fee0f91847fa05d6821 Mon Sep 17 00:00:00 2001 From: EloquentBrush0x <283442588+EloquentBrush0x@users.noreply.github.com> Date: Wed, 20 May 2026 22:28:39 +0300 Subject: [PATCH] fix(skills-hub): deduplicate search results by identifier, not name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Browse.sh exposes skills by task name (e.g. "search-listings"), which is shared across hundreds of sites. Deduplicating by name silently dropped every browse-sh skill after the first one with a given task name — e.g. only Airbnb's "search-listings" would survive, collapsing Booking.com, Zillow, and every other site's variant into nothing. Switch unified_search() and do_browse() to use r.identifier as the dedup key. identifier is always globally unique (e.g. "browse-sh/airbnb.com/search-listings-ddgioa"), so same-named skills from different browse-sh hostnames are preserved as distinct results. Update existing TestUnifiedSearchDedup tests to model the real scenario (same identifier appearing from two sources) and add a regression test that asserts browse-sh skills with the same name but different hostnames are never collapsed. --- hermes_cli/skills_hub.py | 8 +++++--- tests/tools/test_skills_hub.py | 35 ++++++++++++++++++++++++++-------- tools/skills_hub.py | 13 ++++++++----- 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/hermes_cli/skills_hub.py b/hermes_cli/skills_hub.py index 116dedb1c08..01f19ba3b21 100644 --- a/hermes_cli/skills_hub.py +++ b/hermes_cli/skills_hub.py @@ -319,12 +319,14 @@ def do_browse(page: int = 1, page_size: int = 20, source: str = "all", c.print("[dim]No skills found in the Skills Hub.[/]\n") return - # Deduplicate by name, preferring higher trust + # Deduplicate by identifier, preferring higher trust. + # identifier is always unique per skill; name is not (browse-sh skills from different + # sites can share the same task name, e.g. "search-listings" on Airbnb and Booking.com). seen: dict = {} for r in all_results: rank = _TRUST_RANK.get(r.trust_level, 0) - if r.name not in seen or rank > _TRUST_RANK.get(seen[r.name].trust_level, 0): - seen[r.name] = r + if r.identifier not in seen or rank > _TRUST_RANK.get(seen[r.identifier].trust_level, 0): + seen[r.identifier] = r deduped = list(seen.values()) # Sort: official first, then by trust level (desc), then alphabetically diff --git a/tests/tools/test_skills_hub.py b/tests/tools/test_skills_hub.py index e831b50943e..dc68aca1d33 100644 --- a/tests/tools/test_skills_hub.py +++ b/tests/tools/test_skills_hub.py @@ -1279,10 +1279,11 @@ class TestUnifiedSearchDedup: return src def test_dedup_keeps_first_seen(self): + # Same identifier from two sources — only the first (community) is kept when equal trust. s1 = SkillMeta(name="skill", description="from A", source="a", - identifier="a/skill", trust_level="community") + identifier="shared/skill", trust_level="community") s2 = SkillMeta(name="skill", description="from B", source="b", - identifier="b/skill", trust_level="community") + identifier="shared/skill", trust_level="community") src_a = self._make_source("a", [s1]) src_b = self._make_source("b", [s2]) results = unified_search("skill", [src_a, src_b]) @@ -1290,10 +1291,11 @@ class TestUnifiedSearchDedup: assert results[0].description == "from A" def test_dedup_prefers_trusted_over_community(self): + # Same identifier — trusted wins over community. community = SkillMeta(name="skill", description="community", source="a", - identifier="a/skill", trust_level="community") + identifier="shared/skill", trust_level="community") trusted = SkillMeta(name="skill", description="trusted", source="b", - identifier="b/skill", trust_level="trusted") + identifier="shared/skill", trust_level="trusted") src_a = self._make_source("a", [community]) src_b = self._make_source("b", [trusted]) results = unified_search("skill", [src_a, src_b]) @@ -1303,9 +1305,9 @@ class TestUnifiedSearchDedup: def test_dedup_prefers_builtin_over_trusted(self): """Regression: builtin must not be overwritten by trusted.""" builtin = SkillMeta(name="skill", description="builtin", source="a", - identifier="a/skill", trust_level="builtin") + identifier="shared/skill", trust_level="builtin") trusted = SkillMeta(name="skill", description="trusted", source="b", - identifier="b/skill", trust_level="trusted") + identifier="shared/skill", trust_level="trusted") src_a = self._make_source("a", [builtin]) src_b = self._make_source("b", [trusted]) results = unified_search("skill", [src_a, src_b]) @@ -1314,14 +1316,31 @@ class TestUnifiedSearchDedup: def test_dedup_trusted_not_overwritten_by_community(self): trusted = SkillMeta(name="skill", description="trusted", source="a", - identifier="a/skill", trust_level="trusted") + identifier="shared/skill", trust_level="trusted") community = SkillMeta(name="skill", description="community", source="b", - identifier="b/skill", trust_level="community") + identifier="shared/skill", trust_level="community") src_a = self._make_source("a", [trusted]) src_b = self._make_source("b", [community]) results = unified_search("skill", [src_a, src_b]) assert results[0].trust_level == "trusted" + def test_browse_sh_same_name_different_site_not_deduped(self): + # Browse.sh skills from different hostnames share task names (e.g. "search-listings") + # but have unique identifiers. They must NOT be collapsed into one result. + airbnb = SkillMeta( + name="search-listings", description="Airbnb search", source="browse-sh", + identifier="browse-sh/airbnb.com/search-listings-ddgioa", trust_level="community", + ) + booking = SkillMeta( + name="search-listings", description="Booking.com search", source="browse-sh", + identifier="browse-sh/booking.com/search-listings-xyzab", trust_level="community", + ) + src = self._make_source("browse-sh", [airbnb, booking]) + results = unified_search("search-listings", [src]) + assert len(results) == 2, ( + "browse-sh skills with the same name but different sites must not be deduplicated" + ) + def test_source_filter(self): s1 = SkillMeta(name="s1", description="d", source="a", identifier="x", trust_level="community") diff --git a/tools/skills_hub.py b/tools/skills_hub.py index 7725c745de4..9e808b09277 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -3425,14 +3425,17 @@ def unified_search(query: str, sources: List[SkillSource], overall_timeout=30, ) - # Deduplicate by name, preferring higher trust levels + # Deduplicate by identifier, preferring higher trust levels. + # identifier is always unique per skill (e.g. "browse-sh/airbnb.com/search-listings-ddgioa"). + # Using name would incorrectly collapse browse-sh skills from different sites that share + # the same task name (e.g. "search-listings" from Airbnb and Booking.com). _TRUST_RANK = {"builtin": 2, "trusted": 1, "community": 0} seen: Dict[str, SkillMeta] = {} for r in all_results: - if r.name not in seen: - seen[r.name] = r - elif _TRUST_RANK.get(r.trust_level, 0) > _TRUST_RANK.get(seen[r.name].trust_level, 0): - seen[r.name] = r + if r.identifier not in seen: + seen[r.identifier] = r + elif _TRUST_RANK.get(r.trust_level, 0) > _TRUST_RANK.get(seen[r.identifier].trust_level, 0): + seen[r.identifier] = r deduped = list(seen.values()) return deduped[:limit]