fix(skills-hub): deduplicate search results by identifier, not name

Browse.sh exposes skills by task name (e.g. "search-listings"), which is
shared across hundreds of sites. Deduplicating by name silently dropped
every browse-sh skill after the first one with a given task name — e.g.
only Airbnb's "search-listings" would survive, collapsing Booking.com,
Zillow, and every other site's variant into nothing.

Switch unified_search() and do_browse() to use r.identifier as the dedup
key. identifier is always globally unique (e.g.
"browse-sh/airbnb.com/search-listings-ddgioa"), so same-named skills from
different browse-sh hostnames are preserved as distinct results.

Update existing TestUnifiedSearchDedup tests to model the real scenario
(same identifier appearing from two sources) and add a regression test
that asserts browse-sh skills with the same name but different hostnames
are never collapsed.
This commit is contained in:
EloquentBrush0x 2026-05-20 22:28:39 +03:00 committed by Teknium
parent 3ce1cf2bb7
commit fc7e04e9ed
3 changed files with 40 additions and 16 deletions

View file

@ -1279,10 +1279,11 @@ class TestUnifiedSearchDedup:
return src
def test_dedup_keeps_first_seen(self):
# Same identifier from two sources — only the first (community) is kept when equal trust.
s1 = SkillMeta(name="skill", description="from A", source="a",
identifier="a/skill", trust_level="community")
identifier="shared/skill", trust_level="community")
s2 = SkillMeta(name="skill", description="from B", source="b",
identifier="b/skill", trust_level="community")
identifier="shared/skill", trust_level="community")
src_a = self._make_source("a", [s1])
src_b = self._make_source("b", [s2])
results = unified_search("skill", [src_a, src_b])
@ -1290,10 +1291,11 @@ class TestUnifiedSearchDedup:
assert results[0].description == "from A"
def test_dedup_prefers_trusted_over_community(self):
# Same identifier — trusted wins over community.
community = SkillMeta(name="skill", description="community", source="a",
identifier="a/skill", trust_level="community")
identifier="shared/skill", trust_level="community")
trusted = SkillMeta(name="skill", description="trusted", source="b",
identifier="b/skill", trust_level="trusted")
identifier="shared/skill", trust_level="trusted")
src_a = self._make_source("a", [community])
src_b = self._make_source("b", [trusted])
results = unified_search("skill", [src_a, src_b])
@ -1303,9 +1305,9 @@ class TestUnifiedSearchDedup:
def test_dedup_prefers_builtin_over_trusted(self):
"""Regression: builtin must not be overwritten by trusted."""
builtin = SkillMeta(name="skill", description="builtin", source="a",
identifier="a/skill", trust_level="builtin")
identifier="shared/skill", trust_level="builtin")
trusted = SkillMeta(name="skill", description="trusted", source="b",
identifier="b/skill", trust_level="trusted")
identifier="shared/skill", trust_level="trusted")
src_a = self._make_source("a", [builtin])
src_b = self._make_source("b", [trusted])
results = unified_search("skill", [src_a, src_b])
@ -1314,14 +1316,31 @@ class TestUnifiedSearchDedup:
def test_dedup_trusted_not_overwritten_by_community(self):
trusted = SkillMeta(name="skill", description="trusted", source="a",
identifier="a/skill", trust_level="trusted")
identifier="shared/skill", trust_level="trusted")
community = SkillMeta(name="skill", description="community", source="b",
identifier="b/skill", trust_level="community")
identifier="shared/skill", trust_level="community")
src_a = self._make_source("a", [trusted])
src_b = self._make_source("b", [community])
results = unified_search("skill", [src_a, src_b])
assert results[0].trust_level == "trusted"
def test_browse_sh_same_name_different_site_not_deduped(self):
# Browse.sh skills from different hostnames share task names (e.g. "search-listings")
# but have unique identifiers. They must NOT be collapsed into one result.
airbnb = SkillMeta(
name="search-listings", description="Airbnb search", source="browse-sh",
identifier="browse-sh/airbnb.com/search-listings-ddgioa", trust_level="community",
)
booking = SkillMeta(
name="search-listings", description="Booking.com search", source="browse-sh",
identifier="browse-sh/booking.com/search-listings-xyzab", trust_level="community",
)
src = self._make_source("browse-sh", [airbnb, booking])
results = unified_search("search-listings", [src])
assert len(results) == 2, (
"browse-sh skills with the same name but different sites must not be deduplicated"
)
def test_source_filter(self):
s1 = SkillMeta(name="s1", description="d", source="a",
identifier="x", trust_level="community")