mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(skills-hub): deduplicate search results by identifier, not name
Browse.sh exposes skills by task name (e.g. "search-listings"), which is shared across hundreds of sites. Deduplicating by name silently dropped every browse-sh skill after the first one with a given task name — e.g. only Airbnb's "search-listings" would survive, collapsing Booking.com, Zillow, and every other site's variant into nothing. Switch unified_search() and do_browse() to use r.identifier as the dedup key. identifier is always globally unique (e.g. "browse-sh/airbnb.com/search-listings-ddgioa"), so same-named skills from different browse-sh hostnames are preserved as distinct results. Update existing TestUnifiedSearchDedup tests to model the real scenario (same identifier appearing from two sources) and add a regression test that asserts browse-sh skills with the same name but different hostnames are never collapsed.
This commit is contained in:
parent
3ce1cf2bb7
commit
fc7e04e9ed
3 changed files with 40 additions and 16 deletions
|
|
@ -319,12 +319,14 @@ def do_browse(page: int = 1, page_size: int = 20, source: str = "all",
|
|||
c.print("[dim]No skills found in the Skills Hub.[/]\n")
|
||||
return
|
||||
|
||||
# Deduplicate by name, preferring higher trust
|
||||
# Deduplicate by identifier, preferring higher trust.
|
||||
# identifier is always unique per skill; name is not (browse-sh skills from different
|
||||
# sites can share the same task name, e.g. "search-listings" on Airbnb and Booking.com).
|
||||
seen: dict = {}
|
||||
for r in all_results:
|
||||
rank = _TRUST_RANK.get(r.trust_level, 0)
|
||||
if r.name not in seen or rank > _TRUST_RANK.get(seen[r.name].trust_level, 0):
|
||||
seen[r.name] = r
|
||||
if r.identifier not in seen or rank > _TRUST_RANK.get(seen[r.identifier].trust_level, 0):
|
||||
seen[r.identifier] = r
|
||||
deduped = list(seen.values())
|
||||
|
||||
# Sort: official first, then by trust level (desc), then alphabetically
|
||||
|
|
|
|||
|
|
@ -1279,10 +1279,11 @@ class TestUnifiedSearchDedup:
|
|||
return src
|
||||
|
||||
def test_dedup_keeps_first_seen(self):
|
||||
# Same identifier from two sources — only the first (community) is kept when equal trust.
|
||||
s1 = SkillMeta(name="skill", description="from A", source="a",
|
||||
identifier="a/skill", trust_level="community")
|
||||
identifier="shared/skill", trust_level="community")
|
||||
s2 = SkillMeta(name="skill", description="from B", source="b",
|
||||
identifier="b/skill", trust_level="community")
|
||||
identifier="shared/skill", trust_level="community")
|
||||
src_a = self._make_source("a", [s1])
|
||||
src_b = self._make_source("b", [s2])
|
||||
results = unified_search("skill", [src_a, src_b])
|
||||
|
|
@ -1290,10 +1291,11 @@ class TestUnifiedSearchDedup:
|
|||
assert results[0].description == "from A"
|
||||
|
||||
def test_dedup_prefers_trusted_over_community(self):
|
||||
# Same identifier — trusted wins over community.
|
||||
community = SkillMeta(name="skill", description="community", source="a",
|
||||
identifier="a/skill", trust_level="community")
|
||||
identifier="shared/skill", trust_level="community")
|
||||
trusted = SkillMeta(name="skill", description="trusted", source="b",
|
||||
identifier="b/skill", trust_level="trusted")
|
||||
identifier="shared/skill", trust_level="trusted")
|
||||
src_a = self._make_source("a", [community])
|
||||
src_b = self._make_source("b", [trusted])
|
||||
results = unified_search("skill", [src_a, src_b])
|
||||
|
|
@ -1303,9 +1305,9 @@ class TestUnifiedSearchDedup:
|
|||
def test_dedup_prefers_builtin_over_trusted(self):
|
||||
"""Regression: builtin must not be overwritten by trusted."""
|
||||
builtin = SkillMeta(name="skill", description="builtin", source="a",
|
||||
identifier="a/skill", trust_level="builtin")
|
||||
identifier="shared/skill", trust_level="builtin")
|
||||
trusted = SkillMeta(name="skill", description="trusted", source="b",
|
||||
identifier="b/skill", trust_level="trusted")
|
||||
identifier="shared/skill", trust_level="trusted")
|
||||
src_a = self._make_source("a", [builtin])
|
||||
src_b = self._make_source("b", [trusted])
|
||||
results = unified_search("skill", [src_a, src_b])
|
||||
|
|
@ -1314,14 +1316,31 @@ class TestUnifiedSearchDedup:
|
|||
|
||||
def test_dedup_trusted_not_overwritten_by_community(self):
|
||||
trusted = SkillMeta(name="skill", description="trusted", source="a",
|
||||
identifier="a/skill", trust_level="trusted")
|
||||
identifier="shared/skill", trust_level="trusted")
|
||||
community = SkillMeta(name="skill", description="community", source="b",
|
||||
identifier="b/skill", trust_level="community")
|
||||
identifier="shared/skill", trust_level="community")
|
||||
src_a = self._make_source("a", [trusted])
|
||||
src_b = self._make_source("b", [community])
|
||||
results = unified_search("skill", [src_a, src_b])
|
||||
assert results[0].trust_level == "trusted"
|
||||
|
||||
def test_browse_sh_same_name_different_site_not_deduped(self):
|
||||
# Browse.sh skills from different hostnames share task names (e.g. "search-listings")
|
||||
# but have unique identifiers. They must NOT be collapsed into one result.
|
||||
airbnb = SkillMeta(
|
||||
name="search-listings", description="Airbnb search", source="browse-sh",
|
||||
identifier="browse-sh/airbnb.com/search-listings-ddgioa", trust_level="community",
|
||||
)
|
||||
booking = SkillMeta(
|
||||
name="search-listings", description="Booking.com search", source="browse-sh",
|
||||
identifier="browse-sh/booking.com/search-listings-xyzab", trust_level="community",
|
||||
)
|
||||
src = self._make_source("browse-sh", [airbnb, booking])
|
||||
results = unified_search("search-listings", [src])
|
||||
assert len(results) == 2, (
|
||||
"browse-sh skills with the same name but different sites must not be deduplicated"
|
||||
)
|
||||
|
||||
def test_source_filter(self):
|
||||
s1 = SkillMeta(name="s1", description="d", source="a",
|
||||
identifier="x", trust_level="community")
|
||||
|
|
|
|||
|
|
@ -3425,14 +3425,17 @@ def unified_search(query: str, sources: List[SkillSource],
|
|||
overall_timeout=30,
|
||||
)
|
||||
|
||||
# Deduplicate by name, preferring higher trust levels
|
||||
# Deduplicate by identifier, preferring higher trust levels.
|
||||
# identifier is always unique per skill (e.g. "browse-sh/airbnb.com/search-listings-ddgioa").
|
||||
# Using name would incorrectly collapse browse-sh skills from different sites that share
|
||||
# the same task name (e.g. "search-listings" from Airbnb and Booking.com).
|
||||
_TRUST_RANK = {"builtin": 2, "trusted": 1, "community": 0}
|
||||
seen: Dict[str, SkillMeta] = {}
|
||||
for r in all_results:
|
||||
if r.name not in seen:
|
||||
seen[r.name] = r
|
||||
elif _TRUST_RANK.get(r.trust_level, 0) > _TRUST_RANK.get(seen[r.name].trust_level, 0):
|
||||
seen[r.name] = r
|
||||
if r.identifier not in seen:
|
||||
seen[r.identifier] = r
|
||||
elif _TRUST_RANK.get(r.trust_level, 0) > _TRUST_RANK.get(seen[r.identifier].trust_level, 0):
|
||||
seen[r.identifier] = r
|
||||
deduped = list(seen.values())
|
||||
|
||||
return deduped[:limit]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue