mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 01:21:43 +00:00
fix(session_search): supplement FTS5 with LIKE for CJK partial results (#14829)
FTS5's unicode61 tokenizer drops certain CJK characters, causing queries like '昨晚' to return only 12.5% of actual matches. The existing LIKE fallback triggers only when FTS5 returns zero results, missing the common case where FTS5 returns *some* but not all matches.

Change the LIKE path from a fallback (only on empty results) to a supplement (always runs for CJK queries). Results are merged with deduplication by message id, so FTS5 results are preserved and LIKE fills in what FTS5 missed.

- Always run LIKE for CJK queries, not just on zero FTS5 results
- Deduplicate merged results by message id
- Add regression test for partial-result supplementation
- Add deduplication correctness test
This commit is contained in:
parent
05d8f11085
commit
7859a09ce9
2 changed files with 38 additions and 4 deletions
|
|
@@ -716,6 +716,33 @@ class TestCJKSearchFallback:
|
|||
results = db.search_messages("Agent通信")
|
||||
assert len(results) == 1
|
||||
|
||||
def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
    """CJK queries must merge LIKE matches into partial FTS5 results.

    Regression test for #14829. The FTS5 unicode61 tokenizer can drop
    certain CJK characters, so a multi-character query may surface only
    a subset of the matching messages through FTS5 alone. The LIKE path
    is expected to run for every CJK query and have its hits merged with
    whatever FTS5 returned.
    """
    for session_id, source in (("s1", "cli"), ("s2", "telegram")):
        db.create_session(session_id=session_id, source=source)

    # Two messages in different sessions share the substring "昨晚".
    # Depending on tokenizer quirks, FTS5 may index only one of them.
    for session_id, content in (
        ("s1", "昨晚讨论了记忆系统"),
        ("s2", "昨晚的会议纪要已发送"),
    ):
        db.append_message(session_id, role="user", content=content)

    results = db.search_messages("昨晚")

    # Both messages must come back exactly once each: LIKE covers what
    # FTS5 misses (even if FTS5 finds nothing), and deduplication keeps
    # a message found by both paths from being counted twice.
    assert len(results) == 2
    assert {r["session_id"] for r in results} == {"s1", "s2"}
|
||||
|
||||
def test_cjk_like_dedup_no_duplicates(self, db):
    """A message matched by both FTS5 and LIKE is returned only once."""
    session_id = "s1"
    db.create_session(session_id=session_id, source="cli")
    db.append_message(session_id, role="user", content="测试去重逻辑")

    matches = db.search_messages("测试")

    # Only one message exists, so any count above one means the merge
    # step failed to deduplicate.
    assert len(matches) == 1
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Session search and listing
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue