fix(session_search): supplement FTS5 with LIKE for CJK partial results (#14829)

The FTS5 unicode61 tokenizer drops certain CJK characters, causing queries
like '昨晚' to return only 12.5% of the actual matches. The existing LIKE
fallback triggers only when FTS5 returns zero results, so it misses the
common case where FTS5 returns *some* but not all matches.

Change the LIKE path from a fallback (only on empty results) to a
supplement (always runs for CJK queries). Results are merged with
deduplication by message id, so FTS5 results are preserved and LIKE
fills in what FTS5 missed.

- Always run LIKE for CJK queries, not just on zero FTS5 results
- Deduplicate merged results by message id
- Add regression test for partial-result supplementation
- Add deduplication correctness test
This commit is contained in:
kagura-agent 2026-04-24 09:18:31 +08:00
parent 05d8f11085
commit 7859a09ce9
No known key found for this signature in database
GPG key ID: E1BB09FC90A26721
2 changed files with 38 additions and 4 deletions

View file

@ -716,6 +716,33 @@ class TestCJKSearchFallback:
results = db.search_messages("Agent通信")
assert len(results) == 1
def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
    """Check that a CJK search returns every message containing the query.

    Regression test for #14829. Per that report, the FTS5 unicode61
    tokenizer can drop certain CJK characters, so a multi-character query
    may come back with only part of the matches. The LIKE path is expected
    to run for CJK queries and have its rows merged into the result.
    """
    sessions = (("s1", "cli"), ("s2", "telegram"))
    for session_id, source in sessions:
        db.create_session(session_id=session_id, source=source)

    # Two messages in different sessions share the substring "昨晚";
    # FTS5 alone may surface only one of them (or neither).
    contents = (("s1", "昨晚讨论了记忆系统"), ("s2", "昨晚的会议纪要已发送"))
    for session_id, content in contents:
        db.append_message(session_id, role="user", content=content)

    found = db.search_messages("昨晚")

    # Exactly one hit per message: both are found and neither is repeated.
    assert len(found) == 2
    assert {row["session_id"] for row in found} == {"s1", "s2"}
def test_cjk_like_dedup_no_duplicates(self, db):
    """A message matched by both FTS5 and LIKE must be returned only once."""
    session_id = "s1"
    db.create_session(session_id=session_id, source="cli")
    db.append_message(session_id, role="user", content="测试去重逻辑")

    hits = db.search_messages("测试")

    # A single stored message must yield a single result row.
    assert len(hits) == 1
# =========================================================================
# Session search and listing