From 7859a09ce99755bdb9abce0a02dde7b0b4ffd9d8 Mon Sep 17 00:00:00 2001 From: kagura-agent Date: Fri, 24 Apr 2026 09:18:31 +0800 Subject: [PATCH] fix(session_search): supplement FTS5 with LIKE for CJK partial results (#14829) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FTS5's unicode61 tokenizer does not segment CJK text: an unbroken run of CJK characters is indexed as a single token, so queries like '昨晚' match only where the term stands alone between separators and return only 12.5% of actual matches. The existing LIKE fallback only triggers when FTS5 returns zero results, missing the common case where FTS5 returns *some* but not all matches. Change the LIKE path from a fallback (only on empty results) to a supplement (always runs for CJK queries). Results are merged with deduplication by message id, so FTS5 results are preserved and LIKE fills in what FTS5 missed. - Always run LIKE for CJK queries, not just on zero FTS5 results - Deduplicate merged results by message id - Add regression test for partial-result supplementation - Add deduplication correctness test --- hermes_state.py | 15 +++++++++++---- tests/test_hermes_state.py | 27 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index ed95d25f45..b3c74d5bf4 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -1308,9 +1308,10 @@ class SessionDB: else: matches = [dict(row) for row in cursor.fetchall()] - # LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK - # characters individually, causing multi-character queries to fail. - if not matches and self._contains_cjk(query): + # LIKE supplement for CJK queries: FTS5 unicode61 tokenizer drops + # many CJK characters, causing partial or empty results. Always run + # LIKE for CJK queries and merge with FTS5 results (dedup by id). 
+ if self._contains_cjk(query): raw_query = query.strip('"').strip() like_where = ["m.content LIKE ?"] like_params: list = [f"%{raw_query}%"] @@ -1341,7 +1342,13 @@ class SessionDB: like_params = [raw_query] + like_params with self._lock: like_cursor = self._conn.execute(like_sql, like_params) - matches = [dict(row) for row in like_cursor.fetchall()] + like_matches = [dict(row) for row in like_cursor.fetchall()] + # Merge: deduplicate by message id, LIKE results supplement FTS5 + seen_ids = {m["id"] for m in matches} + for m in like_matches: + if m["id"] not in seen_ids: + matches.append(m) + seen_ids.add(m["id"]) # Add surrounding context (1 message before + after each match). # Done outside the lock so we don't hold it across N sequential queries. diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index f405cf8bd5..d43fb687d1 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -716,6 +716,33 @@ class TestCJKSearchFallback: results = db.search_messages("Agent通信") assert len(results) == 1 + def test_cjk_partial_fts5_results_supplemented_by_like(self, db): + """When FTS5 returns *some* CJK results, LIKE must supplement them. + + Regression test for #14829: FTS5 unicode61 tokenizer drops certain + CJK characters, so multi-character queries may return partial results. + The LIKE path must always run for CJK queries and merge results. + """ + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="telegram") + # Insert messages containing the same CJK substring. + # FTS5 may index one but not the other depending on tokenizer quirks. + db.append_message("s1", role="user", content="昨晚讨论了记忆系统") + db.append_message("s2", role="user", content="昨晚的会议纪要已发送") + results = db.search_messages("昨晚") + # Both messages contain "昨晚" — LIKE must find both even if FTS5 + # only returns one (or zero). Dedup ensures no duplicates. 
+ assert len(results) == 2 + session_ids = {r["session_id"] for r in results} + assert session_ids == {"s1", "s2"} + + def test_cjk_like_dedup_no_duplicates(self, db): + """When FTS5 and LIKE both find the same message, no duplicates.""" + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="测试去重逻辑") + results = db.search_messages("测试") + assert len(results) == 1 + # ========================================================================= # Session search and listing