mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-02 02:01:47 +00:00
fix(session_search): supplement FTS5 with LIKE for CJK partial results (#14829)
FTS5's unicode61 tokenizer drops certain CJK characters, so a query such as '昨晚' can return only 12.5% of the actual matches. The existing LIKE fallback triggers only when FTS5 returns zero results, so it misses the common case where FTS5 returns *some* but not all matches.

Change the LIKE path from a fallback (only on empty results) to a supplement (always runs for CJK queries). Results are merged and deduplicated by message id, so FTS5 results are preserved and LIKE fills in what FTS5 missed.

- Always run LIKE for CJK queries, not just on zero FTS5 results
- Deduplicate merged results by message id
- Add regression test for partial-result supplementation
- Add deduplication correctness test
This commit is contained in:
parent
05d8f11085
commit
7859a09ce9
2 changed files with 38 additions and 4 deletions
|
|
@ -1308,9 +1308,10 @@ class SessionDB:
|
|||
else:
|
||||
matches = [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
# LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK
|
||||
# characters individually, causing multi-character queries to fail.
|
||||
if not matches and self._contains_cjk(query):
|
||||
# LIKE supplement for CJK queries: FTS5 unicode61 tokenizer drops
|
||||
# many CJK characters, causing partial or empty results. Always run
|
||||
# LIKE for CJK queries and merge with FTS5 results (dedup by id).
|
||||
if self._contains_cjk(query):
|
||||
raw_query = query.strip('"').strip()
|
||||
like_where = ["m.content LIKE ?"]
|
||||
like_params: list = [f"%{raw_query}%"]
|
||||
|
|
@ -1341,7 +1342,13 @@ class SessionDB:
|
|||
like_params = [raw_query] + like_params
|
||||
with self._lock:
|
||||
like_cursor = self._conn.execute(like_sql, like_params)
|
||||
matches = [dict(row) for row in like_cursor.fetchall()]
|
||||
like_matches = [dict(row) for row in like_cursor.fetchall()]
|
||||
# Merge: deduplicate by message id, LIKE results supplement FTS5
|
||||
seen_ids = {m["id"] for m in matches}
|
||||
for m in like_matches:
|
||||
if m["id"] not in seen_ids:
|
||||
matches.append(m)
|
||||
seen_ids.add(m["id"])
|
||||
|
||||
# Add surrounding context (1 message before + after each match).
|
||||
# Done outside the lock so we don't hold it across N sequential queries.
|
||||
|
|
|
|||
|
|
@ -716,6 +716,33 @@ class TestCJKSearchFallback:
|
|||
results = db.search_messages("Agent通信")
|
||||
assert len(results) == 1
|
||||
|
||||
def test_cjk_partial_fts5_results_supplemented_by_like(self, db):
    """LIKE must fill in CJK matches when FTS5 returns only some of them.

    Regression test for #14829: the FTS5 unicode61 tokenizer drops certain
    CJK characters, so a multi-character query may come back with partial
    results. The LIKE path has to run for every CJK query and be merged
    with whatever FTS5 produced.
    """
    # (session id, source, message content) — each message contains "昨晚".
    # FTS5 may index one but not the other depending on tokenizer quirks.
    fixtures = (
        ("s1", "cli", "昨晚讨论了记忆系统"),
        ("s2", "telegram", "昨晚的会议纪要已发送"),
    )
    # Create both sessions first, then append, in the same order as listed.
    for sid, origin, _ in fixtures:
        db.create_session(session_id=sid, source=origin)
    for sid, _, text in fixtures:
        db.append_message(sid, role="user", content=text)

    hits = db.search_messages("昨晚")

    # Both messages must be found even if FTS5 only returns one (or zero),
    # and deduplication must keep the total at exactly two.
    assert len(hits) == 2
    assert {hit["session_id"] for hit in hits} == {"s1", "s2"}
|
||||
|
||||
def test_cjk_like_dedup_no_duplicates(self, db):
    """A message found by both FTS5 and LIKE is reported only once."""
    sid = "s1"
    db.create_session(session_id=sid, source="cli")
    db.append_message(sid, role="user", content="测试去重逻辑")

    hits = db.search_messages("测试")

    # One stored message, so exactly one result regardless of which
    # search path(s) matched it.
    assert len(hits) == 1
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Session search and listing
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue