fix: FTS5 LIKE fallback for CJK (Chinese/Japanese/Korean) queries

FTS5 default tokenizer splits CJK text character-by-character, causing
multi-character queries like '记忆断裂' to return 0 results.

This fix adds a LIKE fallback: when FTS5 returns no results and the
query contains CJK characters, retry with WHERE content LIKE '%query%'.
Preserves FTS5 performance for English queries.

Fixes #11511
This commit is contained in:
vominh1919 2026-04-17 16:35:02 +07:00 committed by Teknium
parent a2c9f5d0a7
commit 8826d9c197

View file

@ -987,6 +987,22 @@ class SessionDB:
return sanitized.strip() return sanitized.strip()
@staticmethod
def _contains_cjk(text: str) -> bool:
"""Check if text contains CJK (Chinese, Japanese, Korean) characters."""
for ch in text:
cp = ord(ch)
if (0x4E00 <= cp <= 0x9FFF or # CJK Unified Ideographs
0x3400 <= cp <= 0x4DBF or # CJK Extension A
0x20000 <= cp <= 0x2A6DF or # CJK Extension B
0x3000 <= cp <= 0x303F or # CJK Symbols
0x3040 <= cp <= 0x309F or # Hiragana
0x30A0 <= cp <= 0x30FF or # Katakana
0xAC00 <= cp <= 0xD7AF): # Hangul Syllables
return True
return False
def search_messages( def search_messages(
self, self,
query: str, query: str,
@ -1062,8 +1078,42 @@ class SessionDB:
cursor = self._conn.execute(sql, params) cursor = self._conn.execute(sql, params)
except sqlite3.OperationalError: except sqlite3.OperationalError:
# FTS5 query syntax error despite sanitization — return empty # FTS5 query syntax error despite sanitization — return empty
return [] # unless query contains CJK (fall back to LIKE below)
matches = [dict(row) for row in cursor.fetchall()] if not self._contains_cjk(query):
return []
matches = []
else:
matches = [dict(row) for row in cursor.fetchall()]
# LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK
# characters individually, causing multi-character queries to fail.
if not matches and self._contains_cjk(query):
raw_query = query.strip('"').strip()
like_where = ["m.content LIKE ?"]
like_params: list = [f"%{raw_query}%"]
if source_filter is not None:
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
like_params.extend(source_filter)
if exclude_sources is not None:
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
like_params.extend(exclude_sources)
if role_filter:
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
like_params.extend(role_filter)
like_sql = f"""
SELECT m.id, m.session_id, m.role, m.content AS snippet,
m.content, m.timestamp, m.tool_name,
s.source, s.model, s.started_at AS session_started
FROM messages m
JOIN sessions s ON s.id = m.session_id
WHERE {' AND '.join(like_where)}
ORDER BY m.timestamp DESC
LIMIT ? OFFSET ?
"""
like_params.extend([limit, offset])
with self._lock:
like_cursor = self._conn.execute(like_sql, like_params)
matches = [dict(row) for row in like_cursor.fetchall()]
# Add surrounding context (1 message before + after each match). # Add surrounding context (1 message before + after each match).
# Done outside the lock so we don't hold it across N sequential queries. # Done outside the lock so we don't hold it across N sequential queries.