mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: FTS5 LIKE fallback for CJK (Chinese/Japanese/Korean) queries
The FTS5 default tokenizer does not segment CJK text into words, so multi-character queries such as '记忆断裂' return 0 results. This fix adds a LIKE fallback: when FTS5 returns no results and the query contains CJK characters, the search is retried with WHERE content LIKE '%query%'. FTS5 performance is preserved for English queries, because the fallback only runs for CJK queries that produced no FTS5 matches. Fixes #11511.
This commit is contained in:
parent
a2c9f5d0a7
commit
8826d9c197
1 changed files with 52 additions and 2 deletions
|
|
@ -987,6 +987,22 @@ class SessionDB:
|
||||||
|
|
||||||
return sanitized.strip()
|
return sanitized.strip()
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
def _contains_cjk(text: str) -> bool:
    """Return True if *text* contains at least one CJK character.

    "CJK" here means Chinese ideographs, Japanese kana, and Korean Hangul,
    plus the CJK Symbols and Punctuation block. The check is a pure
    code-point range test; it does not look at language or context.

    Args:
        text: The string to inspect. An empty string yields False.

    Returns:
        True as soon as any character falls inside one of the ranges
        below, otherwise False.
    """
    # Inclusive (first, last) code-point ranges. Kept as a table so new
    # blocks can be added without touching the control flow.
    cjk_ranges = (
        (0x1100, 0x11FF),    # Hangul Jamo
        (0x3000, 0x303F),    # CJK Symbols and Punctuation
        (0x3040, 0x309F),    # Hiragana
        (0x30A0, 0x30FF),    # Katakana
        (0x3130, 0x318F),    # Hangul Compatibility Jamo
        (0x31F0, 0x31FF),    # Katakana Phonetic Extensions
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0xAC00, 0xD7AF),    # Hangul Syllables
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0xFF66, 0xFF9F),    # Halfwidth Katakana
        (0xFFA0, 0xFFDC),    # Halfwidth Hangul
        # Supplementary and Tertiary Ideographic Planes: Extension B and
        # every later ideograph extension live here, so the whole planes
        # are accepted rather than enumerating each extension.
        (0x20000, 0x3FFFF),
    )
    for ch in text:
        cp = ord(ch)
        # Cheap early-out: everything below U+1100 is never CJK, which
        # keeps plain ASCII/Latin queries from scanning the table.
        if cp < 0x1100:
            continue
        if any(first <= cp <= last for first, last in cjk_ranges):
            return True
    return False
|
||||||
|
|
||||||
def search_messages(
|
def search_messages(
|
||||||
self,
|
self,
|
||||||
query: str,
|
query: str,
|
||||||
|
|
@ -1062,8 +1078,42 @@ class SessionDB:
|
||||||
cursor = self._conn.execute(sql, params)
|
cursor = self._conn.execute(sql, params)
|
||||||
except sqlite3.OperationalError:
|
except sqlite3.OperationalError:
|
||||||
# FTS5 query syntax error despite sanitization — return empty
|
# FTS5 query syntax error despite sanitization — return empty
|
||||||
return []
|
# unless query contains CJK (fall back to LIKE below)
|
||||||
matches = [dict(row) for row in cursor.fetchall()]
|
if not self._contains_cjk(query):
|
||||||
|
return []
|
||||||
|
matches = []
|
||||||
|
else:
|
||||||
|
matches = [dict(row) for row in cursor.fetchall()]
|
||||||
|
|
||||||
|
# LIKE fallback for CJK queries: FTS5 default tokenizer splits CJK
|
||||||
|
# characters individually, causing multi-character queries to fail.
|
||||||
|
if not matches and self._contains_cjk(query):
|
||||||
|
raw_query = query.strip('"').strip()
|
||||||
|
like_where = ["m.content LIKE ?"]
|
||||||
|
like_params: list = [f"%{raw_query}%"]
|
||||||
|
if source_filter is not None:
|
||||||
|
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||||
|
like_params.extend(source_filter)
|
||||||
|
if exclude_sources is not None:
|
||||||
|
like_where.append(f"s.source NOT IN ({','.join('?' for _ in exclude_sources)})")
|
||||||
|
like_params.extend(exclude_sources)
|
||||||
|
if role_filter:
|
||||||
|
like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
|
||||||
|
like_params.extend(role_filter)
|
||||||
|
like_sql = f"""
|
||||||
|
SELECT m.id, m.session_id, m.role, m.content AS snippet,
|
||||||
|
m.content, m.timestamp, m.tool_name,
|
||||||
|
s.source, s.model, s.started_at AS session_started
|
||||||
|
FROM messages m
|
||||||
|
JOIN sessions s ON s.id = m.session_id
|
||||||
|
WHERE {' AND '.join(like_where)}
|
||||||
|
ORDER BY m.timestamp DESC
|
||||||
|
LIMIT ? OFFSET ?
|
||||||
|
"""
|
||||||
|
like_params.extend([limit, offset])
|
||||||
|
with self._lock:
|
||||||
|
like_cursor = self._conn.execute(like_sql, like_params)
|
||||||
|
matches = [dict(row) for row in like_cursor.fetchall()]
|
||||||
|
|
||||||
# Add surrounding context (1 message before + after each match).
|
# Add surrounding context (1 message before + after each match).
|
||||||
# Done outside the lock so we don't hold it across N sequential queries.
|
# Done outside the lock so we don't hold it across N sequential queries.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue