mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(session): route OR-combined short CJK tokens to LIKE fallback (#20494)
The FTS5 trigram tokenizer requires >=3 CJK characters per individual token to produce matchable trigrams. A query like "广西 OR 桂林 OR 漓江" has cjk_count=6 (which passes the existing >=3 guard), but each token is only 2 CJK chars, so the trigram index returns 0 results.

Fix:
- Add a per-token check: if any non-operator CJK token has <3 CJK chars, force the LIKE fallback path regardless of the total cjk_count.
- Expand the LIKE fallback to build one LIKE condition per non-operator token, joined with OR, so each term is matched independently.

Regression tests added in TestCJKSearchFallback:
- test_cjk_or_combined_short_tokens_returns_results
- test_cjk_short_token_or_query_preserves_filters
This commit is contained in:
parent
35f773c459
commit
058c50816c
2 changed files with 66 additions and 8 deletions
|
|
@ -1958,7 +1958,19 @@ class SessionDB:
|
||||||
raw_query = query.strip('"').strip()
|
raw_query = query.strip('"').strip()
|
||||||
cjk_count = self._count_cjk(raw_query)
|
cjk_count = self._count_cjk(raw_query)
|
||||||
|
|
||||||
if cjk_count >= 3:
|
# Per-token CJK length check (#20494): trigram needs >=3 CJK chars
|
||||||
|
# per token. A query like "广西 OR 桂林 OR 漓江" has cjk_count=6
|
||||||
|
# (>=3) but each individual token is only 2 chars — trigram returns 0.
|
||||||
|
# Route to LIKE when any non-operator CJK token is <3 CJK chars.
|
||||||
|
_tokens_for_check = [
|
||||||
|
t for t in raw_query.split()
|
||||||
|
if t.upper() not in ("AND", "OR", "NOT") and self._contains_cjk(t)
|
||||||
|
]
|
||||||
|
_any_short_cjk = any(
|
||||||
|
self._count_cjk(t) < 3 for t in _tokens_for_check
|
||||||
|
)
|
||||||
|
|
||||||
|
if cjk_count >= 3 and not _any_short_cjk:
|
||||||
# Trigram FTS5 path — quote each non-operator token to handle
|
# Trigram FTS5 path — quote each non-operator token to handle
|
||||||
# FTS5 special chars (%, *, etc.) while preserving boolean
|
# FTS5 special chars (%, *, etc.) while preserving boolean
|
||||||
# operators (AND, OR, NOT) for multi-term queries.
|
# operators (AND, OR, NOT) for multi-term queries.
|
||||||
|
|
@ -2009,11 +2021,24 @@ class SessionDB:
|
||||||
else:
|
else:
|
||||||
matches = [dict(row) for row in tri_cursor.fetchall()]
|
matches = [dict(row) for row in tri_cursor.fetchall()]
|
||||||
else:
|
else:
|
||||||
# Short CJK query (1-2 chars) — trigram needs ≥3 CJK chars.
|
# Short / mixed CJK query: trigram cannot match tokens with
|
||||||
# Fall back to LIKE substring search.
|
# <3 CJK chars. Fall back to LIKE substring search.
|
||||||
escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
|
# For multi-token OR queries (e.g. "广西 OR 桂林 OR 漓江"),
|
||||||
like_where = ["(m.content LIKE ? ESCAPE '\\' OR m.tool_name LIKE ? ESCAPE '\\' OR m.tool_calls LIKE ? ESCAPE '\\')"]
|
# build one LIKE condition per non-operator token so each term
|
||||||
like_params: list = [f"%{escaped}%", f"%{escaped}%", f"%{escaped}%"]
|
# is matched independently (#20494).
|
||||||
|
non_op_tokens = [
|
||||||
|
t for t in raw_query.split()
|
||||||
|
if t.upper() not in ("AND", "OR", "NOT")
|
||||||
|
] or [raw_query]
|
||||||
|
token_clauses = []
|
||||||
|
like_params: list = []
|
||||||
|
for tok in non_op_tokens:
|
||||||
|
esc = tok.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
|
||||||
|
token_clauses.append(
|
||||||
|
"(m.content LIKE ? ESCAPE '\\' OR m.tool_name LIKE ? ESCAPE '\\' OR m.tool_calls LIKE ? ESCAPE '\\')"
|
||||||
|
)
|
||||||
|
like_params += [f"%{esc}%", f"%{esc}%", f"%{esc}%"]
|
||||||
|
like_where = [f"({' OR '.join(token_clauses)})"]
|
||||||
if source_filter is not None:
|
if source_filter is not None:
|
||||||
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})")
|
||||||
like_params.extend(source_filter)
|
like_params.extend(source_filter)
|
||||||
|
|
@ -2037,8 +2062,8 @@ class SessionDB:
|
||||||
LIMIT ? OFFSET ?
|
LIMIT ? OFFSET ?
|
||||||
"""
|
"""
|
||||||
like_params.extend([limit, offset])
|
like_params.extend([limit, offset])
|
||||||
# instr() parameter goes first in the bound list
|
# instr() for snippet uses first search token
|
||||||
like_params = [raw_query] + like_params
|
like_params = [non_op_tokens[0]] + like_params
|
||||||
with self._lock:
|
with self._lock:
|
||||||
like_cursor = self._conn.execute(like_sql, like_params)
|
like_cursor = self._conn.execute(like_sql, like_params)
|
||||||
matches = [dict(row) for row in like_cursor.fetchall()]
|
matches = [dict(row) for row in like_cursor.fetchall()]
|
||||||
|
|
|
||||||
|
|
@ -957,6 +957,39 @@ class TestCJKSearchFallback:
|
||||||
session_ids = {r["session_id"] for r in results}
|
session_ids = {r["session_id"] for r in results}
|
||||||
assert session_ids == {"s1", "s2"}
|
assert session_ids == {"s1", "s2"}
|
||||||
|
|
||||||
|
def test_cjk_or_combined_short_tokens_returns_results(self, db):
|
||||||
|
"""Regression test for #20494.
|
||||||
|
|
||||||
|
OR-combined 2-char CJK tokens (e.g. "广西 OR 桂林 OR 漓江 OR 旅游")
|
||||||
|
previously returned 0 results because _count_cjk of the whole query
|
||||||
|
was >=3 (8 chars here), selecting the trigram path, but each individual
|
||||||
|
token is only 2 CJK chars and trigram requires >=3 chars per token.
|
||||||
|
The per-token check must route such queries to the LIKE fallback.
|
||||||
|
"""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.create_session(session_id="s2", source="telegram")
|
||||||
|
db.create_session(session_id="s3", source="cli")
|
||||||
|
db.append_message("s1", role="user", content="广西是个好地方,去过桂林")
|
||||||
|
db.append_message("s2", role="user", content="漓江风景很美,值得旅游")
|
||||||
|
db.append_message("s3", role="user", content="unrelated English content")
|
||||||
|
|
||||||
|
results = db.search_messages("广西 OR 桂林 OR 漓江 OR 旅游")
|
||||||
|
session_ids = {r["session_id"] for r in results}
|
||||||
|
assert "s1" in session_ids, "广西/桂林 terms not matched"
|
||||||
|
assert "s2" in session_ids, "漓江/旅游 terms not matched"
|
||||||
|
assert "s3" not in session_ids, "unrelated message must not match"
|
||||||
|
|
||||||
|
def test_cjk_short_token_or_query_preserves_filters(self, db):
|
||||||
|
"""Source filter applies correctly in the short-token LIKE path (#20494)."""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.create_session(session_id="s2", source="telegram")
|
||||||
|
db.append_message("s1", role="user", content="广西旅游攻略cli")
|
||||||
|
db.append_message("s2", role="user", content="广西旅游攻略telegram")
|
||||||
|
|
||||||
|
results = db.search_messages("广西 OR 旅游", source_filter=["telegram"])
|
||||||
|
assert len(results) == 1
|
||||||
|
assert results[0]["source"] == "telegram"
|
||||||
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session search and listing
|
# Session search and listing
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue