diff --git a/hermes_state.py b/hermes_state.py index 58511b2eab4..913563f69b8 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -1958,7 +1958,19 @@ class SessionDB: raw_query = query.strip('"').strip() cjk_count = self._count_cjk(raw_query) - if cjk_count >= 3: + # Per-token CJK length check (#20494): trigram needs >=3 CJK chars + # per token. A query like "广西 OR 桂林 OR 漓江" has cjk_count=6 + # (>=3) but each individual token is only 2 chars — trigram returns 0. + # Route to LIKE when any non-operator CJK token is <3 CJK chars. + _tokens_for_check = [ + t for t in raw_query.split() + if t.upper() not in ("AND", "OR", "NOT") and self._contains_cjk(t) + ] + _any_short_cjk = any( + self._count_cjk(t) < 3 for t in _tokens_for_check + ) + + if cjk_count >= 3 and not _any_short_cjk: # Trigram FTS5 path — quote each non-operator token to handle # FTS5 special chars (%, *, etc.) while preserving boolean # operators (AND, OR, NOT) for multi-term queries. @@ -2009,11 +2021,24 @@ class SessionDB: else: matches = [dict(row) for row in tri_cursor.fetchall()] else: - # Short CJK query (1-2 chars) — trigram needs ≥3 CJK chars. - # Fall back to LIKE substring search. - escaped = raw_query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") - like_where = ["(m.content LIKE ? ESCAPE '\\' OR m.tool_name LIKE ? ESCAPE '\\' OR m.tool_calls LIKE ? ESCAPE '\\')"] - like_params: list = [f"%{escaped}%", f"%{escaped}%", f"%{escaped}%"] + # Short / mixed CJK query: trigram cannot match tokens with + # <3 CJK chars. Fall back to LIKE substring search. + # For multi-token OR queries (e.g. "广西 OR 桂林 OR 漓江"), + # build one LIKE condition per non-operator token so each term + # is matched independently (#20494). + non_op_tokens = [ + t for t in raw_query.split() + if t.upper() not in ("AND", "OR", "NOT") + ] or [raw_query] + token_clauses = [] + like_params: list = [] + for tok in non_op_tokens: + esc = tok.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + token_clauses.append( + "(m.content LIKE ? ESCAPE '\\' OR m.tool_name LIKE ? ESCAPE '\\' OR m.tool_calls LIKE ? ESCAPE '\\')" + ) + like_params += [f"%{esc}%", f"%{esc}%", f"%{esc}%"] + like_where = [f"({' OR '.join(token_clauses)})"] if source_filter is not None: like_where.append(f"s.source IN ({','.join('?' for _ in source_filter)})") like_params.extend(source_filter) @@ -2037,8 +2062,8 @@ class SessionDB: LIMIT ? OFFSET ? """ like_params.extend([limit, offset]) - # instr() parameter goes first in the bound list - like_params = [raw_query] + like_params + # instr() for snippet uses first search token + like_params = [non_op_tokens[0]] + like_params with self._lock: like_cursor = self._conn.execute(like_sql, like_params) matches = [dict(row) for row in like_cursor.fetchall()] diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index 55249406683..3bae763b941 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -957,6 +957,39 @@ class TestCJKSearchFallback: session_ids = {r["session_id"] for r in results} assert session_ids == {"s1", "s2"} + def test_cjk_or_combined_short_tokens_returns_results(self, db): + """Regression test for #20494. + + OR-combined 2-char CJK tokens (e.g. "广西 OR 桂林 OR 漓江 OR 旅游") + previously returned 0 results because _count_cjk of the whole query + was >=3 (8 chars here), selecting the trigram path, but each individual + token is only 2 CJK chars and trigram requires >=3 chars per token. + The per-token check must route such queries to the LIKE fallback. + """ + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="telegram") + db.create_session(session_id="s3", source="cli") + db.append_message("s1", role="user", content="广西是个好地方,去过桂林") + db.append_message("s2", role="user", content="漓江风景很美,值得旅游") + db.append_message("s3", role="user", content="unrelated English content") + + results = db.search_messages("广西 OR 桂林 OR 漓江 OR 旅游") + session_ids = {r["session_id"] for r in results} + assert "s1" in session_ids, "广西/桂林 terms not matched" + assert "s2" in session_ids, "漓江/旅游 terms not matched" + assert "s3" not in session_ids, "unrelated message must not match" + + def test_cjk_short_token_or_query_preserves_filters(self, db): + """Source filter applies correctly in the short-token LIKE path (#20494).""" + db.create_session(session_id="s1", source="cli") + db.create_session(session_id="s2", source="telegram") + db.append_message("s1", role="user", content="广西旅游攻略cli") + db.append_message("s2", role="user", content="广西旅游攻略telegram") + + results = db.search_messages("广西 OR 旅游", source_filter=["telegram"]) + assert len(results) == 1 + assert results[0]["source"] == "telegram" + # ========================================================================= # Session search and listing