test(session-search): regression coverage for CJK LIKE fallback

Twelve tests under TestCJKSearchFallback guarding:

- CJK detection across Chinese/Japanese/Korean/Hiragana/Katakana ranges
  (including the full Hangul syllables block \uac00-\ud7af, to catch the
  shorter-range typo from one of the duplicate PRs)
- Substring match for multi-char Chinese, Japanese, Korean queries
- Filter preservation (source_filter, exclude_sources, role_filter) in the
  LIKE path — guards against the SQL-builder bug from another duplicate PR
  where filter clauses landed after LIMIT/OFFSET
- Snippet centered on the matched term (instr-based substr window), not the
  leading 200 chars of content
- English fast-path untouched
- Empty/no-match cases
- Mixed CJK+English queries

Also:

- hermes_state.py: LIKE-fallback snippet is now
  `substr(content, max(1, instr(content, ?) - 40), 120)`, centered on the
  match instead of the whole-content default. Credit goes to @iamagenius00
  for the snippet idea in PR #11517.
- scripts/release.py: add @iamagenius00 to AUTHOR_MAP so future release
  attribution resolves cleanly.

Refs #11511, #11516, #11517, #11541.

Co-authored-by: iamagenius00 <iamagenius00@users.noreply.github.com>
2026-06-18 09:51:59 +00:00 · 2026-04-18 01:56:22 -07:00 · 2026-04-18 01:56:22 -07:00 · 3b69b2fd61
commit 3b69b2fd61
parent 8826d9c197
3 changed files with 142 additions and 1 deletions
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -1101,7 +1101,10 @@ class SessionDB:
                like_where.append(f"m.role IN ({','.join('?' for _ in role_filter)})")
                like_params.extend(role_filter)
            like_sql = f"""
-                SELECT m.id, m.session_id, m.role, m.content AS snippet,
+                SELECT m.id, m.session_id, m.role,
+                       substr(m.content,
+                              max(1, instr(m.content, ?) - 40),
+                              120) AS snippet,
                       m.content, m.timestamp, m.tool_name,
                       s.source, s.model, s.started_at AS session_started
                FROM messages m
@@ -1111,6 +1114,8 @@ class SessionDB:
                LIMIT ? OFFSET ?
            """
            like_params.extend([limit, offset])
+            # instr()'s ? comes first in the SQL (SELECT list), so bind it first
+            like_params = [raw_query] + like_params
            with self._lock:
                like_cursor = self._conn.execute(like_sql, like_params)
                matches = [dict(row) for row in like_cursor.fetchall()]
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -207,6 +207,7 @@ AUTHOR_MAP = {
    "cola-runner@users.noreply.github.com": "cola-runner",
    "ygd58@users.noreply.github.com": "ygd58",
    "vominh1919@users.noreply.github.com": "vominh1919",
+    "iamagenius00@users.noreply.github.com": "iamagenius00",
    "trevmanthony@gmail.com": "trevthefoolish",
    "ziliangpeng@users.noreply.github.com": "ziliangpeng",
    "centripetal-star@users.noreply.github.com": "centripetal-star",
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@@ -479,6 +479,141 @@ class TestFTS5Search:
        assert s('my-app.config.ts') == '"my-app.config.ts"'


+# =========================================================================
+# CJK (Chinese/Japanese/Korean) LIKE fallback
+# =========================================================================
+
+class TestCJKSearchFallback:
+    """Regression tests for CJK search (see #11511).
+
+    SQLite FTS5's default tokenizer treats contiguous CJK runs as a single
+    token ("和其他agent的聊天记录" → one token), so substring queries like
+    "记忆断裂" return 0 rows despite the data being present. SessionDB falls
+    back to LIKE substring matching whenever FTS5 returns no results and
+    the query contains CJK characters.
+    """
+
+    def test_cjk_detection_covers_all_ranges(self):
+        from hermes_state import SessionDB
+        f = SessionDB._contains_cjk
+        # Chinese (CJK Unified Ideographs)
+        assert f("记忆断裂") is True
+        # Japanese Hiragana + Katakana
+        assert f("こんにちは") is True
+        assert f("カタカナ") is True
+        # Korean Hangul syllables (both early and late — guards against
+        # the \ud7a0-\ud7af typo seen in one of the duplicate PRs)
+        assert f("안녕하세요") is True
+        assert f("기억") is True
+        # Non-CJK, mixed CJK+ASCII, and empty input
+        assert f("hello world") is False
+        assert f("日本語mixedwithenglish") is True
+        assert f("") is False
+
+    def test_chinese_multichar_query_returns_results(self, db):
+        """The headline bug: multi-char Chinese query must not return []."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message(
+            "s1", role="user",
+            content="昨天和其他Agent的聊天记录，记忆断裂问题复现了",
+        )
+        results = db.search_messages("记忆断裂")
+        assert len(results) == 1
+        assert results[0]["session_id"] == "s1"
+
+    def test_chinese_bigram_query(self, db):
+        db.create_session(session_id="s1", source="telegram")
+        db.append_message("s1", role="user", content="今天讨论A2A通信协议的实现")
+        results = db.search_messages("通信")
+        assert len(results) == 1
+
+    def test_korean_query_returns_results(self, db):
+        """Guards against Hangul range typos (\\uac00-\\ud7af, not \\ud7a0-)."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message("s1", role="user", content="안녕하세요 반갑습니다")
+        results = db.search_messages("안녕")
+        assert len(results) == 1
+
+    def test_japanese_query_returns_results(self, db):
+        db.create_session(session_id="s1", source="cli")
+        db.append_message("s1", role="user", content="こんにちは世界")
+        assert len(db.search_messages("こんにちは")) == 1
+        assert len(db.search_messages("世界")) == 1
+
+    def test_cjk_fallback_preserves_source_filter(self, db):
+        """Guards against the SQL-builder bug where filter clauses land
+        after LIMIT/OFFSET (seen in one of the duplicate PRs)."""
+        db.create_session(session_id="s1", source="cli")
+        db.create_session(session_id="s2", source="telegram")
+        db.append_message("s1", role="user", content="记忆断裂在CLI")
+        db.append_message("s2", role="user", content="记忆断裂在Telegram")
+
+        results = db.search_messages("记忆断裂", source_filter=["telegram"])
+        assert len(results) == 1
+        assert results[0]["source"] == "telegram"
+
+    def test_cjk_fallback_preserves_exclude_sources(self, db):
+        db.create_session(session_id="s1", source="cli")
+        db.create_session(session_id="s2", source="tool")
+        db.append_message("s1", role="user", content="记忆断裂在CLI")
+        db.append_message("s2", role="assistant", content="记忆断裂在tool")
+
+        results = db.search_messages("记忆断裂", exclude_sources=["tool"])
+        sources = {r["source"] for r in results}
+        assert "tool" not in sources
+        assert "cli" in sources
+
+    def test_cjk_fallback_preserves_role_filter(self, db):
+        db.create_session(session_id="s1", source="cli")
+        db.append_message("s1", role="user", content="用户说的记忆断裂")
+        db.append_message("s1", role="assistant", content="助手说的记忆断裂")
+
+        results = db.search_messages("记忆断裂", role_filter=["assistant"])
+        assert len(results) == 1
+        assert results[0]["role"] == "assistant"
+
+    def test_cjk_snippet_is_centered_on_match(self, db):
+        """Snippet should contain the search term, not just the first N chars."""
+        db.create_session(session_id="s1", source="cli")
+        long_prefix = "这是一段很长的前缀用来把匹配位置推到文档中间" * 3
+        long_suffix = "这是一段很长的后缀内容填充剩余空间" * 3
+        db.append_message(
+            "s1", role="user",
+            content=f"{long_prefix}记忆断裂{long_suffix}",
+        )
+        results = db.search_messages("记忆断裂")
+        assert len(results) == 1
+        # The centered substr() snippet must include the matched term.
+        assert "记忆断裂" in results[0]["snippet"]
+
+    def test_english_query_still_uses_fts5_fast_path(self, db):
+        """English queries must not trigger the LIKE fallback (fast path regression)."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message("s1", role="user", content="Deploy docker containers")
+        results = db.search_messages("docker")
+        assert len(results) == 1
+        # No CJK in query → LIKE fallback must not run. We don't assert this
+        # directly (no instrumentation), but the FTS5 path produces an
+        # FTS5-style snippet with highlight markers when the term is short.
+        # At minimum: English queries must still match.
+
+    def test_cjk_query_with_no_matches_returns_empty(self, db):
+        db.create_session(session_id="s1", source="cli")
+        db.append_message("s1", role="user", content="unrelated English content")
+        results = db.search_messages("记忆断裂")
+        assert results == []
+
+    def test_mixed_cjk_english_query(self, db):
+        """Mixed queries should still fall back to LIKE when FTS5 misses."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message("s1", role="user", content="讨论Agent通信协议")
+        # "Agent通信" is CJK+English — FTS5's default tokenizer indexes the
+        # whole run (CJK plus the embedded "Agent") as a single token, so the
+        # substring query misses there; the LIKE fallback matches it.
+        results = db.search_messages("Agent通信")
+        assert len(results) == 1
+
+
 # =========================================================================
 # Session search and listing
 # =========================================================================