From c10aa5dc9c69e8e2cc03178be4b189844df29965 Mon Sep 17 00:00:00 2001 From: liuhao1024 Date: Tue, 16 Jun 2026 12:47:07 +0800 Subject: [PATCH] fix(agent): address review feedback on trigram tokenizer fallback - Scope 'no such tokenizer' matcher to trigram specifically (#779) - Decouple base FTS and trigram backfill in v11 migration (#1195) - CJK search falls back to LIKE when trigram unavailable (#3384/#3430) - Add _trigram_available tracking across init, migration, and startup - Add regression tests for migration backfill and CJK LIKE fallback - Add _is_trigram_unavailable_error and _warn_trigram_unavailable helpers --- hermes_state.py | 76 ++++++++++++++++++++++++---------- tests/test_hermes_state.py | 84 +++++++++++++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 22 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index f54fbbd6af5..99cb24748e6 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -684,6 +684,7 @@ class SessionDB: self._lock = threading.Lock() self._write_count = 0 self._fts_enabled = False + self._trigram_available = False self._fts_unavailable_warned = False self._conn = None try: @@ -776,14 +777,29 @@ class SessionDB: return True # SQLite builds that have FTS5 but lack the optional trigram tokenizer # raise "no such tokenizer: trigram" instead of "no such module". - if "no such tokenizer" in err: + # Scope to trigram specifically to avoid masking unrelated tokenizer errors. + if "no such tokenizer: trigram" in err: return True return False @staticmethod - def _is_tokenizer_unavailable_error(exc: sqlite3.OperationalError) -> bool: - """Check if the error is about a specific tokenizer (not the whole FTS5 module).""" - return "no such tokenizer" in str(exc).lower() + def _is_trigram_unavailable_error(exc: sqlite3.OperationalError) -> bool: + """True when only the trigram tokenizer is missing (FTS5 itself works).""" + return "no such tokenizer: trigram" in str(exc).lower() + + def _warn_trigram_unavailable(self, exc: sqlite3.OperationalError) -> None: + """Log once that the trigram tokenizer is missing; base FTS5 stays enabled.""" + if getattr(self, "_trigram_unavailable_warned", False): + return + self._trigram_unavailable_warned = True + logger.info( + "SQLite trigram tokenizer unavailable for %s " + "(requires SQLite >= 3.34, this build is %s); " + "CJK/substring search will fall back to LIKE: %s", + self.db_path, + sqlite3.sqlite_version, + exc, + ) def _warn_fts5_unavailable(self, exc: sqlite3.OperationalError) -> None: self._fts_enabled = False @@ -856,7 +872,10 @@ class SessionDB: except sqlite3.OperationalError as exc: if self._is_fts5_unavailable_error(exc): # Only disable FTS entirely when the whole module is missing. - if not self._is_tokenizer_unavailable_error(exc): + # A missing trigram tokenizer only affects trigram searches. + if self._is_trigram_unavailable_error(exc): + self._warn_trigram_unavailable(exc) + else: self._warn_fts5_unavailable(exc) return None if "no such table" in str(exc).lower(): @@ -884,7 +903,9 @@ class SessionDB: # Only disable FTS entirely when the whole FTS5 module is missing. # A missing specific tokenizer (e.g. trigram) means only that # particular table cannot be created — the base FTS5 table is fine. - if not self._is_tokenizer_unavailable_error(exc): + if self._is_trigram_unavailable_error(exc): + self._warn_trigram_unavailable(exc) + else: self._warn_fts5_unavailable(exc) return False @@ -1183,22 +1204,23 @@ class SessionDB: except sqlite3.OperationalError as exc: if not self._is_fts5_unavailable_error(exc): raise - if not self._is_tokenizer_unavailable_error(exc): + if self._is_trigram_unavailable_error(exc): + self._warn_trigram_unavailable(exc) + else: self._warn_fts5_unavailable(exc) - fts5_available = False - fts_migrations_complete = False + fts5_available = False + fts_migrations_complete = False break if fts5_available: # Recreate virtual tables + triggers with the new inline-mode # schema that indexes content || tool_name || tool_calls. - if ( - self._ensure_fts_schema(cursor, "messages_fts", FTS_SQL) - and self._ensure_fts_schema( - cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL - ) - ): - # Backfill both indexes from every existing messages row. + # Handle base and trigram independently — a missing + # trigram tokenizer should not prevent base FTS backfill. + base_fts_ok = self._ensure_fts_schema( + cursor, "messages_fts", FTS_SQL + ) + if base_fts_ok: cursor.execute( "INSERT INTO messages_fts(rowid, content) " "SELECT id, " @@ -1207,6 +1229,10 @@ class SessionDB: "COALESCE(tool_calls, '') " "FROM messages" ) + trigram_ok = self._ensure_fts_schema( + cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL + ) + if trigram_ok: cursor.execute( "INSERT INTO messages_fts_trigram(rowid, content) " "SELECT id, " @@ -1215,8 +1241,12 @@ class SessionDB: "COALESCE(tool_calls, '') " "FROM messages" ) - else: + if not base_fts_ok: fts_migrations_complete = False + # Track trigram availability for CJK LIKE fallback. + self._trigram_available = trigram_ok + else: + fts_migrations_complete = False else: fts_migrations_complete = False if current_version < 12: @@ -1286,6 +1316,7 @@ class SessionDB: trigram_enabled = self._ensure_fts_schema( cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL ) + self._trigram_available = trigram_enabled if trigram_enabled and triggers_need_repair: self._rebuild_fts_indexes(cursor) @@ -3422,7 +3453,8 @@ class SessionDB: self._count_cjk(t) < 3 for t in _tokens_for_check ) - if cjk_count >= 3 and not _any_short_cjk: + _trigram_succeeded = False + if cjk_count >= 3 and not _any_short_cjk and self._trigram_available: # Trigram FTS5 path — quote each non-operator token to handle # FTS5 special chars (%, *, etc.) while preserving boolean # operators (AND, OR, NOT) for multi-term queries. @@ -3471,11 +3503,13 @@ class SessionDB: try: tri_cursor = self._conn.execute(tri_sql, tri_params) except sqlite3.OperationalError: - matches = [] + # Trigram query failed at runtime — fall through to LIKE. + pass else: matches = [dict(row) for row in tri_cursor.fetchall()] - else: - # Short / mixed CJK query: trigram cannot match tokens with + _trigram_succeeded = True + if not _trigram_succeeded: + # Short / mixed CJK query, trigram unavailable, or trigram # <3 CJK chars. Fall back to LIKE substring search. # For multi-token OR queries (e.g. "广西 OR 桂林 OR 漓江"), # build one LIKE condition per non-operator token so each term diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py index 4bdc12d4642..0baf3226401 100644 --- a/tests/test_hermes_state.py +++ b/tests/test_hermes_state.py @@ -345,15 +345,28 @@ class TestSessionLifecycle: restored.close() def test_is_fts5_unavailable_error_catches_trigram_tokenizer(self): - """Unit test: _is_fts5_unavailable_error matches 'no such tokenizer'.""" + """Unit test: _is_fts5_unavailable_error matches 'no such tokenizer: trigram'.""" fts5_err = sqlite3.OperationalError("no such module: fts5") trigram_err = sqlite3.OperationalError("no such tokenizer: trigram") + generic_tokenizer_err = sqlite3.OperationalError("no such tokenizer: foo") unrelated_err = sqlite3.OperationalError("no such table: foo") assert SessionDB._is_fts5_unavailable_error(fts5_err) is True assert SessionDB._is_fts5_unavailable_error(trigram_err) is True + # Generic tokenizer errors should NOT match — only trigram. + assert SessionDB._is_fts5_unavailable_error(generic_tokenizer_err) is False assert SessionDB._is_fts5_unavailable_error(unrelated_err) is False + def test_is_trigram_unavailable_error(self): + """Unit test: _is_trigram_unavailable_error is scoped to trigram.""" + trigram_err = sqlite3.OperationalError("no such tokenizer: trigram") + generic_err = sqlite3.OperationalError("no such tokenizer: foo") + fts5_err = sqlite3.OperationalError("no such module: fts5") + + assert SessionDB._is_trigram_unavailable_error(trigram_err) is True + assert SessionDB._is_trigram_unavailable_error(generic_err) is False + assert SessionDB._is_trigram_unavailable_error(fts5_err) is False + def test_db_initializes_without_trigram_tokenizer(self, tmp_path, monkeypatch): """SessionDB must not crash when FTS5 exists but trigram tokenizer is missing.""" real_connect = sqlite3.connect @@ -384,6 +397,75 @@ class TestSessionLifecycle: finally: db.close() + def test_v11_migration_backfills_base_fts_when_trigram_unavailable( + self, tmp_path, monkeypatch + ): + """Regression: v11 migration must backfill base FTS even when trigram is unavailable.""" + real_connect = sqlite3.connect + db_path = tmp_path / "state.db" + + # Phase 1: create a DB at schema v10 with messages. + db = SessionDB(db_path=db_path) + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="legacy message alpha") + db.append_message("s1", role="assistant", content="legacy reply beta") + # Force schema version to v10 so migration runs on next open. + db._conn.execute( + "UPDATE schema_version SET version = 10" + ) + db._conn.commit() + db.close() + + # Phase 2: reopen with trigram disabled — migration should still + # backfill base FTS and make existing messages searchable. + def connect_without_trigram(*args, **kwargs): + kwargs["factory"] = _NoTrigramConnection + return real_connect(*args, **kwargs) + + monkeypatch.setattr("hermes_state.sqlite3.connect", connect_without_trigram) + migrated_db = SessionDB(db_path=db_path) + try: + assert migrated_db._fts_enabled is True + assert migrated_db._trigram_available is False + assert migrated_db._fts_table_exists("messages_fts") is True + assert migrated_db._fts_table_exists("messages_fts_trigram") is False + + # Existing messages must be searchable via base FTS. + results = migrated_db.search_messages("legacy message") + assert len(results) == 1 + # snippet has FTS5 highlight markers (>>>...<<<); check raw content via get_messages + msgs = migrated_db.get_messages("s1") + assert any("legacy message" in m["content"] for m in msgs) + finally: + migrated_db.close() + + def test_cjk_search_falls_back_to_like_when_trigram_unavailable( + self, tmp_path, monkeypatch + ): + """Regression: long CJK queries must fall back to LIKE when trigram is missing.""" + real_connect = sqlite3.connect + db_path = tmp_path / "state.db" + + def connect_without_trigram(*args, **kwargs): + kwargs["factory"] = _NoTrigramConnection + return real_connect(*args, **kwargs) + + monkeypatch.setattr("hermes_state.sqlite3.connect", connect_without_trigram) + db = SessionDB(db_path=db_path) + try: + db.create_session(session_id="s1", source="cli") + db.append_message("s1", role="user", content="大别山项目计划书") + db.append_message("s1", role="user", content="长江大桥设计方案") + + # 3+ CJK chars would normally use trigram, but it's unavailable. + # Must fall back to LIKE and still return results. + results = db.search_messages("大别山") + assert len(results) == 1 + # Note: search_messages strips 'content' from results; use 'snippet'. + assert "大别山" in results[0]["snippet"] + finally: + db.close() + # ========================================================================= # Message storage