mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-20 10:11:58 +00:00
fix(agent): address review feedback on trigram tokenizer fallback
- Scope 'no such tokenizer' matcher to trigram specifically (#779) - Decouple base FTS and trigram backfill in v11 migration (#1195) - CJK search falls back to LIKE when trigram unavailable (#3384/#3430) - Add _trigram_available tracking across init, migration, and startup - Add regression tests for migration backfill and CJK LIKE fallback - Add _is_trigram_unavailable_error and _warn_trigram_unavailable helpers
This commit is contained in:
parent
0403f41f9c
commit
c10aa5dc9c
2 changed files with 138 additions and 22 deletions
|
|
@ -684,6 +684,7 @@ class SessionDB:
|
|||
self._lock = threading.Lock()
|
||||
self._write_count = 0
|
||||
self._fts_enabled = False
|
||||
self._trigram_available = False
|
||||
self._fts_unavailable_warned = False
|
||||
self._conn = None
|
||||
try:
|
||||
|
|
@ -776,14 +777,29 @@ class SessionDB:
|
|||
return True
|
||||
# SQLite builds that have FTS5 but lack the optional trigram tokenizer
|
||||
# raise "no such tokenizer: trigram" instead of "no such module".
|
||||
if "no such tokenizer" in err:
|
||||
# Scope to trigram specifically to avoid masking unrelated tokenizer errors.
|
||||
if "no such tokenizer: trigram" in err:
|
||||
return True
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _is_tokenizer_unavailable_error(exc: sqlite3.OperationalError) -> bool:
|
||||
"""Check if the error is about a specific tokenizer (not the whole FTS5 module)."""
|
||||
return "no such tokenizer" in str(exc).lower()
|
||||
def _is_trigram_unavailable_error(exc: sqlite3.OperationalError) -> bool:
|
||||
"""True when only the trigram tokenizer is missing (FTS5 itself works)."""
|
||||
return "no such tokenizer: trigram" in str(exc).lower()
|
||||
|
||||
def _warn_trigram_unavailable(self, exc: sqlite3.OperationalError) -> None:
|
||||
"""Log once that the trigram tokenizer is missing; base FTS5 stays enabled."""
|
||||
if getattr(self, "_trigram_unavailable_warned", False):
|
||||
return
|
||||
self._trigram_unavailable_warned = True
|
||||
logger.info(
|
||||
"SQLite trigram tokenizer unavailable for %s "
|
||||
"(requires SQLite >= 3.34, this build is %s); "
|
||||
"CJK/substring search will fall back to LIKE: %s",
|
||||
self.db_path,
|
||||
sqlite3.sqlite_version,
|
||||
exc,
|
||||
)
|
||||
|
||||
def _warn_fts5_unavailable(self, exc: sqlite3.OperationalError) -> None:
|
||||
self._fts_enabled = False
|
||||
|
|
@ -856,7 +872,10 @@ class SessionDB:
|
|||
except sqlite3.OperationalError as exc:
|
||||
if self._is_fts5_unavailable_error(exc):
|
||||
# Only disable FTS entirely when the whole module is missing.
|
||||
if not self._is_tokenizer_unavailable_error(exc):
|
||||
# A missing trigram tokenizer only affects trigram searches.
|
||||
if self._is_trigram_unavailable_error(exc):
|
||||
self._warn_trigram_unavailable(exc)
|
||||
else:
|
||||
self._warn_fts5_unavailable(exc)
|
||||
return None
|
||||
if "no such table" in str(exc).lower():
|
||||
|
|
@ -884,7 +903,9 @@ class SessionDB:
|
|||
# Only disable FTS entirely when the whole FTS5 module is missing.
|
||||
# A missing specific tokenizer (e.g. trigram) means only that
|
||||
# particular table cannot be created — the base FTS5 table is fine.
|
||||
if not self._is_tokenizer_unavailable_error(exc):
|
||||
if self._is_trigram_unavailable_error(exc):
|
||||
self._warn_trigram_unavailable(exc)
|
||||
else:
|
||||
self._warn_fts5_unavailable(exc)
|
||||
return False
|
||||
|
||||
|
|
@ -1183,22 +1204,23 @@ class SessionDB:
|
|||
except sqlite3.OperationalError as exc:
|
||||
if not self._is_fts5_unavailable_error(exc):
|
||||
raise
|
||||
if not self._is_tokenizer_unavailable_error(exc):
|
||||
if self._is_trigram_unavailable_error(exc):
|
||||
self._warn_trigram_unavailable(exc)
|
||||
else:
|
||||
self._warn_fts5_unavailable(exc)
|
||||
fts5_available = False
|
||||
fts_migrations_complete = False
|
||||
fts5_available = False
|
||||
fts_migrations_complete = False
|
||||
break
|
||||
|
||||
if fts5_available:
|
||||
# Recreate virtual tables + triggers with the new inline-mode
|
||||
# schema that indexes content || tool_name || tool_calls.
|
||||
if (
|
||||
self._ensure_fts_schema(cursor, "messages_fts", FTS_SQL)
|
||||
and self._ensure_fts_schema(
|
||||
cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL
|
||||
)
|
||||
):
|
||||
# Backfill both indexes from every existing messages row.
|
||||
# Handle base and trigram independently — a missing
|
||||
# trigram tokenizer should not prevent base FTS backfill.
|
||||
base_fts_ok = self._ensure_fts_schema(
|
||||
cursor, "messages_fts", FTS_SQL
|
||||
)
|
||||
if base_fts_ok:
|
||||
cursor.execute(
|
||||
"INSERT INTO messages_fts(rowid, content) "
|
||||
"SELECT id, "
|
||||
|
|
@ -1207,6 +1229,10 @@ class SessionDB:
|
|||
"COALESCE(tool_calls, '') "
|
||||
"FROM messages"
|
||||
)
|
||||
trigram_ok = self._ensure_fts_schema(
|
||||
cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL
|
||||
)
|
||||
if trigram_ok:
|
||||
cursor.execute(
|
||||
"INSERT INTO messages_fts_trigram(rowid, content) "
|
||||
"SELECT id, "
|
||||
|
|
@ -1215,8 +1241,12 @@ class SessionDB:
|
|||
"COALESCE(tool_calls, '') "
|
||||
"FROM messages"
|
||||
)
|
||||
else:
|
||||
if not base_fts_ok:
|
||||
fts_migrations_complete = False
|
||||
# Track trigram availability for CJK LIKE fallback.
|
||||
self._trigram_available = trigram_ok
|
||||
else:
|
||||
fts_migrations_complete = False
|
||||
else:
|
||||
fts_migrations_complete = False
|
||||
if current_version < 12:
|
||||
|
|
@ -1286,6 +1316,7 @@ class SessionDB:
|
|||
trigram_enabled = self._ensure_fts_schema(
|
||||
cursor, "messages_fts_trigram", FTS_TRIGRAM_SQL
|
||||
)
|
||||
self._trigram_available = trigram_enabled
|
||||
if trigram_enabled and triggers_need_repair:
|
||||
self._rebuild_fts_indexes(cursor)
|
||||
|
||||
|
|
@ -3422,7 +3453,8 @@ class SessionDB:
|
|||
self._count_cjk(t) < 3 for t in _tokens_for_check
|
||||
)
|
||||
|
||||
if cjk_count >= 3 and not _any_short_cjk:
|
||||
_trigram_succeeded = False
|
||||
if cjk_count >= 3 and not _any_short_cjk and self._trigram_available:
|
||||
# Trigram FTS5 path — quote each non-operator token to handle
|
||||
# FTS5 special chars (%, *, etc.) while preserving boolean
|
||||
# operators (AND, OR, NOT) for multi-term queries.
|
||||
|
|
@ -3471,11 +3503,13 @@ class SessionDB:
|
|||
try:
|
||||
tri_cursor = self._conn.execute(tri_sql, tri_params)
|
||||
except sqlite3.OperationalError:
|
||||
matches = []
|
||||
# Trigram query failed at runtime — fall through to LIKE.
|
||||
pass
|
||||
else:
|
||||
matches = [dict(row) for row in tri_cursor.fetchall()]
|
||||
else:
|
||||
# Short / mixed CJK query: trigram cannot match tokens with
|
||||
_trigram_succeeded = True
|
||||
if not _trigram_succeeded:
|
||||
# Short / mixed CJK query, trigram unavailable, or trigram
|
||||
# <3 CJK chars. Fall back to LIKE substring search.
|
||||
# For multi-token OR queries (e.g. "广西 OR 桂林 OR 漓江"),
|
||||
# build one LIKE condition per non-operator token so each term
|
||||
|
|
|
|||
|
|
@ -345,15 +345,28 @@ class TestSessionLifecycle:
|
|||
restored.close()
|
||||
|
||||
def test_is_fts5_unavailable_error_catches_trigram_tokenizer(self):
|
||||
"""Unit test: _is_fts5_unavailable_error matches 'no such tokenizer'."""
|
||||
"""Unit test: _is_fts5_unavailable_error matches 'no such tokenizer: trigram'."""
|
||||
fts5_err = sqlite3.OperationalError("no such module: fts5")
|
||||
trigram_err = sqlite3.OperationalError("no such tokenizer: trigram")
|
||||
generic_tokenizer_err = sqlite3.OperationalError("no such tokenizer: foo")
|
||||
unrelated_err = sqlite3.OperationalError("no such table: foo")
|
||||
|
||||
assert SessionDB._is_fts5_unavailable_error(fts5_err) is True
|
||||
assert SessionDB._is_fts5_unavailable_error(trigram_err) is True
|
||||
# Generic tokenizer errors should NOT match — only trigram.
|
||||
assert SessionDB._is_fts5_unavailable_error(generic_tokenizer_err) is False
|
||||
assert SessionDB._is_fts5_unavailable_error(unrelated_err) is False
|
||||
|
||||
def test_is_trigram_unavailable_error(self):
|
||||
"""Unit test: _is_trigram_unavailable_error is scoped to trigram."""
|
||||
trigram_err = sqlite3.OperationalError("no such tokenizer: trigram")
|
||||
generic_err = sqlite3.OperationalError("no such tokenizer: foo")
|
||||
fts5_err = sqlite3.OperationalError("no such module: fts5")
|
||||
|
||||
assert SessionDB._is_trigram_unavailable_error(trigram_err) is True
|
||||
assert SessionDB._is_trigram_unavailable_error(generic_err) is False
|
||||
assert SessionDB._is_trigram_unavailable_error(fts5_err) is False
|
||||
|
||||
def test_db_initializes_without_trigram_tokenizer(self, tmp_path, monkeypatch):
|
||||
"""SessionDB must not crash when FTS5 exists but trigram tokenizer is missing."""
|
||||
real_connect = sqlite3.connect
|
||||
|
|
@ -384,6 +397,75 @@ class TestSessionLifecycle:
|
|||
finally:
|
||||
db.close()
|
||||
|
||||
def test_v11_migration_backfills_base_fts_when_trigram_unavailable(
|
||||
self, tmp_path, monkeypatch
|
||||
):
|
||||
"""Regression: v11 migration must backfill base FTS even when trigram is unavailable."""
|
||||
real_connect = sqlite3.connect
|
||||
db_path = tmp_path / "state.db"
|
||||
|
||||
# Phase 1: create a DB at schema v10 with messages.
|
||||
db = SessionDB(db_path=db_path)
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message("s1", role="user", content="legacy message alpha")
|
||||
db.append_message("s1", role="assistant", content="legacy reply beta")
|
||||
# Force schema version to v10 so migration runs on next open.
|
||||
db._conn.execute(
|
||||
"UPDATE schema_version SET version = 10"
|
||||
)
|
||||
db._conn.commit()
|
||||
db.close()
|
||||
|
||||
# Phase 2: reopen with trigram disabled — migration should still
|
||||
# backfill base FTS and make existing messages searchable.
|
||||
def connect_without_trigram(*args, **kwargs):
|
||||
kwargs["factory"] = _NoTrigramConnection
|
||||
return real_connect(*args, **kwargs)
|
||||
|
||||
monkeypatch.setattr("hermes_state.sqlite3.connect", connect_without_trigram)
|
||||
migrated_db = SessionDB(db_path=db_path)
|
||||
try:
|
||||
assert migrated_db._fts_enabled is True
|
||||
assert migrated_db._trigram_available is False
|
||||
assert migrated_db._fts_table_exists("messages_fts") is True
|
||||
assert migrated_db._fts_table_exists("messages_fts_trigram") is False
|
||||
|
||||
# Existing messages must be searchable via base FTS.
|
||||
results = migrated_db.search_messages("legacy message")
|
||||
assert len(results) == 1
|
||||
# snippet has FTS5 highlight markers (>>>...<<<); check raw content via get_messages
|
||||
msgs = migrated_db.get_messages("s1")
|
||||
assert any("legacy message" in m["content"] for m in msgs)
|
||||
finally:
|
||||
migrated_db.close()
|
||||
|
||||
def test_cjk_search_falls_back_to_like_when_trigram_unavailable(
|
||||
self, tmp_path, monkeypatch
|
||||
):
|
||||
"""Regression: long CJK queries must fall back to LIKE when trigram is missing."""
|
||||
real_connect = sqlite3.connect
|
||||
db_path = tmp_path / "state.db"
|
||||
|
||||
def connect_without_trigram(*args, **kwargs):
|
||||
kwargs["factory"] = _NoTrigramConnection
|
||||
return real_connect(*args, **kwargs)
|
||||
|
||||
monkeypatch.setattr("hermes_state.sqlite3.connect", connect_without_trigram)
|
||||
db = SessionDB(db_path=db_path)
|
||||
try:
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message("s1", role="user", content="大别山项目计划书")
|
||||
db.append_message("s1", role="user", content="长江大桥设计方案")
|
||||
|
||||
# 3+ CJK chars would normally use trigram, but it's unavailable.
|
||||
# Must fall back to LIKE and still return results.
|
||||
results = db.search_messages("大别山")
|
||||
assert len(results) == 1
|
||||
# Note: search_messages strips 'content' from results; use 'snippet'.
|
||||
assert "大别山" in results[0]["snippet"]
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Message storage
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue