mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-14 09:11:54 +00:00
* fix(state.db): recover from malformed sqlite_master so hidden sessions reappear
The corruption class behind "Desktop/Dashboard show no sessions while
hundreds of session files sit on disk" is a malformed sqlite_master — most
often a duplicate object row, e.g. two CREATE VIRTUAL TABLE messages_fts
entries — surfacing as:
sqlite3.DatabaseError: malformed database schema (messages_fts) -
table messages_fts already exists
SQLite parses the whole schema while preparing the FIRST statement on a
connection, so on this class every statement fails before it runs: PRAGMA
journal_mode (which is where SessionDB.__init__ actually trips, in
apply_wal_with_fallback, BEFORE _init_schema), PRAGMA integrity_check, and
even DROP TABLE. The only operations that still work are
PRAGMA writable_schema=ON plus direct sqlite_master surgery. A plain
FTS-index rebuild at the _init_schema layer therefore cannot reach or fix
this; the canonical sessions/messages rows are intact — only the derived
schema is broken.
Add a dedicated recovery that operates where the failure actually happens:
- hermes_state.repair_state_db_schema(): backs up the raw file first, then a
least-destructive ladder — (1) de-duplicate sqlite_master keeping the
lowest rowid per object (preserves the existing FTS index), escalating to
(2) drop every messages_fts* schema object + VACUUM and let the next open
rebuild the FTS index from messages. sessions/messages are never modified.
Plus is_malformed_db_error() to discriminate this class.
- SessionDB.__init__ auto-heals: on a malformed-schema open error it repairs
once (process-guarded against loops / concurrent web_server opens) and
reopens, so Desktop/Dashboard recover on their own instead of silently
showing "no sessions".
- hermes doctor --fix detects the malformed class and repairs it (reporting
the recovered session count + backup name).
- hermes sessions repair [--check-only] [--no-backup] runs on the raw file
path, since SessionDB() itself cannot open a malformed DB.
Supersedes #32589 and #33869: both targeted FTS corruption but gated their
repair behind statements (integrity_check / SELECT / DROP TABLE) that
themselves fail on this class, and neither addressed the apply_wal_with_fallback
open-time failure. Credit preserved via Co-authored-by.
Closes #33865.
Co-authored-by: João Vitor Cunha <145560011+plcunha@users.noreply.github.com>
Co-authored-by: Tuna Dev <273476039+tuancookiez-hub@users.noreply.github.com>
* test(state.db): cover strat-B escalation + unrepairable safe-fail paths
---------
Co-authored-by: João Vitor Cunha <145560011+plcunha@users.noreply.github.com>
Co-authored-by: Tuna Dev <273476039+tuancookiez-hub@users.noreply.github.com>
244 lines
8.7 KiB
Python
244 lines
8.7 KiB
Python
"""Recovery from a malformed state.db schema (duplicate sqlite_master rows).
|
|
|
|
This is the corruption class behind the user-reported symptom where Desktop /
|
|
Dashboard show "no sessions yet" while hundreds of session JSON files sit on
|
|
disk, and the backend logs:
|
|
|
|
sqlite3.DatabaseError: malformed database schema (messages_fts) -
|
|
table messages_fts already exists
|
|
|
|
The error fires on the *first* statement of any connection (PRAGMA
|
|
journal_mode in apply_wal_with_fallback), before _init_schema runs — so it
|
|
cannot be handled at the FTS-rebuild layer. These tests verify the
|
|
sqlite_master surgery path recovers the canonical data and self-heals on open.
|
|
"""
|
|
import sqlite3
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
import hermes_state
|
|
from hermes_state import (
|
|
SessionDB,
|
|
is_malformed_db_error,
|
|
repair_state_db_schema,
|
|
)
|
|
|
|
|
|
def _build_healthy_db(db_path: Path) -> str:
    """Create a healthy state.db at *db_path* and return its session id.

    Opens a ``SessionDB``, creates one ``cli`` session and appends five
    user/assistant message pairs (ten messages in total; every assistant
    reply contains the word "pizza"), so callers can assert on exact counts.

    The handle is closed in a ``finally`` so a failure while seeding never
    leaves the database file open.
    """
    db = SessionDB(db_path=db_path)
    try:
        sid = db.create_session(session_id=str(uuid.uuid4()), source="cli")
        for i in range(5):
            db.append_message(sid, role="user", content=f"hello world {i}")
            db.append_message(sid, role="assistant", content=f"reply about pizza {i}")
    finally:
        db.close()
    return sid
|
|
|
|
|
|
def _corrupt_duplicate_fts(db_path: Path) -> None:
    """Inject a duplicate messages_fts row into sqlite_master.

    Reproduces 'malformed database schema (messages_fts) - table
    messages_fts already exists'.

    The row is copied verbatim from the existing ``messages_fts`` entry with
    ``PRAGMA writable_schema=ON``.  The connection is closed in a ``finally``
    so a failure while corrupting never leaves the file open.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        # Direct writes to sqlite_master are only accepted in writable_schema mode.
        conn.execute("PRAGMA writable_schema=ON")
        conn.execute(
            "INSERT INTO sqlite_master (type, name, tbl_name, rootpage, sql) "
            "SELECT type, name, tbl_name, rootpage, sql FROM sqlite_master "
            "WHERE name='messages_fts'"
        )
        conn.commit()
    finally:
        conn.close()
|
|
|
|
|
|
def test_duplicate_fts_makes_every_statement_fail(tmp_path):
    """Document the failure: not even PRAGMA journal_mode survives."""
    db_path = tmp_path / "state.db"
    _build_healthy_db(db_path)
    _corrupt_duplicate_fts(db_path)

    conn = sqlite3.connect(str(db_path))
    try:
        with pytest.raises(sqlite3.DatabaseError) as exc_info:
            conn.execute("PRAGMA journal_mode").fetchone()
    finally:
        # Close even when the expected error is NOT raised, so a failing
        # test does not leave the database file open.
        conn.close()
    assert is_malformed_db_error(exc_info.value)
|
|
|
|
|
|
def test_repair_preserves_sessions_and_messages(tmp_path):
    """Repairing a duplicated-FTS schema must keep every session and message."""
    db_path = tmp_path / "state.db"
    _build_healthy_db(db_path)
    _corrupt_duplicate_fts(db_path)

    report = repair_state_db_schema(db_path)
    assert report["repaired"] is True
    assert report["strategy"] in {"dedup_schema", "drop_fts_rebuild"}
    # A backup of the malformed file is preserved.
    assert report["backup_path"] and Path(report["backup_path"]).exists()

    conn = sqlite3.connect(str(db_path))
    try:
        assert conn.execute("PRAGMA integrity_check").fetchone()[0] == "ok"
        assert conn.execute("SELECT COUNT(*) FROM sessions").fetchone()[0] == 1
        assert conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0] == 10
    finally:
        # Release the file even when one of the assertions above fails.
        conn.close()
|
|
|
|
|
|
def test_repaired_db_search_works(tmp_path):
    """After a repair, full-text search over the messages must work again."""
    state_file = tmp_path / "state.db"
    _build_healthy_db(state_file)
    _corrupt_duplicate_fts(state_file)
    repair_state_db_schema(state_file)

    match_sql = "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'pizza'"
    total_sql = "SELECT COUNT(*) FROM messages_fts"

    # Reopen and confirm the FTS index is usable (rebuilt or preserved).
    session_db = SessionDB(db_path=state_file)
    try:
        pizza_row = session_db._conn.execute(match_sql).fetchone()
        assert pizza_row[0] == 5
        total_row = session_db._conn.execute(total_sql).fetchone()
        assert total_row[0] == 10
    finally:
        session_db.close()
|
|
|
|
|
|
def test_sessiondb_auto_heals_on_open(tmp_path, monkeypatch):
    """Opening a malformed state.db through SessionDB must succeed and still
    expose the session that was stored before the corruption."""
    state_file = tmp_path / "state.db"
    session_id = _build_healthy_db(state_file)
    _corrupt_duplicate_fts(state_file)

    # Fresh process-global guard so the attempt isn't pre-claimed.
    monkeypatch.setattr(hermes_state, "_repair_attempted_paths", set())

    healed = SessionDB(db_path=state_file)
    try:
        count_row = healed._conn.execute("SELECT COUNT(*) FROM sessions").fetchone()
        assert count_row[0] == 1
        lookup = healed._conn.execute(
            "SELECT id FROM sessions WHERE id=?", (session_id,)
        )
        assert lookup.fetchone() is not None
    finally:
        healed.close()
|
|
|
|
|
|
def test_auto_heal_attempted_once_per_process(tmp_path, monkeypatch):
    """A still-broken DB must not loop: the second open just raises."""
    db_path = tmp_path / "state.db"
    _build_healthy_db(db_path)
    _corrupt_duplicate_fts(db_path)
    # Fresh process-global guard so the first open is allowed to attempt repair.
    monkeypatch.setattr(hermes_state, "_repair_attempted_paths", set())

    calls = {"n": 0}

    def fake_repair(path, **kw):
        calls["n"] += 1
        # Pretend repair failed so the guard's one-shot behavior is exercised.
        return {"repaired": False, "strategy": None, "backup_path": None, "error": "x"}

    monkeypatch.setattr(hermes_state, "repair_state_db_schema", fake_repair)

    with pytest.raises(sqlite3.DatabaseError):
        SessionDB(db_path=db_path)
    with pytest.raises(sqlite3.DatabaseError):
        SessionDB(db_path=db_path)
    assert calls["n"] == 1  # repair attempted only once across both opens

    # No manual restore of repair_state_db_schema is needed: the monkeypatch
    # fixture reverts every setattr at test teardown.
|
|
|
|
|
|
def test_is_malformed_db_error_discriminates():
    """is_malformed_db_error accepts the malformed-schema / malformed-image
    errors and rejects unrelated exceptions."""
    malformed_errors = (
        sqlite3.DatabaseError("malformed database schema (messages_fts) - ..."),
        sqlite3.DatabaseError("database disk image is malformed"),
    )
    unrelated_errors = (
        sqlite3.OperationalError("database is locked"),
        ValueError("nope"),
    )

    for exc in malformed_errors:
        assert is_malformed_db_error(exc)
    for exc in unrelated_errors:
        assert not is_malformed_db_error(exc)
|
|
|
|
|
|
def test_strategy_b_rebuild_when_dedup_insufficient(tmp_path, monkeypatch):
    """If the dedup pass can't fix it, the drop-FTS + rebuild pass must.

    The first post-repair verification is forced to report failure, so the
    routine escalates past strat 1 and runs the strat-2 SQL against a real
    malformed file. Data must still survive and search must work.
    """
    db_path = tmp_path / "state.db"
    _build_healthy_db(db_path)
    _corrupt_duplicate_fts(db_path)

    # Make the post-strat-1 verification report "still broken" exactly once,
    # so the routine escalates to strat 2 (drop FTS + VACUUM) and runs its
    # real SQL against the file; the strat-2 verification then uses the real
    # check and passes.
    real_check = hermes_state._db_opens_cleanly
    calls = {"n": 0}

    def flaky_check(path):
        calls["n"] += 1
        if calls["n"] == 1:
            return "pretend strat 1 was insufficient"
        return real_check(path)

    # Scope the patch with monkeypatch.context() instead of calling
    # monkeypatch.undo(): undo() reverts *every* patch made through the
    # shared fixture, whereas the context only reverts this one, and it does
    # so even if the repair call raises.
    with monkeypatch.context() as scoped:
        scoped.setattr(hermes_state, "_db_opens_cleanly", flaky_check)
        report = repair_state_db_schema(db_path)

    assert report["repaired"] is True
    assert report["strategy"] == "drop_fts_rebuild"
    assert calls["n"] >= 2

    db = SessionDB(db_path=db_path)
    try:
        assert db._conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0] == 10
        assert db._conn.execute(
            "SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'pizza'"
        ).fetchone()[0] == 5
    finally:
        db.close()
|
|
|
|
|
|
def test_unrepairable_file_fails_safely(tmp_path, monkeypatch):
    """A file damaged beyond recovery must yield a failure report with a
    backup, and the repair routine itself must return rather than raise."""
    wrecked_file = tmp_path / "state.db"
    junk_payload = b"SQLite format 3\x00" + b"\x00\xde\xad\xbe\xef" * 200
    wrecked_file.write_bytes(junk_payload)

    outcome = repair_state_db_schema(wrecked_file)

    assert outcome["repaired"] is False
    assert outcome["error"]
    # The (damaged) original bytes are preserved for manual restore.
    backup_location = outcome["backup_path"]
    assert backup_location and Path(backup_location).exists()
|
|
|
|
|
|
def test_non_malformed_error_is_not_auto_repaired(tmp_path, monkeypatch):
    """Only the malformed-schema class may trigger auto-heal; other open
    failures such as 'file is not a database' must raise unchanged."""
    bogus_file = tmp_path / "state.db"
    bogus_file.write_bytes(b"this is definitely not a sqlite database")
    monkeypatch.setattr(hermes_state, "_repair_attempted_paths", set())

    genuine_repair = hermes_state.repair_state_db_schema
    repair_calls = {"n": 0}

    def counting_repair(*args, **kwargs):
        # Count the call, then defer to the real routine.
        repair_calls["n"] += 1
        return genuine_repair(*args, **kwargs)

    monkeypatch.setattr(hermes_state, "repair_state_db_schema", counting_repair)

    with pytest.raises(sqlite3.DatabaseError):
        SessionDB(db_path=bogus_file)

    # never attempted repair for a non-malformed error
    assert repair_calls["n"] == 0
|
|
|
|
|
|
def test_repair_on_clean_db_is_noop(tmp_path):
    """Dedup-keyed repair must not damage a healthy DB if invoked."""
    db_path = tmp_path / "state.db"
    _build_healthy_db(db_path)

    report = repair_state_db_schema(db_path, backup=False)
    assert report["repaired"] is True  # opens cleanly after a no-op dedup

    conn = sqlite3.connect(str(db_path))
    try:
        assert conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0] == 10
        assert conn.execute("PRAGMA integrity_check").fetchone()[0] == "ok"
    finally:
        # Release the file even when one of the assertions above fails.
        conn.close()
|