hermes-agent/tests/test_state_db_malformed_repair.py
brooklyn! 218452b050
fix(state.db): recover from malformed sqlite_master so hidden sessions reappear (#43149)
* fix(state.db): recover from malformed sqlite_master so hidden sessions reappear

The corruption class behind "Desktop/Dashboard show no sessions while
hundreds of session files sit on disk" is a malformed sqlite_master — most
often a duplicate object row, e.g. two CREATE VIRTUAL TABLE messages_fts
entries — surfacing as:

    sqlite3.DatabaseError: malformed database schema (messages_fts) -
    table messages_fts already exists

SQLite parses the whole schema while preparing the FIRST statement on a
connection, so on this class every statement fails before it runs: PRAGMA
journal_mode (which is where SessionDB.__init__ actually trips, in
apply_wal_with_fallback, BEFORE _init_schema), PRAGMA integrity_check, and
even DROP TABLE. The only operations that still work are
PRAGMA writable_schema=ON plus direct sqlite_master surgery. A plain
FTS-index rebuild at the _init_schema layer therefore cannot reach or fix
this; the canonical sessions/messages rows are intact — only the derived
schema is broken.

Add a dedicated recovery that operates where the failure actually happens:

- hermes_state.repair_state_db_schema(): backs up the raw file first, then a
  least-destructive ladder — (1) de-duplicate sqlite_master keeping the
  lowest rowid per object (preserves the existing FTS index), escalating to
  (2) drop every messages_fts* schema object + VACUUM and let the next open
  rebuild the FTS index from messages. sessions/messages are never modified.
  Plus is_malformed_db_error() to discriminate this class.
- SessionDB.__init__ auto-heals: on a malformed-schema open error it repairs
  once (process-guarded against loops / concurrent web_server opens) and
  reopens, so Desktop/Dashboard recover on their own instead of silently
  showing "no sessions".
- hermes doctor --fix detects the malformed class and repairs it (reporting
  the recovered session count + backup name).
- hermes sessions repair [--check-only] [--no-backup] runs on the raw file
  path, since SessionDB() itself cannot open a malformed DB.

Supersedes #32589 and #33869: both targeted FTS corruption but gated their
repair behind statements (integrity_check / SELECT / DROP TABLE) that
themselves fail on this class, and neither addressed the apply_wal_with_fallback
open-time failure. Credit preserved via Co-authored-by.

Closes #33865.

Co-authored-by: João Vitor Cunha <145560011+plcunha@users.noreply.github.com>
Co-authored-by: Tuna Dev <273476039+tuancookiez-hub@users.noreply.github.com>

* test(state.db): cover strat-B escalation + unrepairable safe-fail paths

---------

Co-authored-by: João Vitor Cunha <145560011+plcunha@users.noreply.github.com>
Co-authored-by: Tuna Dev <273476039+tuancookiez-hub@users.noreply.github.com>
2026-06-09 18:49:08 -05:00

244 lines
8.7 KiB
Python

"""Recovery from a malformed state.db schema (duplicate sqlite_master rows).
This is the corruption class behind the user-reported symptom where Desktop /
Dashboard show "no sessions yet" while hundreds of session JSON files sit on
disk, and the backend logs:
sqlite3.DatabaseError: malformed database schema (messages_fts) -
table messages_fts already exists
The error fires on the *first* statement of any connection (PRAGMA
journal_mode in apply_wal_with_fallback), before _init_schema runs — so it
cannot be handled at the FTS-rebuild layer. These tests verify the
sqlite_master surgery path recovers the canonical data and self-heals on open.
"""
import sqlite3
import uuid
from pathlib import Path
import pytest
import hermes_state
from hermes_state import (
SessionDB,
is_malformed_db_error,
repair_state_db_schema,
)
def _build_healthy_db(db_path: Path) -> str:
db = SessionDB(db_path=db_path)
sid = db.create_session(session_id=str(uuid.uuid4()), source="cli")
for i in range(5):
db.append_message(sid, role="user", content=f"hello world {i}")
db.append_message(sid, role="assistant", content=f"reply about pizza {i}")
db.close()
return sid
def _corrupt_duplicate_fts(db_path: Path) -> None:
"""Inject a duplicate messages_fts row into sqlite_master.
Reproduces 'malformed database schema (messages_fts) - table
messages_fts already exists'.
"""
conn = sqlite3.connect(str(db_path))
conn.execute("PRAGMA writable_schema=ON")
conn.execute(
"INSERT INTO sqlite_master (type, name, tbl_name, rootpage, sql) "
"SELECT type, name, tbl_name, rootpage, sql FROM sqlite_master "
"WHERE name='messages_fts'"
)
conn.commit()
conn.close()
def test_duplicate_fts_makes_every_statement_fail(tmp_path):
"""Document the failure: not even PRAGMA journal_mode survives."""
db_path = tmp_path / "state.db"
_build_healthy_db(db_path)
_corrupt_duplicate_fts(db_path)
conn = sqlite3.connect(str(db_path))
with pytest.raises(sqlite3.DatabaseError) as exc_info:
conn.execute("PRAGMA journal_mode").fetchone()
conn.close()
assert is_malformed_db_error(exc_info.value)
def test_repair_preserves_sessions_and_messages(tmp_path):
db_path = tmp_path / "state.db"
_build_healthy_db(db_path)
_corrupt_duplicate_fts(db_path)
report = repair_state_db_schema(db_path)
assert report["repaired"] is True
assert report["strategy"] in {"dedup_schema", "drop_fts_rebuild"}
# A backup of the malformed file is preserved.
assert report["backup_path"] and Path(report["backup_path"]).exists()
conn = sqlite3.connect(str(db_path))
assert conn.execute("PRAGMA integrity_check").fetchone()[0] == "ok"
assert conn.execute("SELECT COUNT(*) FROM sessions").fetchone()[0] == 1
assert conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0] == 10
conn.close()
def test_repaired_db_search_works(tmp_path):
db_path = tmp_path / "state.db"
_build_healthy_db(db_path)
_corrupt_duplicate_fts(db_path)
repair_state_db_schema(db_path)
# Reopen and confirm the FTS index is usable (rebuilt or preserved).
db = SessionDB(db_path=db_path)
try:
hits = db._conn.execute(
"SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'pizza'"
).fetchone()[0]
assert hits == 5
msg_count = db._conn.execute("SELECT COUNT(*) FROM messages_fts").fetchone()[0]
assert msg_count == 10
finally:
db.close()
def test_sessiondb_auto_heals_on_open(tmp_path, monkeypatch):
db_path = tmp_path / "state.db"
sid = _build_healthy_db(db_path)
_corrupt_duplicate_fts(db_path)
# Fresh process-global guard so the attempt isn't pre-claimed.
monkeypatch.setattr(hermes_state, "_repair_attempted_paths", set())
db = SessionDB(db_path=db_path)
try:
assert db._conn.execute("SELECT COUNT(*) FROM sessions").fetchone()[0] == 1
assert db._conn.execute(
"SELECT id FROM sessions WHERE id=?", (sid,)
).fetchone() is not None
finally:
db.close()
def test_auto_heal_attempted_once_per_process(tmp_path, monkeypatch):
"""A still-broken DB must not loop: the second open just raises."""
db_path = tmp_path / "state.db"
_build_healthy_db(db_path)
_corrupt_duplicate_fts(db_path)
monkeypatch.setattr(hermes_state, "_repair_attempted_paths", set())
calls = {"n": 0}
real_repair = hermes_state.repair_state_db_schema
def fake_repair(path, **kw):
calls["n"] += 1
# Pretend repair failed so the guard's one-shot behavior is exercised.
return {"repaired": False, "strategy": None, "backup_path": None, "error": "x"}
monkeypatch.setattr(hermes_state, "repair_state_db_schema", fake_repair)
with pytest.raises(sqlite3.DatabaseError):
SessionDB(db_path=db_path)
with pytest.raises(sqlite3.DatabaseError):
SessionDB(db_path=db_path)
assert calls["n"] == 1 # repair attempted only once across both opens
monkeypatch.setattr(hermes_state, "repair_state_db_schema", real_repair)
def test_is_malformed_db_error_discriminates():
assert is_malformed_db_error(
sqlite3.DatabaseError("malformed database schema (messages_fts) - ...")
)
assert is_malformed_db_error(sqlite3.DatabaseError("database disk image is malformed"))
assert not is_malformed_db_error(sqlite3.OperationalError("database is locked"))
assert not is_malformed_db_error(ValueError("nope"))
def test_strategy_b_rebuild_when_dedup_insufficient(tmp_path, monkeypatch):
"""If the dedup pass can't fix it, the drop-FTS + rebuild pass must.
Force strat 1 to be a no-op so the escalation path is exercised against a
real malformed file. Data must still survive and search must work.
"""
db_path = tmp_path / "state.db"
_build_healthy_db(db_path)
_corrupt_duplicate_fts(db_path)
# Make the post-strat-1 verification report "still broken" exactly once,
# so the routine escalates to strat 2 (drop FTS + VACUUM) and runs its
# real SQL against the file; the strat-2 verification then uses the real
# check and passes.
real_check = hermes_state._db_opens_cleanly
calls = {"n": 0}
def flaky_check(path):
calls["n"] += 1
if calls["n"] == 1:
return "pretend strat 1 was insufficient"
return real_check(path)
monkeypatch.setattr(hermes_state, "_db_opens_cleanly", flaky_check)
report = repair_state_db_schema(db_path)
monkeypatch.undo()
assert report["repaired"] is True
assert report["strategy"] == "drop_fts_rebuild"
assert calls["n"] >= 2
db = SessionDB(db_path=db_path)
try:
assert db._conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0] == 10
assert db._conn.execute(
"SELECT COUNT(*) FROM messages_fts WHERE messages_fts MATCH 'pizza'"
).fetchone()[0] == 5
finally:
db.close()
def test_unrepairable_file_fails_safely(tmp_path, monkeypatch):
"""A file too damaged to recover must report failure, keep a backup, and
never raise from the repair routine itself."""
db_path = tmp_path / "state.db"
db_path.write_bytes(b"SQLite format 3\x00" + b"\x00\xde\xad\xbe\xef" * 200)
report = repair_state_db_schema(db_path)
assert report["repaired"] is False
assert report["error"]
# The (damaged) original bytes are preserved for manual restore.
assert report["backup_path"] and Path(report["backup_path"]).exists()
def test_non_malformed_error_is_not_auto_repaired(tmp_path, monkeypatch):
"""Auto-heal must only trigger for the malformed-schema class, not for
e.g. 'file is not a database' — those raise unchanged."""
db_path = tmp_path / "state.db"
db_path.write_bytes(b"this is definitely not a sqlite database")
monkeypatch.setattr(hermes_state, "_repair_attempted_paths", set())
called = {"n": 0}
orig = hermes_state.repair_state_db_schema
def spy(*a, **kw):
called["n"] += 1
return orig(*a, **kw)
monkeypatch.setattr(hermes_state, "repair_state_db_schema", spy)
with pytest.raises(sqlite3.DatabaseError):
SessionDB(db_path=db_path)
assert called["n"] == 0 # never attempted repair for a non-malformed error
def test_repair_on_clean_db_is_noop(tmp_path):
"""Dedup-keyed repair must not damage a healthy DB if invoked."""
db_path = tmp_path / "state.db"
_build_healthy_db(db_path)
report = repair_state_db_schema(db_path, backup=False)
assert report["repaired"] is True # opens cleanly after a no-op dedup
conn = sqlite3.connect(str(db_path))
assert conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0] == 10
assert conn.execute("PRAGMA integrity_check").fetchone()[0] == "ok"
conn.close()