mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-07 08:02:23 +00:00
fix(kanban): add post-commit page_count invariant check to write_txn
Reads header bytes 28-31 after every COMMIT and compares the page count they encode against the actual file size. Raises sqlite3.DatabaseError on a torn extend (actual_pages < page_count). Also sets PRAGMA wal_autocheckpoint=100 in connect(). Refs: #31208 (Bug E - same file, coordinate), #30973 (wal_autocheckpoint), #30445, #30896, #30908 (corruption reports)
This commit is contained in:
parent
c002668ff0
commit
99c19eb2fe
2 changed files with 147 additions and 0 deletions
|
|
@ -1212,6 +1212,7 @@ def connect(
|
||||||
# FULL (was NORMAL): fsync before each checkpoint to narrow the
|
# FULL (was NORMAL): fsync before each checkpoint to narrow the
|
||||||
# crash window that can leave a b-tree page header torn.
|
# crash window that can leave a b-tree page header torn.
|
||||||
conn.execute("PRAGMA synchronous=FULL")
|
conn.execute("PRAGMA synchronous=FULL")
|
||||||
|
conn.execute("PRAGMA wal_autocheckpoint=100")
|
||||||
conn.execute("PRAGMA foreign_keys=ON")
|
conn.execute("PRAGMA foreign_keys=ON")
|
||||||
# Zero freed pages so a later torn write cannot expose stale
|
# Zero freed pages so a later torn write cannot expose stale
|
||||||
# cell content; persisted in the DB header for new DBs.
|
# cell content; persisted in the DB header for new DBs.
|
||||||
|
|
@ -1502,6 +1503,45 @@ def _migrate_add_optional_columns(conn: sqlite3.Connection) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_file_length_invariant(conn: sqlite3.Connection) -> None:
    """Verify the database file is at least as long as its header claims.

    Reads the 4-byte big-endian in-header database size (header offset 28)
    directly from the file behind ``conn`` and compares it with the number of
    whole pages actually present on disk.

    The check is skipped (returns ``None``) when:

    * ``PRAGMA database_list`` yields no row or an empty path (in-memory or
      unnamed database);
    * the file is too short to contain the page-count field;
    * the in-header page count is zero (new/empty database);
    * the header's change counter (offset 24) differs from its
      version-valid-for number (offset 92).  The SQLite file format only
      treats the in-header size as valid when those two fields are equal, so
      a stale count is ignored rather than reported as corruption.

    Raises:
        sqlite3.DatabaseError: the file holds fewer whole pages than the
            header claims (torn-extend), or one of the PRAGMA queries itself
            failed with a ``DatabaseError``.

    Any other exception raised while checking is swallowed on purpose: this
    runs after a successful COMMIT, and a failing diagnostic must not make
    the caller believe the transaction failed.
    """
    try:
        row = conn.execute("PRAGMA database_list").fetchone()
        if row is None:
            return
        path = row[2]  # column 2 is the file path; empty for in-memory DBs
        if not path:
            return  # in-memory or unnamed DB; skip

        page_size = conn.execute("PRAGMA page_size").fetchone()[0]
        file_size = os.path.getsize(path)
        with open(path, "rb") as f:
            # One read covers the change counter (24-27), the page count
            # (28-31) and the version-valid-for number (92-95).
            header = f.read(96)
        if len(header) < 32:
            return  # can't read the page-count field; skip

        header_page_count = int.from_bytes(header[28:32], "big")
        if header_page_count == 0:
            return  # new/empty DB; skip
        if len(header) >= 96 and header[24:28] != header[92:96]:
            # In-header size is stale per the SQLite file format; comparing
            # it against the file length would yield a false positive.
            return

        actual_pages = file_size // page_size
        if actual_pages < header_page_count:
            raise sqlite3.DatabaseError(
                f"torn-extend detected: page count mismatch on {path}: "
                f"header claims {header_page_count} pages, "
                f"file has {actual_pages} pages "
                f"(missing {header_page_count - actual_pages} pages, "
                f"file_size={file_size}, page_size={page_size})"
            )
    except sqlite3.DatabaseError:
        raise
    except Exception:
        pass  # I/O errors during check are non-fatal; let normal ops continue
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def write_txn(conn: sqlite3.Connection):
|
def write_txn(conn: sqlite3.Connection):
|
||||||
"""Context manager for an IMMEDIATE write transaction.
|
"""Context manager for an IMMEDIATE write transaction.
|
||||||
|
|
@ -1528,6 +1568,9 @@ def write_txn(conn: sqlite3.Connection):
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
conn.execute("COMMIT")
|
conn.execute("COMMIT")
|
||||||
|
# Post-commit file-length check: the file must hold at least as many pages as the header's page_count.
|
||||||
|
# A discrepancy means a torn-extend — raise now rather than silently corrupt.
|
||||||
|
_check_file_length_invariant(conn)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ import concurrent.futures
|
||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import time
|
import time
|
||||||
|
import unittest.mock
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -3548,3 +3549,106 @@ def test_write_txn_preserves_original_exception_when_rollback_fails(kanban_home)
|
||||||
f"write_txn surfaced the rollback failure instead of the original "
|
f"write_txn surfaced the rollback failure instead of the original "
|
||||||
f"OperationalError; got {msg!r}"
|
f"OperationalError; got {msg!r}"
|
||||||
)
|
)
|
||||||
|
def test_write_txn_healthy_commit_no_exception(tmp_path):
    """Normal commit does not trigger the torn-extend check.

    Inserts one row through ``write_txn`` and reads it back; the test fails
    if the context manager raises.
    """
    from hermes_cli.kanban_db import connect, write_txn

    conn = connect(db_path=tmp_path / "test.db")
    try:
        # Should not raise
        with write_txn(conn) as c:
            c.execute(
                "INSERT INTO tasks (id, title, assignee, status, priority, created_at) "
                "VALUES ('t_test01', 'test task', 'tester', 'todo', 0, 1234567890)"
            )
        row = conn.execute("SELECT title FROM tasks WHERE id='t_test01'").fetchone()
        assert row["title"] == "test task"
    finally:
        # Close even when an assertion fails so the temp DB is not left open.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_write_txn_raises_on_truncated_file(tmp_path):
    """A mocked smaller file size triggers the torn-extend check.

    ``os.path.getsize`` as seen by ``hermes_cli.kanban_db`` is patched to
    report one page less than the real size, so the post-commit check in
    ``write_txn`` is expected to raise ``sqlite3.DatabaseError``.
    """
    from hermes_cli.kanban_db import connect, write_txn

    conn = connect(db_path=tmp_path / "test.db")
    try:
        # Get actual page size so we can fake a smaller file
        page_size = conn.execute("PRAGMA page_size").fetchone()[0]
        original_getsize = os.path.getsize

        def fake_getsize(path):
            # Return a size that implies at least 1 fewer page than header claims
            return max(0, original_getsize(path) - page_size)

        with pytest.raises(sqlite3.DatabaseError, match="torn-extend|page count mismatch"):
            with unittest.mock.patch(
                "hermes_cli.kanban_db.os.path.getsize", side_effect=fake_getsize
            ):
                with write_txn(conn) as c:
                    c.execute(
                        "INSERT INTO tasks (id, title, assignee, status, priority, created_at) "
                        "VALUES ('t_test02', 'test task 2', 'tester', 'todo', 0, 1234567890)"
                    )
    finally:
        # Close even when the expected exception is not raised.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_write_txn_post_commit_check_fires_every_call(tmp_path):
    """The invariant check runs on every write_txn call.

    Wraps ``_check_file_length_invariant`` in a counting shim (which still
    delegates to the real check) and asserts it was invoked once per
    committed transaction.
    """
    from hermes_cli.kanban_db import connect, write_txn
    import hermes_cli.kanban_db as kanban_db_module

    conn = connect(db_path=tmp_path / "test.db")
    try:
        call_count = 0
        real_check = kanban_db_module._check_file_length_invariant

        def counting_check(c):
            nonlocal call_count
            call_count += 1
            real_check(c)

        with unittest.mock.patch.object(
            kanban_db_module, "_check_file_length_invariant", counting_check
        ):
            for i in range(3):
                with write_txn(conn) as c:
                    c.execute(
                        "INSERT INTO tasks (id, title, assignee, status, priority, created_at) "
                        f"VALUES ('t_fire{i:02d}', 'task {i}', 'tester', 'todo', 0, 1234567890)"
                    )
        assert call_count == 3
    finally:
        # Close even when an assertion fails so the temp DB is not left open.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_connect_sets_wal_autocheckpoint_100(tmp_path):
    """connect() sets wal_autocheckpoint to 100."""
    from hermes_cli.kanban_db import connect

    conn = connect(db_path=tmp_path / "test.db")
    try:
        val = conn.execute("PRAGMA wal_autocheckpoint").fetchone()[0]
        assert val == 100
    finally:
        # Close even when the assertion fails so the temp DB is not left open.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_write_txn_check_reads_correct_header_fields(tmp_path):
    """Synthetic DB file with mismatched header page_count triggers the check.

    Creates a database through ``connect()``, closes it, chops the last page
    off the file while leaving the header untouched, then runs
    ``_check_file_length_invariant`` on a raw connection and expects
    ``sqlite3.DatabaseError``.
    """
    from hermes_cli.kanban_db import connect, _check_file_length_invariant

    db = tmp_path / "synthetic.db"
    conn = connect(db_path=db)
    try:
        page_size = conn.execute("PRAGMA page_size").fetchone()[0]
    finally:
        conn.close()

    # Now corrupt the file: claim N pages but truncate to N-1 pages
    data = db.read_bytes()
    # Read current page_count from header bytes 28-31 (big-endian)
    real_page_count = int.from_bytes(data[28:32], "big")
    if real_page_count < 2:
        # Need at least 2 pages to fake a truncation
        pytest.skip("DB too small for synthetic truncation test")
    # Truncate to N-1 pages
    db.write_bytes(data[: (real_page_count - 1) * page_size])

    # Now open and check — should raise
    # We can't use connect() because _validate_sqlite_header may block; use a raw connection
    raw_conn = sqlite3.connect(str(db), isolation_level=None)
    try:
        with pytest.raises(sqlite3.DatabaseError, match="torn-extend|page count mismatch"):
            _check_file_length_invariant(raw_conn)
    finally:
        # Close even when the expected exception is not raised.
        raw_conn.close()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue