hermes-agent/tests/test_hermes_state_wal_fallback.py

"""Tests for the WAL→DELETE journal-mode fallback on NFS / SMB / FUSE.

When ``PRAGMA journal_mode=WAL`` raises ``OperationalError("locking protocol")``
(SQLITE_PROTOCOL — typical on NFS/SMB), Hermes must fall back to
``journal_mode=DELETE`` so ``state.db`` / ``kanban.db`` remain usable.

Without this fallback, users on NFS-mounted ``HERMES_HOME`` silently lose
``/resume``, ``/title``, ``/history``, ``/branch``, session search, and the
kanban dispatcher — because ``SessionDB()`` init propagates the error and
every caller swallows it, leaving ``_session_db = None``.

See: https://www.sqlite.org/wal.html — "WAL does not work over a network
filesystem".
"""

import sqlite3
from unittest.mock import patch

import pytest

import hermes_state
from hermes_state import (
    SessionDB,
    apply_wal_with_fallback,
    format_session_db_unavailable,
    get_last_init_error,
)


# ``sqlite3.Connection.execute`` is a C-level slot and can't be monkeypatched
# directly (``'sqlite3.Connection' object attribute 'execute' is read-only``).
# A factory-built subclass lets us intercept journal_mode=WAL per-test with
# its own mutable counter, avoiding the xdist-parallel class-state race.
def _make_blocking_factory(reason: str, attempt_counter: list):
    """Return a sqlite3.Connection subclass that raises on PRAGMA journal_mode=WAL."""

    class _WalBlockingConnection(sqlite3.Connection):
        def execute(self, sql, *args, **kwargs):  # type: ignore[override]
            if "journal_mode=wal" in sql.lower().replace(" ", ""):
                attempt_counter[0] += 1
                raise sqlite3.OperationalError(reason)
            return super().execute(sql, *args, **kwargs)

    return _WalBlockingConnection


def _open_blocking(path, reason="locking protocol", **kwargs):
    """Open a connection whose WAL pragma raises ``reason``.

    Returns ``(conn, attempt_counter_list)`` so callers can assert how many
    times WAL was attempted.
    """
    attempts = [0]
    factory = _make_blocking_factory(reason, attempts)
    return sqlite3.connect(str(path), factory=factory, **kwargs), attempts


@pytest.fixture(autouse=True)
def _reset_last_init_error():
    """Reset the module-global last-error before and after each test."""
    hermes_state._set_last_init_error(None)
    yield
    hermes_state._set_last_init_error(None)


@pytest.fixture(autouse=True)
def _reset_wal_fallback_warned_paths():
    """Reset the WAL-fallback warned-paths set so dedup doesn't leak between tests."""
    hermes_state._wal_fallback_warned_paths.clear()
    yield
    hermes_state._wal_fallback_warned_paths.clear()


class TestApplyWalWithFallback:
    def test_succeeds_on_local_fs(self, tmp_path):
        """Happy path: WAL works on a normal filesystem."""
        conn = sqlite3.connect(str(tmp_path / "ok.db"), isolation_level=None)
        mode = apply_wal_with_fallback(conn)
        assert mode == "wal"
        cur = conn.execute("PRAGMA journal_mode")
        assert cur.fetchone()[0].lower() == "wal"
        conn.close()

    def test_falls_back_to_delete_on_locking_protocol(self, tmp_path, caplog):
        """NFS-style ``locking protocol`` error → DELETE mode + one WARNING."""
        conn, _ = _open_blocking(tmp_path / "nfs.db", isolation_level=None)
        with caplog.at_level("WARNING", logger="hermes_state"):
            mode = apply_wal_with_fallback(conn, db_label="test.db")

        assert mode == "delete"
        warnings = [r for r in caplog.records if r.levelname == "WARNING"]
        assert len(warnings) == 1
        msg = warnings[0].getMessage()
        assert "test.db" in msg
        assert "journal_mode=DELETE" in msg
        assert "locking protocol" in msg

        # Post-fallback the DB is still usable for real writes
        conn.execute("CREATE TABLE t (x INTEGER)")
        conn.execute("INSERT INTO t VALUES (1)")
        assert list(conn.execute("SELECT x FROM t"))[0][0] == 1
        conn.close()

    def test_falls_back_on_not_authorized(self, tmp_path):
        """Some FUSE mounts block WAL pragma outright ('not authorized')."""
        conn, _ = _open_blocking(
            tmp_path / "fuse.db", reason="not authorized", isolation_level=None
        )
        mode = apply_wal_with_fallback(conn)
        assert mode == "delete"
        conn.close()

    def test_reraises_on_disk_io_error(self, tmp_path):
        """Transient EIO from ``PRAGMA journal_mode=WAL`` must NOT silently
        downgrade to DELETE.

        Regression for "Bug D": treating transient EIO as a permanent
        WAL-incompat marker produced the mixed-journal-mode-across-processes
        corruption pattern (process A downgrades to DELETE, sibling
        processes successfully set WAL, SQLite corrupts the file because
        the two locking protocols are documented as incompatible). EIO is
        usually transient (page-cache pressure, lock contention, brief
        storage hiccups); the right behavior is to re-raise so the caller
        can retry, not to walk the DB into a permanently downgraded state.
        """
        conn, _ = _open_blocking(
            tmp_path / "flaky.db", reason="disk I/O error", isolation_level=None
        )
        with pytest.raises(sqlite3.OperationalError, match="disk I/O error"):
            apply_wal_with_fallback(conn)
        conn.close()

    def test_does_not_downgrade_when_disk_says_wal(self, tmp_path):
        """Belt-and-suspenders: even if a marker matches, refuse to
        downgrade when the on-disk DB header is already WAL.

        Prevents a future addition to ``_WAL_INCOMPAT_MARKERS`` from
        accidentally reintroducing the mixed-journal-mode corruption
        pattern. We construct a DB already in WAL on disk, then open a
        new connection whose ``PRAGMA journal_mode=WAL`` raises one of
        the legit markers — the function must still re-raise (refusing
        the downgrade) because the on-disk file is WAL.
        """
        # Prime the file in WAL mode using a normal connection
        primer = sqlite3.connect(
            str(tmp_path / "already-wal.db"), isolation_level=None
        )
        try:
            primer.execute("PRAGMA journal_mode=WAL")
            primer.execute("CREATE TABLE t (x INTEGER)")
            primer.execute("INSERT INTO t VALUES (1)")
            assert (
                primer.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal"
            )
        finally:
            primer.close()

        # New connection whose WAL pragma raises "locking protocol" — a
        # marker that WOULD normally trigger downgrade. With the on-disk
        # guard, we must instead re-raise.
        conn, _ = _open_blocking(
            tmp_path / "already-wal.db",
            reason="locking protocol",
            isolation_level=None,
        )
        with pytest.raises(sqlite3.OperationalError, match="locking protocol"):
            apply_wal_with_fallback(conn)
        conn.close()

        # And the file is STILL WAL on disk — nothing got rewritten
        check = sqlite3.connect(str(tmp_path / "already-wal.db"))
        try:
            assert (
                check.execute("PRAGMA journal_mode").fetchone()[0].lower() == "wal"
            )
        finally:
            check.close()

    def test_reraises_unrelated_operational_error(self, tmp_path):
        """Non-WAL-compat errors must NOT be silently swallowed by the fallback."""
        conn, _ = _open_blocking(
            tmp_path / "other.db",
            reason="no such table: nope",
            isolation_level=None,
        )
        with pytest.raises(sqlite3.OperationalError, match="no such table"):
            apply_wal_with_fallback(conn)
        conn.close()

    def test_warning_deduplicated_per_db_label(self, tmp_path, caplog):
        """Repeated calls with the same db_label log exactly ONE warning.

        Prevents log spam when NFS users run kanban (which opens a fresh
        connection on every operation — see hermes_cli/kanban_db.py).
        Regression guard: the fix for #22032 ran apply_wal_with_fallback()
        on every kb.connect() call; without dedup, errors.log fills with
        hundreds of identical warnings per hour.
        """
        with caplog.at_level("WARNING", logger="hermes_state"):
            # Three separate connections to "the same DB" via the same label
            for i in range(3):
                conn, _ = _open_blocking(
                    tmp_path / f"dup-{i}.db", isolation_level=None
                )
                mode = apply_wal_with_fallback(conn, db_label="shared.db")
                assert mode == "delete"
                conn.close()

        # Exactly one warning across all three calls
        warnings = [
            r for r in caplog.records
            if r.levelname == "WARNING" and "shared.db" in r.getMessage()
        ]
        assert len(warnings) == 1, (
            f"Expected 1 deduplicated warning, got {len(warnings)}: "
            f"{[r.getMessage() for r in warnings]}"
        )

    def test_warning_fires_independently_per_db_label(self, tmp_path, caplog):
        """Different db_labels each get their own one warning (not globally dedup'd)."""
        with caplog.at_level("WARNING", logger="hermes_state"):
            conn1, _ = _open_blocking(tmp_path / "a.db", isolation_level=None)
            apply_wal_with_fallback(conn1, db_label="state.db")
            conn1.close()

            conn2, _ = _open_blocking(tmp_path / "b.db", isolation_level=None)
            apply_wal_with_fallback(conn2, db_label="kanban.db")
            conn2.close()

        warnings = [r for r in caplog.records if r.levelname == "WARNING"]
        labels_warned = {
            lbl for r in warnings for lbl in ("state.db", "kanban.db")
            if lbl in r.getMessage()
        }
        assert labels_warned == {"state.db", "kanban.db"}, (
            f"Each db_label should warn once; got {labels_warned}"
        )


class TestGetLastInitError:
    def test_none_on_successful_init(self, tmp_path):
        """Happy-path SessionDB init does NOT clear a stale error from a prior thread.

        We deliberately don't clear on success so that in multi-threaded
        callers (gateway / web_server per-request SessionDB()), a concurrent
        successful open racing past a different thread's failure won't
        erase the cause string the failing thread's /resume is about to
        format.  The caller or test fixture is responsible for explicitly
        calling _set_last_init_error(None) to reset.
        """
        # Autouse fixture starts at None — success-path leaves it None
        db = SessionDB(db_path=tmp_path / "ok.db")
        try:
            assert get_last_init_error() is None
        finally:
            db.close()

    def test_success_does_not_clear_prior_error(self, tmp_path):
        """Thread-safety guard: a successful init must not erase a pre-existing error.

        Simulates the multi-threaded race: thread A fails, records cause;
        thread B succeeds concurrently.  thread A's /resume handler must
        still see A's cause — not B's None.
        """
        hermes_state._set_last_init_error("OperationalError: locking protocol")
        # Now a "successful" init happens on another path — must NOT clear
        db = SessionDB(db_path=tmp_path / "ok2.db")
        try:
            assert get_last_init_error() == "OperationalError: locking protocol"
        finally:
            db.close()

    def test_captures_cause_on_failed_init(self, tmp_path):
        """When SessionDB() raises, the cause is preserved for slash commands.

        Simulates a filesystem where BOTH WAL and DELETE journal modes fail —
        e.g. a read-only mount where no ``PRAGMA journal_mode=X`` works.  The
        fallback tries DELETE and also gets rejected; the exception bubbles
        out of ``SessionDB.__init__`` and the cause is captured.
        """
        target = tmp_path / "broken.db"
        real_connect = sqlite3.connect

        class _BothPragmasFailConnection(sqlite3.Connection):
            def execute(self, sql, *args, **kwargs):  # type: ignore[override]
                if "journal_mode" in sql.lower():
                    raise sqlite3.OperationalError(
                        "locking protocol: read-only filesystem"
                    )
                return super().execute(sql, *args, **kwargs)

        def gated_connect(*args, **kwargs):
            return real_connect(str(target), factory=_BothPragmasFailConnection, **kwargs)

        with patch("hermes_state.sqlite3.connect", side_effect=gated_connect):
            with pytest.raises(sqlite3.OperationalError):
                SessionDB(db_path=target)

        cause = get_last_init_error()
        assert cause is not None
        assert "OperationalError" in cause
        assert "locking protocol" in cause


class TestFormatSessionDbUnavailable:
    def test_bare_message_when_no_cause(self):
        """No init error recorded → generic message."""
        hermes_state._set_last_init_error(None)
        assert format_session_db_unavailable() == "Session database not available."

    def test_includes_cause(self):
        """Cause is surfaced for slash-command error strings."""
        hermes_state._set_last_init_error("OperationalError: generic SQLite error")
        msg = format_session_db_unavailable()
        assert "generic SQLite error" in msg
        assert msg.startswith("Session database not available:")
        assert msg.endswith(".")

    def test_adds_nfs_hint_for_locking_protocol(self):
        """Locking-protocol cause gets an NFS/SMB pointer for the user."""
        hermes_state._set_last_init_error("OperationalError: locking protocol")
        msg = format_session_db_unavailable()
        assert "locking protocol" in msg
        assert "NFS/SMB" in msg
        assert "sqlite.org/wal.html" in msg

    def test_custom_prefix(self):
        """Callers can customize the prefix for context-specific messages."""
        hermes_state._set_last_init_error("OperationalError: locking protocol")
        msg = format_session_db_unavailable(prefix="Cannot /resume")
        assert msg.startswith("Cannot /resume:")


class TestSessionDbUsesWalFallback:
    def test_sessiondb_works_when_wal_unavailable(self, tmp_path):
        """E2E: SessionDB initializes and performs a write on a WAL-blocked FS."""
        target = tmp_path / "nfs_style.db"

        real_connect = sqlite3.connect
        attempts = [0]
        factory = _make_blocking_factory("locking protocol", attempts)

        def gated_connect(*args, **kwargs):
            return real_connect(str(target), factory=factory, **kwargs)

        with patch("hermes_state.sqlite3.connect", side_effect=gated_connect):
            db = SessionDB(db_path=target)

        try:
            # WAL was attempted and rejected — fallback kicked in
            assert attempts[0] >= 1, (
                "WAL pragma was never executed — check the patch target"
            )
            # SessionDB is usable end-to-end: create a session, read it back
            db.create_session(session_id="s1", source="cli", model="test")
            sess = db.get_session("s1")
            assert sess is not None
            assert sess["source"] == "cli"
            # No init error was recorded since init succeeded via the fallback
            assert get_last_init_error() is None
        finally:
            db.close()