mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-22 05:22:09 +00:00
fix(sqlite): fall back to journal_mode=DELETE on NFS/SMB/FUSE (#22043)
SQLite's WAL mode requires shared-memory (mmap) coordination and fcntl byte-range locks that don't reliably work on network filesystems. Upstream documents this explicitly: https://www.sqlite.org/wal.html#sometimes_queries_return_sqlite_busy_in_wal_mode On NFS / SMB / some FUSE mounts / WSL1, 'PRAGMA journal_mode=WAL' raises 'sqlite3.OperationalError: locking protocol' (SQLITE_PROTOCOL). Before this change, every feature backed by state.db or kanban.db broke silently: - /resume, /title, /history, /branch returned 'Session database not available.' with no cause - gateway logged the init failure at DEBUG (invisible in errors.log) - kanban dispatcher crashed every 60s, driving the known migration race (duplicate column name: consecutive_failures, #21708 / #21374) Changes: - hermes_state.apply_wal_with_fallback(): shared helper that tries WAL and falls back to DELETE on SQLITE_PROTOCOL-style errors with one WARNING explaining why - hermes_state.get_last_init_error() + format_session_db_unavailable(): capture the init failure cause and surface it in user-facing strings (with an NFS/SMB pointer for 'locking protocol') - hermes_cli/kanban_db.connect(): use the shared helper - gateway/run.py: bump SessionDB init failure log DEBUG -> WARNING (matches cli.py's existing correct behavior) - cli.py (4 sites) + gateway/run.py (5 sites): replace bare 'Session database not available.' with format_session_db_unavailable() Tests: 12 new tests in tests/test_hermes_state_wal_fallback.py + 1 new test in tests/hermes_cli/test_kanban_db.py. Existing suites (state, kanban, gateway, cli) remain green for all tests unrelated to pre-existing failures on main. Evidence: real-world user on NFSv3 mount (172.26.224.200:d2dfac12/home, local_lock=none) reporting 'Session database not available.' on /resume; 'locking protocol' appears in 4 distinct log entries across backup, kanban, TUI, and CLI paths in the same session. closes #22032
This commit is contained in:
parent
ae005ec588
commit
2a7047c2ed
10 changed files with 584 additions and 32 deletions
196
hermes_state.py
196
hermes_state.py
|
|
@ -35,6 +35,153 @@ DEFAULT_DB_PATH = get_hermes_home() / "state.db"
|
|||
|
||||
SCHEMA_VERSION = 11
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# WAL-compatibility fallback
|
||||
# ---------------------------------------------------------------------------
|
||||
# SQLite's WAL mode requires shared-memory (mmap) coordination and fcntl
|
||||
# byte-range locks that don't reliably work on network filesystems (NFS,
|
||||
# SMB/CIFS, some FUSE mounts, WSL1). Upstream documents this explicitly:
|
||||
# https://www.sqlite.org/wal.html#sometimes_queries_return_sqlite_busy_in_wal_mode
|
||||
#
|
||||
# On those filesystems ``PRAGMA journal_mode=WAL`` raises
|
||||
# ``sqlite3.OperationalError: locking protocol`` (SQLITE_PROTOCOL). If we
|
||||
# propagate that, every feature backed by state.db / kanban.db breaks
|
||||
# silently — /resume, /title, /history, /branch, kanban dispatcher, etc.
|
||||
#
|
||||
# Instead, fall back to ``journal_mode=DELETE`` (the pre-WAL default) which
|
||||
# works on NFS. Concurrency drops — concurrent readers are blocked during
|
||||
# a write — but the feature works.
|
||||
_WAL_INCOMPAT_MARKERS = (
|
||||
"locking protocol", # SQLITE_PROTOCOL on NFS/SMB
|
||||
"not authorized", # Some FUSE mounts block WAL pragma outright
|
||||
"disk i/o error", # Flaky network FS during WAL setup
|
||||
)
|
||||
|
||||
# Last SessionDB() init error, per-process. Surfaced in /resume and
|
||||
# related slash-command error strings so users know WHY the DB is
|
||||
# unavailable instead of getting a bare "Session database not available."
|
||||
# Only SessionDB.__init__ writes to this; kanban_db.connect() failures
|
||||
# do not update it (by design — kanban failures are reported via their
|
||||
# own caller's error handling, not via /resume-style slash commands).
|
||||
_last_init_error: Optional[str] = None
|
||||
_last_init_error_lock = threading.Lock()
|
||||
|
||||
# Paths for which we've already logged a WAL-fallback WARNING. Without
|
||||
# this, kanban_db.connect() (called on every kanban operation — see
|
||||
# hermes_cli/kanban_db.py for ~30 call sites) would re-log the same
|
||||
# filesystem-incompat warning on every connection, filling errors.log.
|
||||
_wal_fallback_warned_paths: set[str] = set()
|
||||
_wal_fallback_warned_lock = threading.Lock()
|
||||
|
||||
|
||||
def _set_last_init_error(msg: Optional[str]) -> None:
|
||||
"""Record (or clear) the most recent state.db init failure.
|
||||
|
||||
Thread-safe via _last_init_error_lock. Callers pass a message to
|
||||
record a failure or None to clear. SessionDB.__init__ only calls
|
||||
this to SET on failure — it deliberately does NOT clear on success,
|
||||
because in a multi-threaded caller (e.g. gateway / web_server per-
|
||||
request SessionDB() instantiation), a concurrent successful open
|
||||
racing past a different thread's failure would erase the cause
|
||||
string that thread's /resume handler is about to format. Explicit
|
||||
clears (e.g. test fixtures) are still supported by passing None.
|
||||
"""
|
||||
global _last_init_error
|
||||
with _last_init_error_lock:
|
||||
_last_init_error = msg
|
||||
|
||||
|
||||
def get_last_init_error() -> Optional[str]:
|
||||
"""Return the most recent state.db init failure, if any.
|
||||
|
||||
Slash-command handlers (``/resume``, ``/title``, ``/history``, ``/branch``)
|
||||
call this to surface the underlying cause in their error messages when
|
||||
``_session_db is None``. Returns ``None`` if SessionDB initialized
|
||||
successfully (or hasn't been attempted).
|
||||
"""
|
||||
return _last_init_error
|
||||
|
||||
|
||||
def format_session_db_unavailable(prefix: str = "Session database not available") -> str:
|
||||
"""Format a user-facing 'session DB unavailable' message with cause.
|
||||
|
||||
When ``SessionDB()`` init fails, callers set ``_session_db = None`` and
|
||||
several slash commands (/resume, /title, /history, /branch) previously
|
||||
responded with a bare ``"Session database not available."`` — no
|
||||
indication of WHY. This helper includes the captured cause (typically
|
||||
``"locking protocol"`` from NFS/SMB) and points users at the known
|
||||
culprit so they can fix it themselves.
|
||||
|
||||
Example output:
|
||||
Session database not available: locking protocol (state.db may be
|
||||
on NFS/SMB — see https://www.sqlite.org/wal.html).
|
||||
"""
|
||||
cause = get_last_init_error()
|
||||
if not cause:
|
||||
return f"{prefix}."
|
||||
hint = ""
|
||||
if any(marker in cause.lower() for marker in _WAL_INCOMPAT_MARKERS):
|
||||
hint = " (state.db may be on NFS/SMB/FUSE — see https://www.sqlite.org/wal.html)"
|
||||
return f"{prefix}: {cause}{hint}."
|
||||
|
||||
|
||||
def apply_wal_with_fallback(
|
||||
conn: sqlite3.Connection,
|
||||
*,
|
||||
db_label: str = "state.db",
|
||||
) -> str:
|
||||
"""Set ``journal_mode=WAL`` on ``conn``, falling back to DELETE on failure.
|
||||
|
||||
Returns the journal mode actually set (``"wal"`` or ``"delete"``).
|
||||
|
||||
On WAL-incompatible filesystems (NFS, SMB, some FUSE), SQLite raises
|
||||
``OperationalError("locking protocol")`` when setting WAL. We fall
|
||||
back to DELETE mode — the pre-WAL default, which works on NFS — and
|
||||
log one WARNING explaining why.
|
||||
|
||||
The WARNING is deduplicated per ``db_label``: repeated connections
|
||||
to the same underlying DB (e.g. kanban_db.connect() which is called
|
||||
on every kanban operation) log once per process, not once per call.
|
||||
Different db_labels log independently, so state.db and kanban.db
|
||||
each get one warning on the same NFS mount.
|
||||
|
||||
Shared by :class:`SessionDB` and ``hermes_cli.kanban_db.connect`` so
|
||||
both databases get identical fallback behavior.
|
||||
"""
|
||||
try:
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
return "wal"
|
||||
except sqlite3.OperationalError as exc:
|
||||
msg = str(exc).lower()
|
||||
if not any(marker in msg for marker in _WAL_INCOMPAT_MARKERS):
|
||||
# Unrelated OperationalError — don't silently swallow.
|
||||
raise
|
||||
_log_wal_fallback_once(db_label, exc)
|
||||
conn.execute("PRAGMA journal_mode=DELETE")
|
||||
return "delete"
|
||||
|
||||
|
||||
def _log_wal_fallback_once(db_label: str, exc: Exception) -> None:
|
||||
"""Log a single WARNING per (process, db_label) about WAL fallback.
|
||||
|
||||
Without this dedup, NFS users running kanban (which opens a fresh
|
||||
connection on every operation — see hermes_cli/kanban_db.py) would
|
||||
fill errors.log with hundreds of identical warnings per hour.
|
||||
"""
|
||||
with _wal_fallback_warned_lock:
|
||||
if db_label in _wal_fallback_warned_paths:
|
||||
return
|
||||
_wal_fallback_warned_paths.add(db_label)
|
||||
logger.warning(
|
||||
"%s: WAL journal_mode unsupported on this filesystem (%s) — "
|
||||
"falling back to journal_mode=DELETE (slower rollback-journal "
|
||||
"mode; reduces concurrency but works on NFS/SMB/FUSE). See "
|
||||
"https://www.sqlite.org/wal.html for details. This warning "
|
||||
"fires once per process per database.",
|
||||
db_label,
|
||||
exc,
|
||||
)
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS schema_version (
|
||||
version INTEGER NOT NULL
|
||||
|
|
@ -185,23 +332,40 @@ class SessionDB:
|
|||
|
||||
self._lock = threading.Lock()
|
||||
self._write_count = 0
|
||||
self._conn = sqlite3.connect(
|
||||
str(self.db_path),
|
||||
check_same_thread=False,
|
||||
# Short timeout — application-level retry with random jitter
|
||||
# handles contention instead of sitting in SQLite's internal
|
||||
# busy handler for up to 30s.
|
||||
timeout=1.0,
|
||||
# Autocommit mode: Python's default isolation_level="" auto-starts
|
||||
# transactions on DML, which conflicts with our explicit
|
||||
# BEGIN IMMEDIATE. None = we manage transactions ourselves.
|
||||
isolation_level=None,
|
||||
)
|
||||
self._conn.row_factory = sqlite3.Row
|
||||
self._conn.execute("PRAGMA journal_mode=WAL")
|
||||
self._conn.execute("PRAGMA foreign_keys=ON")
|
||||
try:
|
||||
self._conn = sqlite3.connect(
|
||||
str(self.db_path),
|
||||
check_same_thread=False,
|
||||
# Short timeout — application-level retry with random jitter
|
||||
# handles contention instead of sitting in SQLite's internal
|
||||
# busy handler for up to 30s.
|
||||
timeout=1.0,
|
||||
# Autocommit mode: Python's default isolation_level=""
|
||||
# auto-starts transactions on DML, which conflicts with our
|
||||
# explicit BEGIN IMMEDIATE. None = we manage transactions
|
||||
# ourselves.
|
||||
isolation_level=None,
|
||||
)
|
||||
self._conn.row_factory = sqlite3.Row
|
||||
apply_wal_with_fallback(self._conn, db_label="state.db")
|
||||
self._conn.execute("PRAGMA foreign_keys=ON")
|
||||
|
||||
self._init_schema()
|
||||
self._init_schema()
|
||||
except Exception as exc:
|
||||
# Capture the cause so /resume and friends can surface WHY the
|
||||
# session DB is unavailable instead of a bare "Session database
|
||||
# not available." Callers that catch this exception keep their
|
||||
# existing ``self._session_db = None`` degradation path.
|
||||
#
|
||||
# Note: we deliberately do NOT clear _last_init_error on the
|
||||
# success path (no else branch). In multi-threaded callers
|
||||
# (gateway, web_server per-request SessionDB()), a concurrent
|
||||
# successful open racing past this failure would erase the
|
||||
# cause that another thread's /resume is about to format.
|
||||
# Tests that need to reset the state can call
|
||||
# ``hermes_state._set_last_init_error(None)`` explicitly.
|
||||
_set_last_init_error(f"{type(exc).__name__}: {exc}")
|
||||
raise
|
||||
|
||||
# ── Core write helper ──
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue