mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(sqlite): fall back to journal_mode=DELETE on NFS/SMB/FUSE (#22043)
SQLite's WAL mode requires shared-memory (mmap) coordination and fcntl byte-range locks that don't reliably work on network filesystems. Upstream documents this explicitly:

https://www.sqlite.org/wal.html#sometimes_queries_return_sqlite_busy_in_wal_mode

On NFS / SMB / some FUSE mounts / WSL1, 'PRAGMA journal_mode=WAL' raises 'sqlite3.OperationalError: locking protocol' (SQLITE_PROTOCOL). Before this change, every feature backed by state.db or kanban.db broke silently:

- /resume, /title, /history, /branch returned 'Session database not available.' with no cause
- gateway logged the init failure at DEBUG (invisible in errors.log)
- kanban dispatcher crashed every 60s, driving the known migration race (duplicate column name: consecutive_failures, #21708 / #21374)

Changes:

- hermes_state.apply_wal_with_fallback(): shared helper that tries WAL and falls back to DELETE on SQLITE_PROTOCOL-style errors with one WARNING explaining why
- hermes_state.get_last_init_error() + format_session_db_unavailable(): capture the init failure cause and surface it in user-facing strings (with an NFS/SMB pointer for 'locking protocol')
- hermes_cli/kanban_db.connect(): use the shared helper
- gateway/run.py: bump SessionDB init failure log DEBUG -> WARNING (matches cli.py's existing correct behavior)
- cli.py (4 sites) + gateway/run.py (5 sites): replace bare 'Session database not available.' with format_session_db_unavailable()

Tests: 12 new tests in tests/test_hermes_state_wal_fallback.py + 1 new test in tests/hermes_cli/test_kanban_db.py. Existing suites (state, kanban, gateway, cli) remain green for all tests unrelated to pre-existing failures on main.

Evidence: real-world user on NFSv3 mount (172.26.224.200:d2dfac12/home, local_lock=none) reporting 'Session database not available.' on /resume; 'locking protocol' appears in 4 distinct log entries across backup, kanban, TUI, and CLI paths in the same session.

closes #22032
This commit is contained in:
parent
ae005ec588
commit
2a7047c2ed
10 changed files with 584 additions and 32 deletions
|
|
@@ -1218,7 +1218,13 @@ class GatewayRunner:
|
|||
from hermes_state import SessionDB
|
||||
self._session_db = SessionDB()
|
||||
except Exception as e:
|
||||
logger.debug("SQLite session store not available: %s", e)
|
||||
# WARNING (not DEBUG) so the failure appears in errors.log — matches
|
||||
# cli.py's handling of the same init path. Users hitting NFS-mounted
|
||||
# HERMES_HOME silently lost /resume, /title, /history, /branch, and
|
||||
# session search without this. The underlying cause (usually
|
||||
# "locking protocol" from NFS) is now also captured by
|
||||
# hermes_state.get_last_init_error() for slash-command error strings.
|
||||
logger.warning("SQLite session store not available: %s", e)
|
||||
|
||||
# Opportunistic state.db maintenance: prune ended sessions older
|
||||
# than sessions.retention_days + optional VACUUM. Tracks last-run
|
||||
|
|
@@ -10374,7 +10380,8 @@ class GatewayRunner:
|
|||
def _disable_telegram_topic_mode_for_chat(self, source: SessionSource) -> str:
|
||||
"""Cleanly disable topic mode for a chat via /topic off."""
|
||||
if not self._session_db:
|
||||
return "Session database not available."
|
||||
from hermes_state import format_session_db_unavailable
|
||||
return format_session_db_unavailable()
|
||||
chat_id = str(source.chat_id or "")
|
||||
if not chat_id:
|
||||
return "Could not determine chat ID."
|
||||
|
|
@@ -10412,7 +10419,8 @@ class GatewayRunner:
|
|||
if source.platform != Platform.TELEGRAM or source.chat_type != "dm":
|
||||
return "The /topic command is only available in Telegram private chats."
|
||||
if not self._session_db:
|
||||
return "Session database not available."
|
||||
from hermes_state import format_session_db_unavailable
|
||||
return format_session_db_unavailable()
|
||||
|
||||
# Authorization: /topic activates multi-session mode and mutates
|
||||
# SQLite side tables. Unauthorized senders (not in allowlist) must
|
||||
|
|
@@ -10626,7 +10634,8 @@ class GatewayRunner:
|
|||
session_id = session_entry.session_id
|
||||
|
||||
if not self._session_db:
|
||||
return "Session database not available."
|
||||
from hermes_state import format_session_db_unavailable
|
||||
return format_session_db_unavailable()
|
||||
|
||||
# Ensure session exists in SQLite DB (it may only exist in session_store
|
||||
# if this is the first command in a new session)
|
||||
|
|
@@ -10670,7 +10679,8 @@ class GatewayRunner:
|
|||
async def _handle_resume_command(self, event: MessageEvent) -> str:
|
||||
"""Handle /resume command — switch to a previously-named session."""
|
||||
if not self._session_db:
|
||||
return "Session database not available."
|
||||
from hermes_state import format_session_db_unavailable
|
||||
return format_session_db_unavailable()
|
||||
|
||||
source = event.source
|
||||
session_key = self._session_key_for_source(source)
|
||||
|
|
@@ -10757,7 +10767,8 @@ class GatewayRunner:
|
|||
import uuid as _uuid
|
||||
|
||||
if not self._session_db:
|
||||
return "Session database not available."
|
||||
from hermes_state import format_session_db_unavailable
|
||||
return format_session_db_unavailable()
|
||||
|
||||
source = event.source
|
||||
session_key = self._session_key_for_source(source)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue