mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
fix(state): F_FULLFSYNC barrier at WAL checkpoints on macOS (#30636)
On Darwin, synchronous=FULL (the WAL default) only issues a plain fsync(), which Apple documents does NOT guarantee writes reach stable storage or stay ordered. SQLite's WAL corruption-safety guarantee assumes the OS honors the fsync barrier; macOS does not unless the app uses F_FULLFSYNC. During a launchd *system* shutdown the page cache is dropped (effectively power-loss for in-flight pages), so a WAL checkpoint whose fsync 'reported' durable may never hit the platter — corrupting state.db with a malformed image. That is the trigger in #30636 ('SIGTERM during launchd shutdown under high load'). Apply PRAGMA checkpoint_fullfsync=1 (macOS-guarded) in apply_wal_with_fallback. It forces the F_FULLFSYNC barrier only at checkpoint boundaries (where WAL frames land in the main DB), so cost amortizes to ~+0.1ms/commit vs ~+4ms for the broader fullfsync=1. No-op off Darwin (F_FULLFSYNC is macOS-only). Root-cause analysis by @catapreta on #30636. Supersedes #30654, whose synchronous=FULL is a no-op (already FULL in WAL mode) and whose TRUNCATE-on-close is already on main. Co-authored-by: catapreta <catapreta@users.noreply.github.com>
This commit is contained in:
parent
9229d0db17
commit
52d774f0f9
2 changed files with 128 additions and 0 deletions
|
|
@ -19,6 +19,7 @@ import logging
|
|||
import random
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
|
@ -241,6 +242,41 @@ def _on_disk_journal_mode(conn: sqlite3.Connection) -> Optional[str]:
|
|||
return str(mode).strip().lower() if mode is not None else None
|
||||
|
||||
|
||||
def _apply_macos_checkpoint_barrier(conn: sqlite3.Connection) -> None:
|
||||
"""Enable ``PRAGMA checkpoint_fullfsync`` on macOS (no-op elsewhere).
|
||||
|
||||
On Darwin, ``synchronous=FULL`` (the WAL default) issues a plain
|
||||
``fsync()``, which Apple documents does *not* guarantee that data
|
||||
has reached stable storage or that writes are not reordered — see
|
||||
the ``fsync(2)`` man page. SQLite's WAL corruption-safety guarantee
|
||||
assumes the OS honors the fsync write barrier; macOS does not unless
|
||||
the app uses ``F_FULLFSYNC``.
|
||||
|
||||
During a launchd *system* shutdown/reboot the OS page cache is
|
||||
dropped (effectively a power-loss event for in-flight pages), so a
|
||||
WAL checkpoint whose ``fsync()`` "reported" durable may never have
|
||||
hit the platter — corrupting ``state.db`` with a malformed image.
|
||||
This is the trigger in issue #30636 ("SIGTERM during launchd
|
||||
shutdown under high load"), distinct from a plain in-session kill
|
||||
(which the page cache survives and SQLite recovers from).
|
||||
|
||||
``checkpoint_fullfsync=1`` forces an ``F_FULLFSYNC`` barrier only at
|
||||
checkpoint boundaries — where WAL frames land in the main DB — so the
|
||||
cost amortizes to roughly +0.1 ms/commit (vs ~+4 ms for the broader
|
||||
``fullfsync=1`` that flushes on every commit's WAL sync). Guarded by
|
||||
``sys.platform == "darwin"`` because ``F_FULLFSYNC`` is macOS-only;
|
||||
on other platforms the PRAGMA is a no-op, so we skip it entirely.
|
||||
|
||||
Best-effort: never raises.
|
||||
"""
|
||||
if sys.platform != "darwin":
|
||||
return
|
||||
try:
|
||||
conn.execute("PRAGMA checkpoint_fullfsync=1")
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
|
||||
|
||||
def apply_wal_with_fallback(
|
||||
conn: sqlite3.Connection,
|
||||
*,
|
||||
|
|
@ -271,12 +307,14 @@ def apply_wal_with_fallback(
|
|||
try:
|
||||
current_mode = conn.execute("PRAGMA journal_mode").fetchone()
|
||||
if current_mode and current_mode[0] == "wal":
|
||||
_apply_macos_checkpoint_barrier(conn)
|
||||
return "wal"
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
|
||||
try:
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
_apply_macos_checkpoint_barrier(conn)
|
||||
return "wal"
|
||||
except sqlite3.OperationalError as exc:
|
||||
msg = str(exc).lower()
|
||||
|
|
|
|||
|
|
@ -4191,6 +4191,96 @@ class TestApplyWalProbe:
|
|||
"set-pragma must fire on a fresh (non-WAL) connection"
|
||||
)
|
||||
|
||||
def test_macos_checkpoint_fullsync_barrier_applied(self, tmp_path, monkeypatch):
|
||||
"""On Darwin, apply_wal_with_fallback sets checkpoint_fullfsync=1 (issue #30636)."""
|
||||
import sqlite3
|
||||
import hermes_state
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _TracingConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self.executed = []
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
self.executed.append(sql)
|
||||
return super().execute(sql, params)
|
||||
|
||||
monkeypatch.setattr(hermes_state.sys, "platform", "darwin")
|
||||
|
||||
db_path = tmp_path / "macos_fresh.db"
|
||||
conn = _TracingConn(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert result == "wal"
|
||||
assert any("checkpoint_fullfsync=1" in sql for sql in conn.executed), (
|
||||
"checkpoint_fullfsync barrier must be applied on macOS"
|
||||
)
|
||||
|
||||
def test_macos_barrier_applied_when_already_wal(self, tmp_path, monkeypatch):
|
||||
"""The Darwin barrier fires on the already-WAL early-return path too."""
|
||||
import sqlite3
|
||||
import hermes_state
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _TracingConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self.executed = []
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
self.executed.append(sql)
|
||||
return super().execute(sql, params)
|
||||
|
||||
db_path = tmp_path / "macos_wal.db"
|
||||
with sqlite3.connect(str(db_path)) as seed:
|
||||
seed.execute("PRAGMA journal_mode=WAL")
|
||||
|
||||
monkeypatch.setattr(hermes_state.sys, "platform", "darwin")
|
||||
|
||||
conn = _TracingConn(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert result == "wal"
|
||||
assert any("checkpoint_fullfsync=1" in sql for sql in conn.executed), (
|
||||
"checkpoint_fullfsync barrier must fire on the already-WAL path"
|
||||
)
|
||||
|
||||
def test_checkpoint_fullsync_barrier_skipped_off_darwin(self, tmp_path, monkeypatch):
|
||||
"""Non-macOS platforms must NOT issue the macOS-only PRAGMA."""
|
||||
import sqlite3
|
||||
import hermes_state
|
||||
from hermes_state import apply_wal_with_fallback
|
||||
|
||||
class _TracingConn(sqlite3.Connection):
|
||||
def __init__(self, *a, **kw):
|
||||
super().__init__(*a, **kw)
|
||||
self.executed = []
|
||||
|
||||
def execute(self, sql, params=()):
|
||||
self.executed.append(sql)
|
||||
return super().execute(sql, params)
|
||||
|
||||
monkeypatch.setattr(hermes_state.sys, "platform", "linux")
|
||||
|
||||
db_path = tmp_path / "linux_fresh.db"
|
||||
conn = _TracingConn(str(db_path))
|
||||
try:
|
||||
result = apply_wal_with_fallback(conn)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert result == "wal"
|
||||
assert not any("checkpoint_fullfsync" in sql for sql in conn.executed), (
|
||||
"checkpoint_fullfsync must not be issued off macOS"
|
||||
)
|
||||
|
||||
def test_apply_wal_concurrent_connects_no_eio(self, tmp_path):
|
||||
"""20 threads calling connect() on the same DB must not see disk I/O error."""
|
||||
import sys
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue