feat(state): auto-prune old sessions + VACUUM state.db at startup (#13861)

* feat(state): auto-prune old sessions + VACUUM state.db at startup

state.db accumulates every session, message, and FTS5 index entry forever.
A heavy user (gateway + cron) reported 384MB with 982 sessions / 68K messages
causing slowdown; manual 'hermes sessions prune --older-than 7' + VACUUM
brought it to 43MB. The prune command and VACUUM are not wired to run
automatically anywhere — sessions grew unbounded until users noticed.

Changes:
- hermes_state.py: new state_meta key/value table, vacuum() method, and
  maybe_auto_prune_and_vacuum() — idempotent via last-run timestamp in
  state_meta so it only actually executes once per min_interval_hours
  across all Hermes processes for a given HERMES_HOME. Never raises.
- hermes_cli/config.py: new 'sessions:' block in DEFAULT_CONFIG
  (auto_prune=True, retention_days=90, vacuum_after_prune=True,
  min_interval_hours=24). Added to _KNOWN_ROOT_KEYS.
- cli.py: call maintenance once at HermesCLI init (shared helper
  _run_state_db_auto_maintenance reads config and delegates to DB).
- gateway/run.py: call maintenance once at GatewayRunner init.
- Docs: user-guide/sessions.md rewrites 'Automatic Cleanup' section.

Why VACUUM matters: SQLite does NOT shrink the file on DELETE — freed
pages get reused on next INSERT. Without VACUUM, a delete-heavy DB stays
bloated forever. VACUUM only runs when the prune actually removed rows,
so tight DBs don't pay the I/O cost.

Tests: 10 new tests in tests/test_hermes_state.py covering state_meta,
vacuum, idempotency, interval skipping, VACUUM-only-when-needed,
corrupt-marker recovery. All 246 existing state/config/gateway tests
still pass.

Verified E2E with real imports + isolated HERMES_HOME: DEFAULT_CONFIG
exposes the new block, load_config() returns it for fresh installs,
first call prunes+vacuums, second call within min_interval_hours skips,
and the state_meta marker persists across connection close/reopen.

* sessions.auto_prune defaults to false (opt-in)

Session history powers session_search recall across past conversations,
so silently pruning on startup could surprise users. Ship the machinery
disabled and let users opt in when they notice state.db is hurting
performance.

- DEFAULT_CONFIG.sessions.auto_prune: True → False
- Call-site fallbacks in cli.py and gateway/run.py match the new default
  (so unmigrated configs still see off)
- Docs: flip 'Enable in config.yaml' framing + tip explains the tradeoff
This commit is contained in:
Teknium 2026-04-22 05:21:49 -07:00 committed by GitHub
parent b43524ecab
commit b8663813b6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 337 additions and 4 deletions

View file

@ -1764,3 +1764,124 @@ class TestConcurrentWriteSafety:
assert "30" in src, (
"SQLite timeout should be at least 30s to handle CLI/gateway lock contention"
)
# =========================================================================
# Auto-maintenance: state_meta + vacuum + maybe_auto_prune_and_vacuum
# =========================================================================
class TestStateMeta:
    """Tests for the state_meta key/value store on the DB handle."""

    def test_get_meta_missing_returns_none(self, db):
        """Looking up a key that was never written yields None, not an error."""
        assert db.get_meta("nonexistent") is None

    def test_set_then_get_meta(self, db):
        """A value written with set_meta round-trips through get_meta."""
        db.set_meta("foo", "bar")
        assert db.get_meta("foo") == "bar"

    def test_set_meta_upsert(self, db):
        """Writing the same key twice keeps only the latest value (upsert)."""
        for value in ("v1", "v2"):
            db.set_meta("key", value)
        assert db.get_meta("key") == "v2"
class TestVacuum:
    """Tests for the standalone vacuum() wrapper."""

    def test_vacuum_runs_without_error(self, db):
        """VACUUM must succeed on a fresh DB (no rows to reclaim)."""
        db.create_session(session_id="s1", source="cli")
        db.append_message(session_id="s1", role="user", content="hi")
        # There is nothing significant to reclaim here — the point is
        # simply that the call completes without raising.
        db.vacuum()
class TestAutoMaintenance:
    """Tests for maybe_auto_prune_and_vacuum() and its last-run marker."""

    def _make_old_ended(self, db, sid: str, days_old: int = 100):
        """Create a session that is ended and was started `days_old` days ago."""
        db.create_session(session_id=sid, source="cli")
        db.end_session(sid, end_reason="done")
        # Backdate started_at directly; there is no public API for this.
        backdated = time.time() - days_old * 86400
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (backdated, sid),
        )
        db._conn.commit()

    def test_first_run_prunes_and_vacuums(self, db):
        """With no prior marker, old ended sessions are pruned and VACUUM runs."""
        for sid in ("old1", "old2"):
            self._make_old_ended(db, sid, days_old=100)
        db.create_session(session_id="new", source="cli")  # active, must survive
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 2
        assert outcome["vacuumed"] is True
        assert outcome.get("error") is None
        # The old ended sessions are gone; the active one is untouched.
        assert db.get_session("old1") is None
        assert db.get_session("old2") is None
        assert db.get_session("new") is not None

    def test_second_call_within_interval_skips(self, db):
        """A second call inside min_interval_hours is a no-op."""
        self._make_old_ended(db, "old", days_old=100)
        run1 = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert run1["skipped"] is False
        assert run1["pruned"] == 1
        # Even with a freshly prunable session present, the second call
        # within the interval must skip without touching it.
        self._make_old_ended(db, "old2", days_old=100)
        run2 = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert run2["skipped"] is True
        assert run2["pruned"] == 0
        assert db.get_session("old2") is not None  # untouched

    def test_second_call_after_interval_runs_again(self, db):
        """Once the interval has elapsed, maintenance runs again."""
        self._make_old_ended(db, "old", days_old=100)
        db.maybe_auto_prune_and_vacuum(retention_days=90, min_interval_hours=24)
        # Backdate the last-run marker to force another run.
        two_days_ago = time.time() - 48 * 3600
        db.set_meta("last_auto_prune", str(two_days_ago))
        self._make_old_ended(db, "old2", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 1
        assert db.get_session("old2") is None

    def test_no_prunable_sessions_no_vacuum(self, db):
        """When prune deletes 0 rows, VACUUM is skipped (wasted I/O)."""
        db.create_session(session_id="fresh", source="cli")  # too recent
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 0
        assert outcome["vacuumed"] is False
        # The last-run marker is still recorded so we don't retry immediately.
        assert db.get_meta("last_auto_prune") is not None

    def test_vacuum_disabled_via_flag(self, db):
        """vacuum=False prunes but never runs VACUUM."""
        self._make_old_ended(db, "old", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90, vacuum=False)
        assert outcome["pruned"] == 1
        assert outcome["vacuumed"] is False

    def test_corrupt_last_run_marker_treated_as_no_prior_run(self, db):
        """A non-numeric marker must not break maintenance."""
        db.set_meta("last_auto_prune", "not-a-timestamp")
        self._make_old_ended(db, "old", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 1

    def test_state_meta_survives_vacuum(self, db):
        """Marker written just before VACUUM must still be readable after."""
        self._make_old_ended(db, "old", days_old=100)
        db.maybe_auto_prune_and_vacuum(retention_days=90)
        marker = db.get_meta("last_auto_prune")
        assert marker is not None
        # It should parse as a float timestamp close to now.
        assert abs(float(marker) - time.time()) < 60