feat(state): auto-prune old sessions + VACUUM state.db at startup (#13861)

* feat(state): auto-prune old sessions + VACUUM state.db at startup

state.db accumulates every session, message, and FTS5 index entry forever.
A heavy user (gateway + cron) reported 384MB with 982 sessions / 68K messages
causing slowdown; manual 'hermes sessions prune --older-than 7' + VACUUM
brought it to 43MB. The prune command and VACUUM are not wired to run
automatically anywhere — sessions grew unbounded until users noticed.

Changes:
- hermes_state.py: new state_meta key/value table, vacuum() method, and
  maybe_auto_prune_and_vacuum() — idempotent via last-run timestamp in
  state_meta so it only actually executes once per min_interval_hours
  across all Hermes processes for a given HERMES_HOME. Never raises.
- hermes_cli/config.py: new 'sessions:' block in DEFAULT_CONFIG
  (auto_prune=True, retention_days=90, vacuum_after_prune=True,
  min_interval_hours=24). Added to _KNOWN_ROOT_KEYS.
- cli.py: call maintenance once at HermesCLI init (shared helper
  _run_state_db_auto_maintenance reads config and delegates to DB).
- gateway/run.py: call maintenance once at GatewayRunner init.
- Docs: user-guide/sessions.md rewrites 'Automatic Cleanup' section.

Why VACUUM matters: SQLite does NOT shrink the file on DELETE — freed
pages get reused on next INSERT. Without VACUUM, a delete-heavy DB stays
bloated forever. VACUUM only runs when the prune actually removed rows,
so tight DBs don't pay the I/O cost.

Tests: 10 new tests in tests/test_hermes_state.py covering state_meta,
vacuum, idempotency, interval skipping, VACUUM-only-when-needed,
corrupt-marker recovery. All 246 existing state/config/gateway tests
still pass.

Verified E2E with real imports + isolated HERMES_HOME: DEFAULT_CONFIG
exposes the new block, load_config() returns it for fresh installs,
first call prunes+vacuums, second call within min_interval_hours skips,
and the state_meta marker persists across connection close/reopen.

* sessions.auto_prune defaults to false (opt-in)

Session history powers session_search recall across past conversations,
so silently pruning on startup could surprise users. Ship the machinery
disabled and let users opt in when they notice state.db is hurting
performance.

- DEFAULT_CONFIG.sessions.auto_prune: True → False
- Call-site fallbacks in cli.py and gateway/run.py match the new default
  (so unmigrated configs still see off)
- Docs: flip 'Enable in config.yaml' framing + tip explains the tradeoff
This commit is contained in:
Teknium 2026-04-22 05:21:49 -07:00 committed by GitHub
parent b43524ecab
commit b8663813b6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 337 additions and 4 deletions

View file

@ -1764,3 +1764,124 @@ class TestConcurrentWriteSafety:
assert "30" in src, (
"SQLite timeout should be at least 30s to handle CLI/gateway lock contention"
)
# =========================================================================
# Auto-maintenance: state_meta + vacuum + maybe_auto_prune_and_vacuum
# =========================================================================
class TestStateMeta:
    """Tests for the state_meta key/value store on the DB handle."""

    def test_get_meta_missing_returns_none(self, db):
        """Looking up a key that was never written yields None, not an error."""
        assert db.get_meta("nonexistent") is None

    def test_set_then_get_meta(self, db):
        """A value written with set_meta round-trips through get_meta."""
        db.set_meta("foo", "bar")
        assert db.get_meta("foo") == "bar"

    def test_set_meta_upsert(self, db):
        """Writing the same key twice keeps only the latest value (upsert)."""
        for value in ("v1", "v2"):
            db.set_meta("key", value)
        assert db.get_meta("key") == "v2"
class TestVacuum:
    """Tests for the standalone vacuum() wrapper."""

    def test_vacuum_runs_without_error(self, db):
        """VACUUM must succeed on a fresh DB (no rows to reclaim)."""
        db.create_session(session_id="s1", source="cli")
        db.append_message(session_id="s1", role="user", content="hi")
        # There is nothing significant to reclaim here — the point is
        # simply that the call completes without raising.
        db.vacuum()
class TestAutoMaintenance:
    """Tests for maybe_auto_prune_and_vacuum() and its last-run marker."""

    def _make_old_ended(self, db, sid: str, days_old: int = 100):
        """Create a session that is ended and was started `days_old` days ago."""
        db.create_session(session_id=sid, source="cli")
        db.end_session(sid, end_reason="done")
        # Backdate started_at directly; there is no public API for this.
        backdated = time.time() - days_old * 86400
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (backdated, sid),
        )
        db._conn.commit()

    def test_first_run_prunes_and_vacuums(self, db):
        """With no prior marker, old ended sessions are pruned and VACUUM runs."""
        for sid in ("old1", "old2"):
            self._make_old_ended(db, sid, days_old=100)
        db.create_session(session_id="new", source="cli")  # active, must survive
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 2
        assert outcome["vacuumed"] is True
        assert outcome.get("error") is None
        # The old ended sessions are gone; the active one is untouched.
        assert db.get_session("old1") is None
        assert db.get_session("old2") is None
        assert db.get_session("new") is not None

    def test_second_call_within_interval_skips(self, db):
        """A second call inside min_interval_hours is a no-op."""
        self._make_old_ended(db, "old", days_old=100)
        run1 = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert run1["skipped"] is False
        assert run1["pruned"] == 1
        # Even with a freshly prunable session present, the second call
        # within the interval must skip without touching it.
        self._make_old_ended(db, "old2", days_old=100)
        run2 = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert run2["skipped"] is True
        assert run2["pruned"] == 0
        assert db.get_session("old2") is not None  # untouched

    def test_second_call_after_interval_runs_again(self, db):
        """Once the interval has elapsed, maintenance runs again."""
        self._make_old_ended(db, "old", days_old=100)
        db.maybe_auto_prune_and_vacuum(retention_days=90, min_interval_hours=24)
        # Backdate the last-run marker to force another run.
        two_days_ago = time.time() - 48 * 3600
        db.set_meta("last_auto_prune", str(two_days_ago))
        self._make_old_ended(db, "old2", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 1
        assert db.get_session("old2") is None

    def test_no_prunable_sessions_no_vacuum(self, db):
        """When prune deletes 0 rows, VACUUM is skipped (wasted I/O)."""
        db.create_session(session_id="fresh", source="cli")  # too recent
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 0
        assert outcome["vacuumed"] is False
        # The last-run marker is still recorded so we don't retry immediately.
        assert db.get_meta("last_auto_prune") is not None

    def test_vacuum_disabled_via_flag(self, db):
        """vacuum=False prunes but never runs VACUUM."""
        self._make_old_ended(db, "old", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90, vacuum=False)
        assert outcome["pruned"] == 1
        assert outcome["vacuumed"] is False

    def test_corrupt_last_run_marker_treated_as_no_prior_run(self, db):
        """A non-numeric marker must not break maintenance."""
        db.set_meta("last_auto_prune", "not-a-timestamp")
        self._make_old_ended(db, "old", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 1

    def test_state_meta_survives_vacuum(self, db):
        """Marker written just before VACUUM must still be readable after."""
        self._make_old_ended(db, "old", days_old=100)
        db.maybe_auto_prune_and_vacuum(retention_days=90)
        marker = db.get_meta("last_auto_prune")
        assert marker is not None
        # It should parse as a float timestamp close to now.
        assert abs(float(marker) - time.time()) < 60