diff --git a/cli.py b/cli.py
index 588988d8c..9d87ff356 100644
--- a/cli.py
+++ b/cli.py
@@ -914,6 +914,32 @@ def _cleanup_worktree(info: Dict[str, str] = None) -> None:
         print(f"\033[32m✓ Worktree cleaned up: {wt_path}\033[0m")
 
 
+def _run_state_db_auto_maintenance(session_db) -> None:
+    """Call ``SessionDB.maybe_auto_prune_and_vacuum`` using current config.
+
+    Reads the ``sessions:`` section from config.yaml via
+    :func:`hermes_cli.config.load_config` (the authoritative loader that
+    deep-merges DEFAULT_CONFIG, so unmigrated configs still get default
+    values). Honours ``auto_prune`` / ``retention_days`` /
+    ``vacuum_after_prune`` / ``min_interval_hours``, and delegates to the
+    DB. Never raises — maintenance must never block interactive startup.
+    """
+    if session_db is None:
+        return
+    try:
+        from hermes_cli.config import load_config as _load_full_config
+        cfg = (_load_full_config().get("sessions") or {})
+        if not cfg.get("auto_prune", False):
+            return
+        session_db.maybe_auto_prune_and_vacuum(
+            retention_days=int(cfg.get("retention_days", 90)),
+            min_interval_hours=int(cfg.get("min_interval_hours", 24)),
+            vacuum=bool(cfg.get("vacuum_after_prune", True)),
+        )
+    except Exception as exc:
+        logger.debug("state.db auto-maintenance skipped: %s", exc)
+
+
 def _prune_stale_worktrees(repo_root: str, max_age_hours: int = 24) -> None:
     """Remove stale worktrees and orphaned branches on startup.
 
@@ -1961,7 +1987,13 @@ class HermesCLI:
             self._session_db = SessionDB()
         except Exception as e:
             logger.warning("Failed to initialize SessionDB — session will NOT be indexed for search: %s", e)
-
+
+        # Opportunistic state.db maintenance — runs at most once per
+        # min_interval_hours, tracked via state_meta in state.db itself so
+        # it's shared across all Hermes processes for this HERMES_HOME.
+        # Never blocks startup on failure.
+        _run_state_db_auto_maintenance(self._session_db)
+
         # Deferred title: stored in memory until the session is created in the DB
         self._pending_title: Optional[str] = None
 
diff --git a/gateway/run.py b/gateway/run.py
index db99ad087..ad907f623 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -710,7 +710,26 @@ class GatewayRunner:
             self._session_db = SessionDB()
         except Exception as e:
             logger.debug("SQLite session store not available: %s", e)
-
+
+        # Opportunistic state.db maintenance: prune ended sessions older
+        # than sessions.retention_days + optional VACUUM. Tracks last-run
+        # in state_meta so it only actually executes once per
+        # sessions.min_interval_hours. Gateway is long-lived so blocking
+        # a few seconds once per day is acceptable; failures are logged
+        # but never raised.
+        if self._session_db is not None:
+            try:
+                from hermes_cli.config import load_config as _load_full_config
+                _sess_cfg = (_load_full_config().get("sessions") or {})
+                if _sess_cfg.get("auto_prune", False):
+                    self._session_db.maybe_auto_prune_and_vacuum(
+                        retention_days=int(_sess_cfg.get("retention_days", 90)),
+                        min_interval_hours=int(_sess_cfg.get("min_interval_hours", 24)),
+                        vacuum=bool(_sess_cfg.get("vacuum_after_prune", True)),
+                    )
+            except Exception as exc:
+                logger.debug("state.db auto-maintenance skipped: %s", exc)
+
         # DM pairing store for code-based user authorization
         from gateway.pairing import PairingStore
         self.pairing_store = PairingStore()
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index ebeace304..81275a7f9 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -893,6 +893,34 @@ DEFAULT_CONFIG = {
         "force_ipv4": False,
     },
 
+    # Session storage — controls automatic cleanup of ~/.hermes/state.db.
+    # state.db accumulates every session, message, tool call, and FTS5 index
+    # entry forever. Without auto-pruning, a heavy user (gateway + cron)
+    # reports 384MB+ databases with 68K+ messages, which slows down FTS5
+    # inserts, /resume listing, and insights queries.
+    "sessions": {
+        # When true, prune ended sessions older than retention_days once
+        # per (roughly) min_interval_hours at CLI/gateway/cron startup.
+        # Only touches ended sessions — active sessions are always preserved.
+        # Default false: session history is valuable for search recall, and
+        # silently deleting it could surprise users. Opt in explicitly.
+        "auto_prune": False,
+        # How many days of ended-session history to keep. Matches the
+        # default of ``hermes sessions prune``.
+        "retention_days": 90,
+        # VACUUM after a prune that actually deleted rows. SQLite does not
+        # reclaim disk space on DELETE — freed pages are just reused on
+        # subsequent INSERTs — so without VACUUM the file stays bloated
+        # even after pruning. VACUUM blocks writes for a few seconds per
+        # 100MB, so it only runs at startup, and only when prune deleted
+        # ≥1 session.
+        "vacuum_after_prune": True,
+        # Minimum hours between auto-maintenance runs (avoids repeating
+        # the sweep on every CLI invocation). Tracked via state_meta in
+        # state.db itself, so it's shared across all processes.
+        "min_interval_hours": 24,
+    },
+
     # Config schema version - bump this when adding new required fields
     "_config_version": 22,
 }
@@ -2118,6 +2146,7 @@ _KNOWN_ROOT_KEYS = {
     "fallback_providers", "credential_pool_strategies", "toolsets",
     "agent", "terminal", "display", "compression", "delegation",
     "auxiliary", "custom_providers", "context", "memory", "gateway",
+    "sessions",
 }
 
 # Valid fields inside a custom_providers list entry
diff --git a/hermes_state.py b/hermes_state.py
index 46f3de6fd..7d17747f4 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -85,6 +85,11 @@ CREATE TABLE IF NOT EXISTS messages (
     codex_reasoning_items TEXT
 );
 
+CREATE TABLE IF NOT EXISTS state_meta (
+    key TEXT PRIMARY KEY,
+    value TEXT
+);
+
 CREATE INDEX IF NOT EXISTS idx_sessions_source ON sessions(source);
 CREATE INDEX IF NOT EXISTS idx_sessions_parent ON sessions(parent_session_id);
 CREATE INDEX IF NOT EXISTS idx_sessions_started ON sessions(started_at DESC);
@@ -1455,3 +1460,116 @@ class SessionDB:
                 return len(session_ids)
 
         return self._execute_write(_do)
+
+    # ── Meta key/value (for scheduler bookkeeping) ──
+
+    def get_meta(self, key: str) -> Optional[str]:
+        """Read a value from the state_meta key/value store."""
+        with self._lock:
+            row = self._conn.execute(
+                "SELECT value FROM state_meta WHERE key = ?", (key,)
+            ).fetchone()
+        if row is None:
+            return None
+        return row["value"] if isinstance(row, sqlite3.Row) else row[0]
+
+    def set_meta(self, key: str, value: str) -> None:
+        """Write a value to the state_meta key/value store."""
+        def _do(conn):
+            conn.execute(
+                "INSERT INTO state_meta (key, value) VALUES (?, ?) "
+                "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
+                (key, value),
+            )
+        self._execute_write(_do)
+
+    # ── Space reclamation ──
+
+    def vacuum(self) -> None:
+        """Run VACUUM to reclaim disk space after large deletes.
+
+        SQLite does not shrink the database file when rows are deleted —
+        freed pages just get reused on the next insert. After a prune that
+        removed hundreds of sessions, the file stays bloated unless we
+        explicitly VACUUM.
+
+        VACUUM rewrites the entire DB, so it's expensive (seconds per
+        100MB) and cannot run inside a transaction. It also acquires an
+        exclusive lock, so callers must ensure no other writers are
+        active. Safe to call at startup before the gateway/CLI starts
+        serving traffic.
+        """
+        # VACUUM cannot be executed inside a transaction.
+        with self._lock:
+            # Best-effort WAL checkpoint first, then VACUUM.
+            try:
+                self._conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
+            except Exception:
+                pass
+            self._conn.execute("VACUUM")
+
+    def maybe_auto_prune_and_vacuum(
+        self,
+        retention_days: int = 90,
+        min_interval_hours: int = 24,
+        vacuum: bool = True,
+    ) -> Dict[str, Any]:
+        """Idempotent auto-maintenance: prune old sessions + optional VACUUM.
+
+        Records the last run timestamp in state_meta so subsequent calls
+        within ``min_interval_hours`` no-op. Designed to be called once at
+        startup from long-lived entrypoints (CLI, gateway, cron scheduler).
+
+        Never raises. On any failure, logs a warning and returns a dict
+        with ``"error"`` set.
+
+        Returns a dict with keys:
+        - ``"skipped"`` (bool) — true if within min_interval_hours of last run
+        - ``"pruned"`` (int) — number of sessions deleted
+        - ``"vacuumed"`` (bool) — true if VACUUM ran
+        - ``"error"`` (str, optional) — present only on failure
+        """
+        result: Dict[str, Any] = {"skipped": False, "pruned": 0, "vacuumed": False}
+        try:
+            # Skip if another process/call did maintenance recently.
+            last_raw = self.get_meta("last_auto_prune")
+            now = time.time()
+            if last_raw:
+                try:
+                    last_ts = float(last_raw)
+                    if now - last_ts < min_interval_hours * 3600:
+                        result["skipped"] = True
+                        return result
+                except (TypeError, ValueError):
+                    pass  # corrupt meta; treat as no prior run
+
+            pruned = self.prune_sessions(older_than_days=retention_days)
+            result["pruned"] = pruned
+
+            # Only VACUUM if we actually freed rows — VACUUM on a tight DB
+            # is wasted I/O. Threshold keeps small DBs from paying the cost.
+            if vacuum and pruned > 0:
+                try:
+                    self.vacuum()
+                    result["vacuumed"] = True
+                except Exception as exc:
+                    logger.warning("state.db VACUUM failed: %s", exc)
+
+            # Record the attempt even if pruned == 0, so we don't retry
+            # every startup within the min_interval_hours window.
+            self.set_meta("last_auto_prune", str(now))
+
+            if pruned > 0:
+                logger.info(
+                    "state.db auto-maintenance: pruned %d session(s) older than %d days%s",
+                    pruned,
+                    retention_days,
+                    " + VACUUM" if result["vacuumed"] else "",
+                )
+        except Exception as exc:
+            # Maintenance must never block startup. Log and return error marker.
+            logger.warning("state.db auto-maintenance failed: %s", exc)
+            result["error"] = str(exc)
+
+        return result
+
diff --git a/tests/test_hermes_state.py b/tests/test_hermes_state.py
index 49fea324d..0dd87e292 100644
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@@ -1764,3 +1764,124 @@
         assert "30" in src, (
             "SQLite timeout should be at least 30s to handle CLI/gateway lock contention"
         )
+
+
+# =========================================================================
+# Auto-maintenance: state_meta + vacuum + maybe_auto_prune_and_vacuum
+# =========================================================================
+
+class TestStateMeta:
+    def test_get_meta_missing_returns_none(self, db):
+        assert db.get_meta("nonexistent") is None
+
+    def test_set_then_get_meta(self, db):
+        db.set_meta("foo", "bar")
+        assert db.get_meta("foo") == "bar"
+
+    def test_set_meta_upsert(self, db):
+        """set_meta overwrites existing value (ON CONFLICT DO UPDATE)."""
+        db.set_meta("key", "v1")
+        db.set_meta("key", "v2")
+        assert db.get_meta("key") == "v2"
+
+
+class TestVacuum:
+    def test_vacuum_runs_without_error(self, db):
+        """VACUUM must succeed on a fresh DB (no rows to reclaim)."""
+        db.create_session(session_id="s1", source="cli")
+        db.append_message(session_id="s1", role="user", content="hi")
+        # Should not raise, even though there's nothing significant to reclaim.
+        db.vacuum()
+
+
+class TestAutoMaintenance:
+    def _make_old_ended(self, db, sid: str, days_old: int = 100):
+        """Create a session that is ended and was started `days_old` days ago."""
+        db.create_session(session_id=sid, source="cli")
+        db.end_session(sid, end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - days_old * 86400, sid),
+        )
+        db._conn.commit()
+
+    def test_first_run_prunes_and_vacuums(self, db):
+        self._make_old_ended(db, "old1", days_old=100)
+        self._make_old_ended(db, "old2", days_old=100)
+        db.create_session(session_id="new", source="cli")  # active, must survive
+
+        result = db.maybe_auto_prune_and_vacuum(retention_days=90)
+        assert result["skipped"] is False
+        assert result["pruned"] == 2
+        assert result["vacuumed"] is True
+        assert result.get("error") is None
+        assert db.get_session("old1") is None
+        assert db.get_session("old2") is None
+        assert db.get_session("new") is not None
+
+    def test_second_call_within_interval_skips(self, db):
+        self._make_old_ended(db, "old", days_old=100)
+        first = db.maybe_auto_prune_and_vacuum(
+            retention_days=90, min_interval_hours=24
+        )
+        assert first["skipped"] is False
+        assert first["pruned"] == 1
+
+        # Create another prunable session; a second call within
+        # min_interval_hours should still skip without touching it.
+        self._make_old_ended(db, "old2", days_old=100)
+        second = db.maybe_auto_prune_and_vacuum(
+            retention_days=90, min_interval_hours=24
+        )
+        assert second["skipped"] is True
+        assert second["pruned"] == 0
+        assert db.get_session("old2") is not None  # untouched
+
+    def test_second_call_after_interval_runs_again(self, db):
+        self._make_old_ended(db, "old", days_old=100)
+        db.maybe_auto_prune_and_vacuum(retention_days=90, min_interval_hours=24)
+
+        # Backdate the last-run marker to force another run.
+        db.set_meta("last_auto_prune", str(time.time() - 48 * 3600))
+
+        self._make_old_ended(db, "old2", days_old=100)
+        result = db.maybe_auto_prune_and_vacuum(
+            retention_days=90, min_interval_hours=24
+        )
+        assert result["skipped"] is False
+        assert result["pruned"] == 1
+        assert db.get_session("old2") is None
+
+    def test_no_prunable_sessions_no_vacuum(self, db):
+        """When prune deletes 0 rows, VACUUM is skipped (wasted I/O)."""
+        db.create_session(session_id="fresh", source="cli")  # too recent
+        result = db.maybe_auto_prune_and_vacuum(retention_days=90)
+        assert result["skipped"] is False
+        assert result["pruned"] == 0
+        assert result["vacuumed"] is False
+        # But last-run is still recorded so we don't retry immediately.
+        assert db.get_meta("last_auto_prune") is not None
+
+    def test_vacuum_disabled_via_flag(self, db):
+        self._make_old_ended(db, "old", days_old=100)
+        result = db.maybe_auto_prune_and_vacuum(retention_days=90, vacuum=False)
+        assert result["pruned"] == 1
+        assert result["vacuumed"] is False
+
+    def test_corrupt_last_run_marker_treated_as_no_prior_run(self, db):
+        """A non-numeric marker must not break maintenance."""
+        db.set_meta("last_auto_prune", "not-a-timestamp")
+        self._make_old_ended(db, "old", days_old=100)
+        result = db.maybe_auto_prune_and_vacuum(retention_days=90)
+        assert result["skipped"] is False
+        assert result["pruned"] == 1
+
+    def test_state_meta_survives_vacuum(self, db):
+        """Marker written just before VACUUM must still be readable after."""
+        self._make_old_ended(db, "old", days_old=100)
+        db.maybe_auto_prune_and_vacuum(retention_days=90)
+        marker = db.get_meta("last_auto_prune")
+        assert marker is not None
+        # Should parse as a float timestamp close to now.
+        assert abs(float(marker) - time.time()) < 60
+
diff --git a/website/docs/user-guide/sessions.md b/website/docs/user-guide/sessions.md
index bd1007859..a60f35776 100644
--- a/website/docs/user-guide/sessions.md
+++ b/website/docs/user-guide/sessions.md
@@ -386,7 +386,21 @@ Key tables in `state.db`:
 
 - Gateway sessions auto-reset based on the configured reset policy
 - Before reset, the agent saves memories and skills from the expiring session
-- Ended sessions remain in the database until pruned
+- Opt-in auto-pruning: when `sessions.auto_prune` is `true`, ended sessions older than `sessions.retention_days` (default 90) are pruned at CLI/gateway startup
+- After a prune that actually removed rows, `state.db` is `VACUUM`ed to reclaim disk space (SQLite does not shrink the file on plain DELETE)
+- Pruning runs at most once per `sessions.min_interval_hours` (default 24); the last-run timestamp is tracked inside `state.db` itself so it's shared across every Hermes process in the same `HERMES_HOME`
+
+Default is **off** — session history is valuable for `session_search` recall, and silently deleting it could surprise users. Enable in `~/.hermes/config.yaml`:
+
+```yaml
+sessions:
+  auto_prune: true          # opt in — default is false
+  retention_days: 90        # keep ended sessions this many days
+  vacuum_after_prune: true  # reclaim disk space after a pruning sweep
+  min_interval_hours: 24    # don't re-run the sweep more often than this
+```
+
+Active sessions are never auto-pruned, regardless of age.
 
 ### Manual Cleanup
 
@@ -403,5 +417,5 @@
 hermes sessions prune --older-than 30 --yes
 ```
 :::tip
-The database grows slowly (typical: 10-15 MB for hundreds of sessions). Pruning is mainly useful for removing old conversations you no longer need for search recall.
+The database grows slowly (typical: 10-15 MB for hundreds of sessions) and session history powers `session_search` recall across past conversations, so auto-prune ships disabled. Enable it if you're running a heavy gateway/cron workload where `state.db` is meaningfully affecting performance (observed failure mode: 384 MB state.db with ~1000 sessions slowing down FTS5 inserts and `/resume` listing). Use `hermes sessions prune` for one-off cleanup without turning on the automatic sweep.
 :::