mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(state): auto-prune old sessions + VACUUM state.db at startup (#13861)
* feat(state): auto-prune old sessions + VACUUM state.db at startup state.db accumulates every session, message, and FTS5 index entry forever. A heavy user (gateway + cron) reported 384MB with 982 sessions / 68K messages causing slowdown; manual 'hermes sessions prune --older-than 7' + VACUUM brought it to 43MB. The prune command and VACUUM are not wired to run automatically anywhere — sessions grew unbounded until users noticed. Changes: - hermes_state.py: new state_meta key/value table, vacuum() method, and maybe_auto_prune_and_vacuum() — idempotent via last-run timestamp in state_meta so it only actually executes once per min_interval_hours across all Hermes processes for a given HERMES_HOME. Never raises. - hermes_cli/config.py: new 'sessions:' block in DEFAULT_CONFIG (auto_prune=True, retention_days=90, vacuum_after_prune=True, min_interval_hours=24). Added to _KNOWN_ROOT_KEYS. - cli.py: call maintenance once at HermesCLI init (shared helper _run_state_db_auto_maintenance reads config and delegates to DB). - gateway/run.py: call maintenance once at GatewayRunner init. - Docs: user-guide/sessions.md rewrites 'Automatic Cleanup' section. Why VACUUM matters: SQLite does NOT shrink the file on DELETE — freed pages get reused on next INSERT. Without VACUUM, a delete-heavy DB stays bloated forever. VACUUM only runs when the prune actually removed rows, so tight DBs don't pay the I/O cost. Tests: 10 new tests in tests/test_hermes_state.py covering state_meta, vacuum, idempotency, interval skipping, VACUUM-only-when-needed, corrupt-marker recovery. All 246 existing state/config/gateway tests still pass. Verified E2E with real imports + isolated HERMES_HOME: DEFAULT_CONFIG exposes the new block, load_config() returns it for fresh installs, first call prunes+vacuums, second call within min_interval_hours skips, and the state_meta marker persists across connection close/reopen. 
* sessions.auto_prune defaults to false (opt-in) Session history powers session_search recall across past conversations, so silently pruning on startup could surprise users. Ship the machinery disabled and let users opt in when they notice state.db is hurting performance. - DEFAULT_CONFIG.sessions.auto_prune: True → False - Call-site fallbacks in cli.py and gateway/run.py match the new default (so unmigrated configs still see off) - Docs: flip 'Enable in config.yaml' framing + tip explains the tradeoff
This commit is contained in:
parent
b43524ecab
commit
b8663813b6
6 changed files with 337 additions and 4 deletions
118
hermes_state.py
118
hermes_state.py
|
|
@ -85,6 +85,11 @@ CREATE TABLE IF NOT EXISTS messages (
|
|||
codex_reasoning_items TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS state_meta (
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_sessions_source ON sessions(source);
|
||||
CREATE INDEX IF NOT EXISTS idx_sessions_parent ON sessions(parent_session_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_sessions_started ON sessions(started_at DESC);
|
||||
|
|
@ -1455,3 +1460,116 @@ class SessionDB:
|
|||
return len(session_ids)
|
||||
|
||||
return self._execute_write(_do)
|
||||
|
||||
# ── Meta key/value (for scheduler bookkeeping) ──
|
||||
|
||||
def get_meta(self, key: str) -> Optional[str]:
    """Look up ``key`` in the state_meta key/value table.

    Args:
        key: Primary-key string identifying the entry.

    Returns:
        The stored value, or ``None`` when no row exists for ``key``.
    """
    query = "SELECT value FROM state_meta WHERE key = ?"
    with self._lock:
        found = self._conn.execute(query, (key,)).fetchone()
    if found is None:
        return None
    # Rows are addressed by column name when the connection yields
    # sqlite3.Row objects, and by position otherwise.
    if isinstance(found, sqlite3.Row):
        return found["value"]
    return found[0]
|
||||
|
||||
def set_meta(self, key: str, value: str) -> None:
    """Store ``value`` under ``key`` in the state_meta key/value table.

    An existing row for ``key`` has its value replaced (upsert). The
    statement is handed to ``self._execute_write`` for execution.

    Args:
        key: Primary-key string identifying the entry.
        value: Text to store for ``key``.
    """
    upsert_sql = (
        "INSERT INTO state_meta (key, value) VALUES (?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
    )

    def _upsert(conn):
        conn.execute(upsert_sql, (key, value))

    self._execute_write(_upsert)
|
||||
|
||||
# ── Space reclamation ──
|
||||
|
||||
def vacuum(self) -> None:
    """Run VACUUM to reclaim disk space after large deletes.

    SQLite does not shrink the database file when rows are deleted —
    freed pages just get reused on the next insert. After a prune that
    removed hundreds of sessions, the file stays bloated unless we
    explicitly VACUUM.

    VACUUM rewrites the entire DB, so it's expensive (seconds per
    100MB) and cannot run inside a transaction. It also acquires an
    exclusive lock, so callers must ensure no other writers are
    active. Safe to call at startup before the gateway/CLI starts
    serving traffic.

    Raises:
        Whatever the underlying connection raises when the VACUUM
        statement itself fails. A failed WAL checkpoint is only logged
        at debug level and does not prevent the VACUUM attempt.
    """
    with self._lock:
        # Best-effort WAL checkpoint first, then VACUUM.
        try:
            # The pragma returns a result row. Drain it so the statement
            # is finished before VACUUM runs, instead of relying on the
            # temporary cursor being garbage-collected; SQLite rejects
            # VACUUM while another statement is still in progress.
            self._conn.execute("PRAGMA wal_checkpoint(TRUNCATE)").fetchall()
        except Exception as exc:
            # Checkpointing is an optimisation only; keep going.
            logger.debug(
                "state.db WAL checkpoint before VACUUM failed: %s", exc
            )
        # VACUUM cannot be executed inside a transaction.
        self._conn.execute("VACUUM")
|
||||
|
||||
def maybe_auto_prune_and_vacuum(
    self,
    retention_days: int = 90,
    min_interval_hours: int = 24,
    vacuum: bool = True,
) -> Dict[str, Any]:
    """Idempotent auto-maintenance: prune old sessions + optional VACUUM.

    Records the last run timestamp in state_meta (key
    ``"last_auto_prune"``) so subsequent calls within
    ``min_interval_hours`` no-op. Designed to be called once at
    startup from long-lived entrypoints (CLI, gateway, cron scheduler).

    A stored timestamp that cannot be parsed, or that lies in the
    future (e.g. the system clock moved backwards, or the marker holds
    a value such as ``"inf"``), is treated as "no prior run" so that a
    bad marker cannot disable maintenance indefinitely.

    Never raises. On any failure, logs a warning and returns a dict
    with ``"error"`` set.

    Args:
        retention_days: Passed to ``prune_sessions`` as
            ``older_than_days``.
        min_interval_hours: Minimum number of hours between two
            maintenance runs.
        vacuum: When true, VACUUM after a prune that removed at least
            one session.

    Returns a dict with keys:
      - ``"skipped"`` (bool) — true if within min_interval_hours of last run
      - ``"pruned"`` (int) — number of sessions deleted
      - ``"vacuumed"`` (bool) — true if VACUUM ran
      - ``"error"`` (str, optional) — present only on failure
    """
    result: Dict[str, Any] = {"skipped": False, "pruned": 0, "vacuumed": False}
    try:
        # Skip if another process/call did maintenance recently.
        last_raw = self.get_meta("last_auto_prune")
        now = time.time()
        if last_raw:
            try:
                elapsed = now - float(last_raw)
                # Require a non-negative age: a marker from the future
                # (or "inf") would otherwise always look "recent".
                # NaN fails both comparisons and falls through as well.
                if 0 <= elapsed < min_interval_hours * 3600:
                    result["skipped"] = True
                    return result
            except (TypeError, ValueError):
                pass  # corrupt meta; treat as no prior run

        pruned = self.prune_sessions(older_than_days=retention_days)
        result["pruned"] = pruned

        # Only VACUUM if we actually freed rows — VACUUM on a tight DB
        # is wasted I/O.
        if vacuum and pruned > 0:
            try:
                self.vacuum()
                result["vacuumed"] = True
            except Exception as exc:
                # A failed VACUUM is not fatal: the prune already
                # succeeded, so report it and carry on.
                logger.warning("state.db VACUUM failed: %s", exc)

        # Record the attempt even if pruned == 0, so we don't retry
        # every startup within the min_interval_hours window.
        self.set_meta("last_auto_prune", str(now))

        if pruned > 0:
            logger.info(
                "state.db auto-maintenance: pruned %d session(s) older than %d days%s",
                pruned,
                retention_days,
                " + VACUUM" if result["vacuumed"] else "",
            )
    except Exception as exc:
        # Maintenance must never block startup. Log and return error marker.
        logger.warning("state.db auto-maintenance failed: %s", exc)
        result["error"] = str(exc)

    return result
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue