mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(state): auto-prune old sessions + VACUUM state.db at startup (#13861)
* feat(state): auto-prune old sessions + VACUUM state.db at startup state.db accumulates every session, message, and FTS5 index entry forever. A heavy user (gateway + cron) reported 384MB with 982 sessions / 68K messages causing slowdown; manual 'hermes sessions prune --older-than 7' + VACUUM brought it to 43MB. The prune command and VACUUM are not wired to run automatically anywhere — sessions grew unbounded until users noticed. Changes: - hermes_state.py: new state_meta key/value table, vacuum() method, and maybe_auto_prune_and_vacuum() — idempotent via last-run timestamp in state_meta so it only actually executes once per min_interval_hours across all Hermes processes for a given HERMES_HOME. Never raises. - hermes_cli/config.py: new 'sessions:' block in DEFAULT_CONFIG (auto_prune=True, retention_days=90, vacuum_after_prune=True, min_interval_hours=24). Added to _KNOWN_ROOT_KEYS. - cli.py: call maintenance once at HermesCLI init (shared helper _run_state_db_auto_maintenance reads config and delegates to DB). - gateway/run.py: call maintenance once at GatewayRunner init. - Docs: user-guide/sessions.md rewrites 'Automatic Cleanup' section. Why VACUUM matters: SQLite does NOT shrink the file on DELETE — freed pages get reused on next INSERT. Without VACUUM, a delete-heavy DB stays bloated forever. VACUUM only runs when the prune actually removed rows, so tight DBs don't pay the I/O cost. Tests: 10 new tests in tests/test_hermes_state.py covering state_meta, vacuum, idempotency, interval skipping, VACUUM-only-when-needed, corrupt-marker recovery. All 246 existing state/config/gateway tests still pass. Verified E2E with real imports + isolated HERMES_HOME: DEFAULT_CONFIG exposes the new block, load_config() returns it for fresh installs, first call prunes+vacuums, second call within min_interval_hours skips, and the state_meta marker persists across connection close/reopen. 
* sessions.auto_prune defaults to false (opt-in) Session history powers session_search recall across past conversations, so silently pruning on startup could surprise users. Ship the machinery disabled and let users opt in when they notice state.db is hurting performance. - DEFAULT_CONFIG.sessions.auto_prune: True → False - Call-site fallbacks in cli.py and gateway/run.py match the new default (so unmigrated configs still see off) - Docs: flip 'Enable in config.yaml' framing + tip explains the tradeoff
This commit is contained in:
parent
b43524ecab
commit
b8663813b6
6 changed files with 337 additions and 4 deletions
|
|
@ -1764,3 +1764,124 @@ class TestConcurrentWriteSafety:
|
|||
assert "30" in src, (
|
||||
"SQLite timeout should be at least 30s to handle CLI/gateway lock contention"
|
||||
)
|
||||
|
||||
|
||||
# =========================================================================
# Auto-maintenance: state_meta + vacuum + maybe_auto_prune_and_vacuum
# =========================================================================
|
||||
|
||||
class TestStateMeta:
    """Behavior of the state_meta key/value table used for maintenance markers."""

    def test_get_meta_missing_returns_none(self, db):
        """A key that was never written reads back as None, not an error."""
        missing = db.get_meta("nonexistent")
        assert missing is None

    def test_set_then_get_meta(self, db):
        """A value written with set_meta round-trips through get_meta."""
        db.set_meta("foo", "bar")
        stored = db.get_meta("foo")
        assert stored == "bar"

    def test_set_meta_upsert(self, db):
        """set_meta overwrites existing value (ON CONFLICT DO UPDATE)."""
        # Two writes to the same key: only the most recent value survives.
        db.set_meta("key", "v1")
        db.set_meta("key", "v2")
        latest = db.get_meta("key")
        assert latest == "v2"
|
||||
|
||||
|
||||
class TestVacuum:
    """The vacuum() wrapper around SQLite VACUUM."""

    def test_vacuum_runs_without_error(self, db):
        """VACUUM must succeed on a fresh DB (no rows to reclaim)."""
        # Seed a minimal session + message so the DB is non-empty.
        db.create_session(session_id="s1", source="cli")
        db.append_message(session_id="s1", role="user", content="hi")
        # Even with nothing significant to reclaim, the call must not raise.
        db.vacuum()
|
||||
|
||||
|
||||
class TestAutoMaintenance:
    """maybe_auto_prune_and_vacuum: pruning, interval gating, and marker handling."""

    def _seed_expired_session(self, db, sid: str, days_old: int = 100):
        """Create a session that is ended and was started `days_old` days ago."""
        db.create_session(session_id=sid, source="cli")
        db.end_session(sid, end_reason="done")
        # Backdate started_at directly in SQL so the session looks old enough
        # to fall outside any reasonable retention window.
        backdated = time.time() - days_old * 86400
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (backdated, sid),
        )
        db._conn.commit()

    def test_first_run_prunes_and_vacuums(self, db):
        """On the very first run, old ended sessions go and VACUUM fires."""
        self._seed_expired_session(db, "old1", days_old=100)
        self._seed_expired_session(db, "old2", days_old=100)
        db.create_session(session_id="new", source="cli")  # active, must survive

        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)

        assert outcome["skipped"] is False
        assert outcome["pruned"] == 2
        assert outcome["vacuumed"] is True
        assert outcome.get("error") is None
        # Both stale sessions are gone; the live one is untouched.
        assert db.get_session("old1") is None
        assert db.get_session("old2") is None
        assert db.get_session("new") is not None

    def test_second_call_within_interval_skips(self, db):
        """A second call inside min_interval_hours is a no-op."""
        self._seed_expired_session(db, "old", days_old=100)
        first = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert first["skipped"] is False
        assert first["pruned"] == 1

        # Create another prunable session; a second call within
        # min_interval_hours should still skip without touching it.
        self._seed_expired_session(db, "old2", days_old=100)
        second = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert second["skipped"] is True
        assert second["pruned"] == 0
        assert db.get_session("old2") is not None  # untouched

    def test_second_call_after_interval_runs_again(self, db):
        """Once the last-run marker is older than the interval, pruning resumes."""
        self._seed_expired_session(db, "old", days_old=100)
        db.maybe_auto_prune_and_vacuum(retention_days=90, min_interval_hours=24)

        # Backdate the last-run marker to force another run.
        two_days_ago = time.time() - 48 * 3600
        db.set_meta("last_auto_prune", str(two_days_ago))

        self._seed_expired_session(db, "old2", days_old=100)
        rerun = db.maybe_auto_prune_and_vacuum(
            retention_days=90, min_interval_hours=24
        )
        assert rerun["skipped"] is False
        assert rerun["pruned"] == 1
        assert db.get_session("old2") is None

    def test_no_prunable_sessions_no_vacuum(self, db):
        """When prune deletes 0 rows, VACUUM is skipped (wasted I/O)."""
        db.create_session(session_id="fresh", source="cli")  # too recent
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 0
        assert outcome["vacuumed"] is False
        # But last-run is still recorded so we don't retry immediately.
        assert db.get_meta("last_auto_prune") is not None

    def test_vacuum_disabled_via_flag(self, db):
        """vacuum=False prunes but never issues a VACUUM."""
        self._seed_expired_session(db, "old", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90, vacuum=False)
        assert outcome["pruned"] == 1
        assert outcome["vacuumed"] is False

    def test_corrupt_last_run_marker_treated_as_no_prior_run(self, db):
        """A non-numeric marker must not break maintenance."""
        db.set_meta("last_auto_prune", "not-a-timestamp")
        self._seed_expired_session(db, "old", days_old=100)
        outcome = db.maybe_auto_prune_and_vacuum(retention_days=90)
        # Garbage marker is treated like "never ran": the prune proceeds.
        assert outcome["skipped"] is False
        assert outcome["pruned"] == 1

    def test_state_meta_survives_vacuum(self, db):
        """Marker written just before VACUUM must still be readable after."""
        self._seed_expired_session(db, "old", days_old=100)
        db.maybe_auto_prune_and_vacuum(retention_days=90)
        marker = db.get_meta("last_auto_prune")
        assert marker is not None
        # Should parse as a float timestamp close to now.
        assert abs(float(marker) - time.time()) < 60
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue