feat(checkpoints): auto-prune orphan and stale shadow repos at startup (#16303)

Every working dir hermes ever touches gets its own shadow git repo under
~/.hermes/checkpoints/{sha256(abs_dir)[:16]}/.  The per-repo _prune is a
no-op (comment in CheckpointManager._prune says so), so abandoned repos
from deleted/moved projects or one-off tmp dirs pile up forever.  Field
reports put the typical offender at 1000+ repos / ~12 GB on active
contributor machines.

Adds an opt-in startup sweep that mirrors the sessions.auto_prune
pattern from #13861 / #16286:

- tools/checkpoint_manager.py: new prune_checkpoints() and
  maybe_auto_prune_checkpoints() helpers.  Deletes shadow repos that
  are orphan (HERMES_WORKDIR marker points to a path that no longer
  exists) or stale (newest in-repo mtime older than retention_days).
  Rate-limited via a CHECKPOINT_BASE/.last_prune marker file so the
  sweep runs at most once per min_interval_hours regardless of how many
  hermes processes start up.
- hermes_cli/config.py: new checkpoints.auto_prune /
  retention_days / delete_orphans / min_interval_hours knobs.
  Default auto_prune: false so users who rely on /rollback against
  long-ago sessions never lose data silently.
- cli.py / gateway/run.py: startup hooks gated on checkpoints.auto_prune,
  called right next to the existing state.db maintenance block.
- Docs updated with the new config knobs.
- 11 regression tests: orphan/stale deletion, precedence, byte-freed
  tracking, non-shadow dir skip, interval gating, corrupt marker
  recovery.

Refs #3015 (session-file disk growth was fixed in #16286; this covers
the checkpoint side noted out-of-scope there).
This commit is contained in:
Teknium 2026-04-26 19:05:52 -07:00 committed by GitHub
parent ced8f44cd2
commit 478444c262
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 458 additions and 0 deletions

View file

@ -717,3 +717,193 @@ class TestGpgAndGlobalConfigIsolation:
mgr = CheckpointManager(enabled=True)
assert mgr.ensure_checkpoint(str(work_dir), reason="prefix-shadow") is True
assert len(mgr.list_checkpoints(str(work_dir))) == 1
# =========================================================================
# Auto-maintenance: prune_checkpoints + maybe_auto_prune_checkpoints
# =========================================================================
class TestPruneCheckpoints:
    """Sweep orphan/stale shadow repos under CHECKPOINT_BASE (issue #3015 follow-up)."""

    def _seed_shadow_repo(
        self,
        base: Path,
        dir_hash: str,
        workdir: Path,
        mtime: "float | None" = None,
    ) -> Path:
        """Create a minimal shadow repo on disk without invoking real git.

        Args:
            base: Directory standing in for CHECKPOINT_BASE; created on demand.
            dir_hash: Name of the shadow repo directory created under ``base``.
            workdir: Path recorded in the ``HERMES_WORKDIR`` marker file.  It
                does not have to exist on disk.
            mtime: When given, every entry inside the repo and the repo
                directory itself get this value as access and modification
                time.  ``None`` leaves the timestamps untouched.

        Returns:
            Path of the freshly seeded shadow repo.
        """
        import os

        shadow = base / dir_hash
        shadow.mkdir(parents=True)
        (shadow / "HEAD").write_text("ref: refs/heads/main\n")
        (shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        (shadow / "info").mkdir()
        (shadow / "info" / "exclude").write_text("node_modules/\n")
        if mtime is not None:
            # Back-date every entry first, then the repo directory itself.
            for entry in shadow.rglob("*"):
                os.utime(entry, (mtime, mtime))
            os.utime(shadow, (mtime, mtime))
        return shadow

    def test_deletes_orphan_when_workdir_missing(self, tmp_path):
        """A repo whose recorded workdir is missing is deleted; a live one is kept."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        alive_work = tmp_path / "alive"
        alive_work.mkdir()
        alive_repo = self._seed_shadow_repo(base, "aaaa" * 4, alive_work)
        orphan_repo = self._seed_shadow_repo(
            base, "bbbb" * 4, tmp_path / "was-deleted"
        )

        result = prune_checkpoints(retention_days=0, checkpoint_base=base)

        assert result["scanned"] == 2
        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0
        assert alive_repo.exists()
        assert not orphan_repo.exists()

    def test_deletes_stale_by_mtime_when_workdir_alive(self, tmp_path):
        """A repo back-dated past retention_days is deleted even though its workdir exists."""
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        work = tmp_path / "work"
        work.mkdir()
        fresh_repo = self._seed_shadow_repo(base, "cccc" * 4, work)
        stale_work = tmp_path / "stale_work"
        stale_work.mkdir()
        old = _time.time() - 60 * 86400  # 60 days ago
        stale_repo = self._seed_shadow_repo(base, "dddd" * 4, stale_work, mtime=old)

        result = prune_checkpoints(
            retention_days=30, delete_orphans=False, checkpoint_base=base
        )

        assert result["deleted_orphan"] == 0
        assert result["deleted_stale"] == 1
        assert fresh_repo.exists()
        assert not stale_repo.exists()

    def test_orphan_takes_priority_over_stale(self, tmp_path):
        """Orphan detection counts first — reason="orphan" even if also stale."""
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        old = _time.time() - 60 * 86400
        self._seed_shadow_repo(base, "eeee" * 4, tmp_path / "gone", mtime=old)

        result = prune_checkpoints(retention_days=30, checkpoint_base=base)

        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0

    def test_delete_orphans_disabled_keeps_orphans(self, tmp_path):
        """With delete_orphans=False an orphan repo is neither counted nor removed."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "ffff" * 4, tmp_path / "gone")

        result = prune_checkpoints(
            retention_days=0, delete_orphans=False, checkpoint_base=base
        )

        assert result["deleted_orphan"] == 0
        assert orphan.exists()

    def test_skips_non_shadow_dirs(self, tmp_path):
        """Dirs without HEAD (non-initialised) are left alone."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / "garbage-dir").mkdir()
        (base / "garbage-dir" / "random.txt").write_text("hi")

        result = prune_checkpoints(retention_days=0, checkpoint_base=base)

        assert result["scanned"] == 0
        assert (base / "garbage-dir").exists()

    def test_tracks_bytes_freed(self, tmp_path):
        """bytes_freed covers at least the payload of the deleted repo."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "1234" * 4, tmp_path / "gone")
        (orphan / "objects").mkdir()
        (orphan / "objects" / "pack.bin").write_bytes(b"x" * 5000)

        result = prune_checkpoints(retention_days=0, checkpoint_base=base)

        assert result["deleted_orphan"] == 1
        assert result["bytes_freed"] >= 5000

    def test_base_missing_returns_empty_counts(self, tmp_path):
        """A nonexistent checkpoint base yields an all-zero result dict."""
        from tools.checkpoint_manager import prune_checkpoints

        result = prune_checkpoints(checkpoint_base=tmp_path / "does-not-exist")

        assert result == {
            "scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
            "errors": 0, "bytes_freed": 0,
        }
class TestMaybeAutoPruneCheckpoints:
    """Exercise maybe_auto_prune_checkpoints: ``.last_prune`` marker and interval gating."""

    def _seed(self, base: Path, dir_hash: str, workdir: Path) -> Path:
        """Create a bare-bones shadow repo (HEAD + HERMES_WORKDIR) under ``base``.

        Args:
            base: Directory standing in for CHECKPOINT_BASE; created if missing.
            dir_hash: Name of the shadow repo directory created under ``base``.
            workdir: Path recorded in the ``HERMES_WORKDIR`` marker file.

        Returns:
            Path of the seeded shadow repo.
        """
        base.mkdir(parents=True, exist_ok=True)
        shadow = base / dir_hash
        shadow.mkdir()
        (shadow / "HEAD").write_text("ref: refs/heads/main\n")
        (shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        return shadow

    def test_first_call_prunes_and_writes_marker(self, tmp_path):
        """With no marker present the sweep runs, deletes the orphan and writes the marker."""
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed(base, "0000" * 4, tmp_path / "gone")

        out = maybe_auto_prune_checkpoints(checkpoint_base=base)

        assert out["skipped"] is False
        assert out["result"]["deleted_orphan"] == 1
        # The counter alone does not prove the directory left the disk.
        assert not orphan.exists()
        assert (base / ".last_prune").exists()

    def test_second_call_within_interval_skips(self, tmp_path):
        """A second call inside min_interval_hours is skipped and deletes nothing."""
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        self._seed(base, "1111" * 4, tmp_path / "gone")
        first = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )
        assert first["skipped"] is False

        self._seed(base, "2222" * 4, tmp_path / "also-gone")
        second = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )

        assert second["skipped"] is True
        # The second orphan must still exist — skip was honoured.
        assert (base / ("2222" * 4)).exists()

    def test_corrupt_marker_treated_as_no_prior_run(self, tmp_path):
        """An unparseable ``.last_prune`` marker does not block the sweep."""
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / ".last_prune").write_text("not-a-timestamp")
        orphan = self._seed(base, "3333" * 4, tmp_path / "gone")

        out = maybe_auto_prune_checkpoints(checkpoint_base=base)

        assert out["skipped"] is False
        assert out["result"]["deleted_orphan"] == 1
        assert not orphan.exists()

    def test_missing_base_no_raise(self, tmp_path):
        """A nonexistent checkpoint base is handled without raising and scans nothing."""
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        out = maybe_auto_prune_checkpoints(
            checkpoint_base=tmp_path / "does-not-exist"
        )

        assert out["skipped"] is False
        assert out["result"]["scanned"] == 0