mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-30 01:41:43 +00:00
feat(checkpoints): auto-prune orphan and stale shadow repos at startup (#16303)
Every working dir hermes ever touches gets its own shadow git repo under
~/.hermes/checkpoints/{sha256(abs_dir)[:16]}/. The per-repo _prune is a
no-op (comment in CheckpointManager._prune says so), so abandoned repos
from deleted/moved projects or one-off tmp dirs pile up forever. Field
reports put the typical offender at 1000+ repos / ~12 GB on active
contributor machines.
Adds an opt-in startup sweep that mirrors the sessions.auto_prune
pattern from #13861 / #16286:
- tools/checkpoint_manager.py: new prune_checkpoints() and
maybe_auto_prune_checkpoints() helpers. Deletes shadow repos that
are orphaned (HERMES_WORKDIR marker points to a path that no longer
exists) or stale (newest in-repo mtime older than retention_days).
Idempotent via a CHECKPOINT_BASE/.last_prune marker file so it only
runs once per min_interval_hours regardless of how many hermes
processes start up.
- hermes_cli/config.py: new checkpoints.auto_prune /
retention_days / delete_orphans / min_interval_hours knobs.
Default auto_prune: false so users who rely on /rollback against
long-ago sessions never lose data silently.
- cli.py / gateway/run.py: startup hooks gated on checkpoints.auto_prune,
called right next to the existing state.db maintenance block.
- Docs updated with the new config knobs.
- 11 regression tests: orphan/stale deletion, precedence, byte-freed
tracking, non-shadow dir skip, interval gating, corrupt marker
recovery.
Refs #3015 (session-file disk growth was fixed in #16286; this covers
the checkpoint side noted out-of-scope there).
This commit is contained in:
parent
ced8f44cd2
commit
478444c262
6 changed files with 458 additions and 0 deletions
|
|
@ -717,3 +717,193 @@ class TestGpgAndGlobalConfigIsolation:
|
|||
mgr = CheckpointManager(enabled=True)
|
||||
assert mgr.ensure_checkpoint(str(work_dir), reason="prefix-shadow") is True
|
||||
assert len(mgr.list_checkpoints(str(work_dir))) == 1
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Auto-maintenance: prune_checkpoints + maybe_auto_prune_checkpoints
|
||||
# =========================================================================
|
||||
|
||||
class TestPruneCheckpoints:
|
||||
"""Sweep orphan/stale shadow repos under CHECKPOINT_BASE (issue #3015 follow-up)."""
|
||||
|
||||
def _seed_shadow_repo(
|
||||
self, base: Path, dir_hash: str, workdir: Path, mtime: float = None
|
||||
) -> Path:
|
||||
"""Create a minimal shadow repo on disk without invoking real git."""
|
||||
import time as _time
|
||||
shadow = base / dir_hash
|
||||
shadow.mkdir(parents=True)
|
||||
(shadow / "HEAD").write_text("ref: refs/heads/main\n")
|
||||
(shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
|
||||
(shadow / "info").mkdir()
|
||||
(shadow / "info" / "exclude").write_text("node_modules/\n")
|
||||
if mtime is not None:
|
||||
for p in shadow.rglob("*"):
|
||||
import os
|
||||
os.utime(p, (mtime, mtime))
|
||||
import os
|
||||
os.utime(shadow, (mtime, mtime))
|
||||
return shadow
|
||||
|
||||
def test_deletes_orphan_when_workdir_missing(self, tmp_path):
|
||||
from tools.checkpoint_manager import prune_checkpoints
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
alive_work = tmp_path / "alive"
|
||||
alive_work.mkdir()
|
||||
alive_repo = self._seed_shadow_repo(base, "aaaa" * 4, alive_work)
|
||||
orphan_repo = self._seed_shadow_repo(
|
||||
base, "bbbb" * 4, tmp_path / "was-deleted"
|
||||
)
|
||||
|
||||
result = prune_checkpoints(retention_days=0, checkpoint_base=base)
|
||||
|
||||
assert result["scanned"] == 2
|
||||
assert result["deleted_orphan"] == 1
|
||||
assert result["deleted_stale"] == 0
|
||||
assert alive_repo.exists()
|
||||
assert not orphan_repo.exists()
|
||||
|
||||
def test_deletes_stale_by_mtime_when_workdir_alive(self, tmp_path):
|
||||
from tools.checkpoint_manager import prune_checkpoints
|
||||
import time as _time
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
work = tmp_path / "work"
|
||||
work.mkdir()
|
||||
|
||||
fresh_repo = self._seed_shadow_repo(base, "cccc" * 4, work)
|
||||
stale_work = tmp_path / "stale_work"
|
||||
stale_work.mkdir()
|
||||
old = _time.time() - 60 * 86400 # 60 days ago
|
||||
stale_repo = self._seed_shadow_repo(base, "dddd" * 4, stale_work, mtime=old)
|
||||
|
||||
result = prune_checkpoints(
|
||||
retention_days=30, delete_orphans=False, checkpoint_base=base
|
||||
)
|
||||
|
||||
assert result["deleted_orphan"] == 0
|
||||
assert result["deleted_stale"] == 1
|
||||
assert fresh_repo.exists()
|
||||
assert not stale_repo.exists()
|
||||
|
||||
def test_orphan_takes_priority_over_stale(self, tmp_path):
|
||||
"""Orphan detection counts first — reason="orphan" even if also stale."""
|
||||
from tools.checkpoint_manager import prune_checkpoints
|
||||
import time as _time
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
old = _time.time() - 60 * 86400
|
||||
self._seed_shadow_repo(base, "eeee" * 4, tmp_path / "gone", mtime=old)
|
||||
|
||||
result = prune_checkpoints(retention_days=30, checkpoint_base=base)
|
||||
assert result["deleted_orphan"] == 1
|
||||
assert result["deleted_stale"] == 0
|
||||
|
||||
def test_delete_orphans_disabled_keeps_orphans(self, tmp_path):
|
||||
from tools.checkpoint_manager import prune_checkpoints
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
orphan = self._seed_shadow_repo(base, "ffff" * 4, tmp_path / "gone")
|
||||
|
||||
result = prune_checkpoints(
|
||||
retention_days=0, delete_orphans=False, checkpoint_base=base
|
||||
)
|
||||
assert result["deleted_orphan"] == 0
|
||||
assert orphan.exists()
|
||||
|
||||
def test_skips_non_shadow_dirs(self, tmp_path):
|
||||
"""Dirs without HEAD (non-initialised) are left alone."""
|
||||
from tools.checkpoint_manager import prune_checkpoints
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
base.mkdir()
|
||||
(base / "garbage-dir").mkdir()
|
||||
(base / "garbage-dir" / "random.txt").write_text("hi")
|
||||
|
||||
result = prune_checkpoints(retention_days=0, checkpoint_base=base)
|
||||
assert result["scanned"] == 0
|
||||
assert (base / "garbage-dir").exists()
|
||||
|
||||
def test_tracks_bytes_freed(self, tmp_path):
|
||||
from tools.checkpoint_manager import prune_checkpoints
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
orphan = self._seed_shadow_repo(base, "1234" * 4, tmp_path / "gone")
|
||||
(orphan / "objects").mkdir()
|
||||
(orphan / "objects" / "pack.bin").write_bytes(b"x" * 5000)
|
||||
|
||||
result = prune_checkpoints(retention_days=0, checkpoint_base=base)
|
||||
assert result["deleted_orphan"] == 1
|
||||
assert result["bytes_freed"] >= 5000
|
||||
|
||||
def test_base_missing_returns_empty_counts(self, tmp_path):
|
||||
from tools.checkpoint_manager import prune_checkpoints
|
||||
|
||||
result = prune_checkpoints(checkpoint_base=tmp_path / "does-not-exist")
|
||||
assert result == {
|
||||
"scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
|
||||
"errors": 0, "bytes_freed": 0,
|
||||
}
|
||||
|
||||
|
||||
class TestMaybeAutoPruneCheckpoints:
|
||||
def _seed(self, base, dir_hash, workdir):
|
||||
base.mkdir(parents=True, exist_ok=True)
|
||||
shadow = base / dir_hash
|
||||
shadow.mkdir()
|
||||
(shadow / "HEAD").write_text("ref: refs/heads/main\n")
|
||||
(shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
|
||||
return shadow
|
||||
|
||||
def test_first_call_prunes_and_writes_marker(self, tmp_path):
|
||||
from tools.checkpoint_manager import maybe_auto_prune_checkpoints
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
self._seed(base, "0000" * 4, tmp_path / "gone")
|
||||
|
||||
out = maybe_auto_prune_checkpoints(checkpoint_base=base)
|
||||
assert out["skipped"] is False
|
||||
assert out["result"]["deleted_orphan"] == 1
|
||||
assert (base / ".last_prune").exists()
|
||||
|
||||
def test_second_call_within_interval_skips(self, tmp_path):
|
||||
from tools.checkpoint_manager import maybe_auto_prune_checkpoints
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
self._seed(base, "1111" * 4, tmp_path / "gone")
|
||||
|
||||
first = maybe_auto_prune_checkpoints(
|
||||
checkpoint_base=base, min_interval_hours=24
|
||||
)
|
||||
assert first["skipped"] is False
|
||||
|
||||
self._seed(base, "2222" * 4, tmp_path / "also-gone")
|
||||
second = maybe_auto_prune_checkpoints(
|
||||
checkpoint_base=base, min_interval_hours=24
|
||||
)
|
||||
assert second["skipped"] is True
|
||||
# The second orphan must still exist — skip was honoured.
|
||||
assert (base / ("2222" * 4)).exists()
|
||||
|
||||
def test_corrupt_marker_treated_as_no_prior_run(self, tmp_path):
|
||||
from tools.checkpoint_manager import maybe_auto_prune_checkpoints
|
||||
|
||||
base = tmp_path / "checkpoints"
|
||||
base.mkdir()
|
||||
(base / ".last_prune").write_text("not-a-timestamp")
|
||||
self._seed(base, "3333" * 4, tmp_path / "gone")
|
||||
|
||||
out = maybe_auto_prune_checkpoints(checkpoint_base=base)
|
||||
assert out["skipped"] is False
|
||||
assert out["result"]["deleted_orphan"] == 1
|
||||
|
||||
def test_missing_base_no_raise(self, tmp_path):
|
||||
from tools.checkpoint_manager import maybe_auto_prune_checkpoints
|
||||
|
||||
out = maybe_auto_prune_checkpoints(
|
||||
checkpoint_base=tmp_path / "does-not-exist"
|
||||
)
|
||||
assert out["skipped"] is False
|
||||
assert out["result"]["scanned"] == 0
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue