def _run_checkpoint_auto_maintenance() -> None:
    """Run ``checkpoint_manager.maybe_auto_prune_checkpoints`` with current config.

    Reads the ``checkpoints:`` section from config.yaml via
    :func:`hermes_cli.config.load_config` and honours ``auto_prune`` /
    ``retention_days`` / ``delete_orphans`` / ``min_interval_hours``.
    Never raises — maintenance must never block interactive startup.
    """
    try:
        from hermes_cli.config import load_config

        section = load_config().get("checkpoints") or {}
        # Opt-in feature: bail out early unless the user enabled it.
        if section.get("auto_prune", False):
            from tools.checkpoint_manager import maybe_auto_prune_checkpoints

            maybe_auto_prune_checkpoints(
                retention_days=int(section.get("retention_days", 7)),
                min_interval_hours=int(section.get("min_interval_hours", 24)),
                delete_orphans=bool(section.get("delete_orphans", True)),
            )
    except Exception as exc:
        logger.debug("checkpoint auto-maintenance skipped: %s", exc)
+ _run_checkpoint_auto_maintenance() + # Deferred title: stored in memory until the session is created in the DB self._pending_title: Optional[str] = None diff --git a/gateway/run.py b/gateway/run.py index 3305c20ad0..137347bf4e 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -768,6 +768,22 @@ class GatewayRunner: except Exception as exc: logger.debug("state.db auto-maintenance skipped: %s", exc) + # Opportunistic shadow-repo cleanup — deletes orphan/stale + # checkpoint repos under ~/.hermes/checkpoints/. Opt-in via + # checkpoints.auto_prune, idempotent via .last_prune marker. + try: + from hermes_cli.config import load_config as _load_full_config + _ckpt_cfg = (_load_full_config().get("checkpoints") or {}) + if _ckpt_cfg.get("auto_prune", False): + from tools.checkpoint_manager import maybe_auto_prune_checkpoints + maybe_auto_prune_checkpoints( + retention_days=int(_ckpt_cfg.get("retention_days", 7)), + min_interval_hours=int(_ckpt_cfg.get("min_interval_hours", 24)), + delete_orphans=bool(_ckpt_cfg.get("delete_orphans", True)), + ) + except Exception as exc: + logger.debug("checkpoint auto-maintenance skipped: %s", exc) + # DM pairing store for code-based user authorization from gateway.pairing import PairingStore self.pairing_store = PairingStore() diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 2391f0e309..e061fff62c 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -487,6 +487,19 @@ DEFAULT_CONFIG = { "checkpoints": { "enabled": True, "max_snapshots": 50, # Max checkpoints to keep per directory + # Auto-maintenance: shadow repos accumulate forever under + # ~/.hermes/checkpoints/ (one per cd'd working directory). Field + # reports put the typical offender at 1000+ repos / ~12 GB. 
class TestPruneCheckpoints:
    """Sweep orphan/stale shadow repos under CHECKPOINT_BASE (issue #3015 follow-up)."""

    def _seed_shadow_repo(
        self, base: Path, dir_hash: str, workdir: Path, mtime: "float | None" = None
    ) -> Path:
        """Create a minimal shadow repo on disk without invoking real git.

        When ``mtime`` is given, every path in the repo (and the repo dir
        itself) is back-dated to it so the staleness sweep can be exercised.
        """
        import os

        shadow = base / dir_hash
        shadow.mkdir(parents=True)
        (shadow / "HEAD").write_text("ref: refs/heads/main\n")
        (shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        (shadow / "info").mkdir()
        (shadow / "info" / "exclude").write_text("node_modules/\n")
        if mtime is not None:
            for p in shadow.rglob("*"):
                os.utime(p, (mtime, mtime))
            os.utime(shadow, (mtime, mtime))
        return shadow

    def test_deletes_orphan_when_workdir_missing(self, tmp_path):
        """A repo whose HERMES_WORKDIR no longer exists is deleted as orphan."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        alive_work = tmp_path / "alive"
        alive_work.mkdir()
        alive_repo = self._seed_shadow_repo(base, "aaaa" * 4, alive_work)
        orphan_repo = self._seed_shadow_repo(
            base, "bbbb" * 4, tmp_path / "was-deleted"
        )

        result = prune_checkpoints(retention_days=0, checkpoint_base=base)

        assert result["scanned"] == 2
        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0
        assert alive_repo.exists()
        assert not orphan_repo.exists()

    def test_deletes_stale_by_mtime_when_workdir_alive(self, tmp_path):
        """A repo older than retention_days is deleted even if its workdir lives."""
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        work = tmp_path / "work"
        work.mkdir()

        fresh_repo = self._seed_shadow_repo(base, "cccc" * 4, work)
        stale_work = tmp_path / "stale_work"
        stale_work.mkdir()
        old = _time.time() - 60 * 86400  # 60 days ago
        stale_repo = self._seed_shadow_repo(base, "dddd" * 4, stale_work, mtime=old)

        result = prune_checkpoints(
            retention_days=30, delete_orphans=False, checkpoint_base=base
        )

        assert result["deleted_orphan"] == 0
        assert result["deleted_stale"] == 1
        assert fresh_repo.exists()
        assert not stale_repo.exists()

    def test_orphan_takes_priority_over_stale(self, tmp_path):
        """Orphan detection counts first — reason="orphan" even if also stale."""
        from tools.checkpoint_manager import prune_checkpoints
        import time as _time

        base = tmp_path / "checkpoints"
        old = _time.time() - 60 * 86400
        self._seed_shadow_repo(base, "eeee" * 4, tmp_path / "gone", mtime=old)

        result = prune_checkpoints(retention_days=30, checkpoint_base=base)
        assert result["deleted_orphan"] == 1
        assert result["deleted_stale"] == 0

    def test_delete_orphans_disabled_keeps_orphans(self, tmp_path):
        """With delete_orphans=False an orphan repo must survive the sweep."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "ffff" * 4, tmp_path / "gone")

        result = prune_checkpoints(
            retention_days=0, delete_orphans=False, checkpoint_base=base
        )
        assert result["deleted_orphan"] == 0
        assert orphan.exists()

    def test_skips_non_shadow_dirs(self, tmp_path):
        """Dirs without HEAD (non-initialised) are left alone."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / "garbage-dir").mkdir()
        (base / "garbage-dir" / "random.txt").write_text("hi")

        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["scanned"] == 0
        assert (base / "garbage-dir").exists()

    def test_tracks_bytes_freed(self, tmp_path):
        """bytes_freed reflects the on-disk size of what was removed."""
        from tools.checkpoint_manager import prune_checkpoints

        base = tmp_path / "checkpoints"
        orphan = self._seed_shadow_repo(base, "1234" * 4, tmp_path / "gone")
        (orphan / "objects").mkdir()
        (orphan / "objects" / "pack.bin").write_bytes(b"x" * 5000)

        result = prune_checkpoints(retention_days=0, checkpoint_base=base)
        assert result["deleted_orphan"] == 1
        assert result["bytes_freed"] >= 5000

    def test_base_missing_returns_empty_counts(self, tmp_path):
        """A non-existent checkpoint base yields all-zero counters, no error."""
        from tools.checkpoint_manager import prune_checkpoints

        result = prune_checkpoints(checkpoint_base=tmp_path / "does-not-exist")
        assert result == {
            "scanned": 0, "deleted_orphan": 0, "deleted_stale": 0,
            "errors": 0, "bytes_freed": 0,
        }


class TestMaybeAutoPruneCheckpoints:
    """Idempotency-marker behaviour of the startup wrapper."""

    def _seed(self, base: Path, dir_hash: str, workdir: Path) -> Path:
        """Minimal shadow repo (HEAD + HERMES_WORKDIR only) under ``base``."""
        base.mkdir(parents=True, exist_ok=True)
        shadow = base / dir_hash
        shadow.mkdir()
        (shadow / "HEAD").write_text("ref: refs/heads/main\n")
        (shadow / "HERMES_WORKDIR").write_text(str(workdir) + "\n")
        return shadow

    def test_first_call_prunes_and_writes_marker(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        self._seed(base, "0000" * 4, tmp_path / "gone")

        out = maybe_auto_prune_checkpoints(checkpoint_base=base)
        assert out["skipped"] is False
        assert out["result"]["deleted_orphan"] == 1
        assert (base / ".last_prune").exists()

    def test_second_call_within_interval_skips(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        self._seed(base, "1111" * 4, tmp_path / "gone")

        first = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )
        assert first["skipped"] is False

        self._seed(base, "2222" * 4, tmp_path / "also-gone")
        second = maybe_auto_prune_checkpoints(
            checkpoint_base=base, min_interval_hours=24
        )
        assert second["skipped"] is True
        # The second orphan must still exist — skip was honoured.
        assert (base / ("2222" * 4)).exists()

    def test_corrupt_marker_treated_as_no_prior_run(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        base = tmp_path / "checkpoints"
        base.mkdir()
        (base / ".last_prune").write_text("not-a-timestamp")
        self._seed(base, "3333" * 4, tmp_path / "gone")

        out = maybe_auto_prune_checkpoints(checkpoint_base=base)
        assert out["skipped"] is False
        assert out["result"]["deleted_orphan"] == 1

    def test_missing_base_no_raise(self, tmp_path):
        from tools.checkpoint_manager import maybe_auto_prune_checkpoints

        out = maybe_auto_prune_checkpoints(
            checkpoint_base=tmp_path / "does-not-exist"
        )
        assert out["skipped"] is False
        assert out["result"]["scanned"] == 0
# ---------------------------------------------------------------------------
# Auto-maintenance (issue #3015 follow-up)
# ---------------------------------------------------------------------------
#
# Every working directory the agent has ever touched gets its own shadow
# repo under CHECKPOINT_BASE.  Per-repo ``_prune`` is a no-op (see comment
# in CheckpointManager._prune), so abandoned repos (deleted projects,
# one-off tmp dirs, long-stale work trees) accumulate forever.  Field
# reports put the typical offender at 1000+ repos / ~12 GB on active
# contributor machines.
#
# ``prune_checkpoints`` sweeps CHECKPOINT_BASE at startup, deleting shadow
# repos that match either criterion:
#   * orphan: the ``HERMES_WORKDIR`` path no longer exists on disk
#   * stale:  the repo's newest mtime is older than ``retention_days``
#
# ``maybe_auto_prune_checkpoints`` wraps it with an idempotency marker
# (``CHECKPOINT_BASE/.last_prune``) so calling it on every CLI/gateway
# startup is free after the first run of the day.  Opt-in via
# ``checkpoints.auto_prune`` in config.yaml — default off so users who
# rely on ``/rollback`` against long-ago sessions never lose data
# silently.

import time

_PRUNE_MARKER_NAME = ".last_prune"


def _empty_prune_result() -> Dict[str, int]:
    """Fresh all-zero counters in the shape ``prune_checkpoints`` returns."""
    return {
        "scanned": 0,
        "deleted_orphan": 0,
        "deleted_stale": 0,
        "errors": 0,
        "bytes_freed": 0,
    }


def _read_workdir_marker(shadow_repo: Path) -> Optional[str]:
    """Read ``HERMES_WORKDIR`` from a shadow repo, or None if missing/unreadable."""
    try:
        return (shadow_repo / "HERMES_WORKDIR").read_text(encoding="utf-8").strip()
    except (OSError, UnicodeDecodeError):
        return None


def _shadow_repo_newest_mtime(shadow_repo: Path) -> float:
    """Return newest mtime across the shadow repo (walks objects/refs/HEAD).

    We walk instead of trusting the directory mtime because git's pack
    operations can leave the top-level dir untouched while refs/objects
    inside get updated.  Best-effort — returns 0.0 on any error.
    """
    newest = 0.0
    try:
        for entry in shadow_repo.rglob("*"):
            try:
                stamp = entry.stat().st_mtime
            except OSError:
                continue  # racing deletion / permission — skip this entry
            if stamp > newest:
                newest = stamp
    except OSError:
        pass
    return newest


def _shadow_repo_size(shadow_repo: Path) -> int:
    """Best-effort total size in bytes of all regular files under the repo."""
    try:
        return sum(p.stat().st_size for p in shadow_repo.rglob("*") if p.is_file())
    except OSError:
        return 0


def prune_checkpoints(
    retention_days: int = 7,
    delete_orphans: bool = True,
    checkpoint_base: Optional[Path] = None,
) -> Dict[str, int]:
    """Delete stale/orphan shadow repos under ``checkpoint_base``.

    A shadow repo is deleted when either:

    * ``delete_orphans=True`` and its ``HERMES_WORKDIR`` path no longer
      exists on disk (the original project was deleted / moved); OR
    * ``retention_days > 0`` and its newest in-repo mtime is older than
      ``retention_days`` days (orphan classification takes priority).

    Returns a dict with counts ``{"scanned", "deleted_orphan",
    "deleted_stale", "errors", "bytes_freed"}``.

    Never raises — maintenance must never block interactive startup.
    """
    base = checkpoint_base or CHECKPOINT_BASE
    result = _empty_prune_result()

    # Snapshot the listing up front; iterdir() itself can raise if the base
    # vanishes or becomes unreadable mid-sweep, and we promised never to.
    try:
        children = list(base.iterdir()) if base.exists() else []
    except OSError:
        return result

    cutoff = (time.time() - retention_days * 86400) if retention_days > 0 else 0.0

    for child in children:
        # Protect the marker file, symlinked entries (never created by
        # hermes; rmtree refuses them anyway), and anything that isn't a
        # real shadow repo (no HEAD = not initialised, leave alone).
        if child.is_symlink() or not child.is_dir():
            continue
        if not (child / "HEAD").exists():
            continue
        result["scanned"] += 1

        reason: Optional[str] = None
        if delete_orphans:
            workdir = _read_workdir_marker(child)
            if workdir is None or not Path(workdir).exists():
                reason = "orphan"

        if reason is None and retention_days > 0:
            newest = _shadow_repo_newest_mtime(child)
            # newest == 0.0 means "couldn't stat anything" — don't treat an
            # unreadable repo as stale.
            if 0 < newest < cutoff:
                reason = "stale"

        if reason is None:
            continue

        # Measure size before delete (best-effort).
        size = _shadow_repo_size(child)
        try:
            shutil.rmtree(child)
        except OSError as exc:
            result["errors"] += 1
            logger.warning("Failed to prune checkpoint repo %s: %s", child.name, exc)
            continue
        result["bytes_freed"] += size
        if reason == "orphan":
            result["deleted_orphan"] += 1
        else:
            result["deleted_stale"] += 1
        logger.debug("Pruned %s checkpoint repo: %s (%d bytes)", reason, child.name, size)

    return result


def maybe_auto_prune_checkpoints(
    retention_days: int = 7,
    min_interval_hours: int = 24,
    delete_orphans: bool = True,
    checkpoint_base: Optional[Path] = None,
) -> Dict[str, object]:
    """Idempotent wrapper around ``prune_checkpoints`` for startup hooks.

    Writes ``CHECKPOINT_BASE/.last_prune`` on completion so subsequent
    calls within ``min_interval_hours`` short-circuit.  Designed to be
    called once per CLI/gateway process startup; the marker keeps costs
    bounded regardless of how many times hermes is invoked per day.

    Returns ``{"skipped": bool, "result": prune_checkpoints-dict,
    "error": optional str}``.
    """
    base = checkpoint_base or CHECKPOINT_BASE
    out: Dict[str, object] = {"skipped": False}

    try:
        if not base.exists():
            out["result"] = _empty_prune_result()
            return out

        marker = base / _PRUNE_MARKER_NAME
        now = time.time()
        if marker.exists():
            try:
                last_ts = float(marker.read_text(encoding="utf-8").strip())
            except (OSError, ValueError):
                last_ts = None  # corrupt marker — treat as no prior run
            if last_ts is not None and now - last_ts < min_interval_hours * 3600:
                out["skipped"] = True
                return out

        result = prune_checkpoints(
            retention_days=retention_days,
            delete_orphans=delete_orphans,
            checkpoint_base=base,
        )
        out["result"] = result

        # A marker-write failure must not fail the run — next startup just
        # sweeps again.
        try:
            marker.write_text(str(now), encoding="utf-8")
        except OSError as exc:
            logger.debug("Could not write checkpoint prune marker: %s", exc)

        total = result["deleted_orphan"] + result["deleted_stale"]
        if total > 0:
            logger.info(
                "checkpoint auto-maintenance: pruned %d repo(s) "
                "(%d orphan, %d stale), reclaimed %.1f MB",
                total,
                result["deleted_orphan"],
                result["deleted_stale"],
                result["bytes_freed"] / (1024 * 1024),
            )
    except Exception as exc:
        logger.warning("checkpoint auto-maintenance failed: %s", exc)
        out["error"] = str(exc)

    return out
Configure in `~/.hermes/config.yaml`:

```yaml
checkpoints:
  enabled: true        # master switch (default: true)
  max_snapshots: 50    # max checkpoints per directory

  # Auto-maintenance (opt-in): sweep ~/.hermes/checkpoints/ at startup
  # and delete shadow repos whose working directory no longer exists
  # (orphans) or whose newest file modification time is older than
  # retention_days. Runs at most once per min_interval_hours, tracked
  # via a .last_prune marker inside ~/.hermes/checkpoints/.
  auto_prune: false         # default off — enable to reclaim disk
  retention_days: 7
  delete_orphans: true      # delete repos whose workdir is gone
  min_interval_hours: 24
```

To disable: