mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
Replaces the per-directory shadow-repo design with a single shared shadow
git store at ~/.hermes/checkpoints/store/. Object DB is now deduplicated
across every working directory the agent has ever touched; a dozen
worktrees of the same project cost near-zero in additional disk.
Why
---
Pre-v2 design had three compounding problems that let ~/.hermes/checkpoints/
grow to multi-GB on active machines:
1. Each working directory got its own full shadow git repo — no object
dedup across projects or across worktrees of the same project.
2. _prune() was a documented no-op: max_snapshots only limited the
/rollback listing. Loose objects accumulated forever.
3. Defaults: enabled=True, auto_prune=False — users paid the disk cost
without ever asking for /rollback.
Field report on a single workstation: 847 MB across 47 shadow repos,
mostly redundant clones of the hermes-agent source tree.
Changes
-------
- tools/checkpoint_manager.py: full rewrite. Single bare store, per-project
refs (refs/hermes/<hash>), per-project indexes (store/indexes/<hash>),
per-project metadata (store/projects/<hash>.json with workdir +
created_at + last_touch). On first v2 init, any pre-v2 per-directory
shadow repos are auto-migrated into legacy-<timestamp>/ so the new
store starts clean. _prune() now actually rewrites the per-project ref
to the last max_snapshots commits and runs git gc --prune=now. New
_enforce_size_cap() drops oldest commits round-robin across projects
when the store exceeds max_total_size_mb. _drop_oversize_from_index()
filters any single file larger than max_file_size_mb out of the snapshot.
- hermes_cli/checkpoints.py: new 'hermes checkpoints' CLI
(status / list / prune / clear / clear-legacy) for managing the store
outside a session.
- hermes_cli/config.py: flipped defaults — enabled=False, max_snapshots=20,
auto_prune=True. Added max_total_size_mb=500, max_file_size_mb=10.
Tightened DEFAULT_EXCLUDES (added target/, *.so/*.dylib/*.dll,
*.mp4/*.mov, *.zip/*.tar.gz, .worktrees/, .mypy_cache/, etc.).
- run_agent.py / cli.py / gateway/run.py: thread the new kwargs through
AIAgent and the startup auto_prune hooks.
- Tests rewritten to match v2 storage while keeping backwards-compat
coverage for the pre-v2 prune path (per-directory shadow repos under
base/ are still swept correctly for anyone mid-migration).
- Docs updated: user-guide/checkpoints-and-rollback.md explains the
shared store, new defaults, migration, and the new CLI;
reference/cli-commands.md documents 'hermes checkpoints'.
E2E validated
-------------
- Legacy migration: pre-v2 shadow repos auto-archived into legacy-<ts>/.
- Object dedup: two projects with an identical shared.py blob resolve to
7 total objects in the store (v1 would have stored the blob twice).
- max_snapshots=3 actually enforced: after 6 commits, list shows 3.
- Orphan prune: deleting a project's workdir + 'hermes checkpoints prune
--retention-days 0' removes its ref, index, and metadata; GC reclaims
the objects.
- max_file_size_mb=1 excludes a 2 MB weights.bin while keeping the
tracked source code files.
- hermes checkpoints {status,prune,clear,clear-legacy} all work from the
CLI without an agent running.
Breaking / migration
--------------------
No in-place data migration — legacy per-directory shadow repos are moved
into legacy-<timestamp>/ on first run. Old /rollback history is still
accessible by inspecting the archive with git; run
'hermes checkpoints clear-legacy' to reclaim the space when ready. Users
relying on /rollback must now set checkpoints.enabled=true (or pass
--checkpoints) explicitly.
244 lines
8.4 KiB
Python
244 lines
8.4 KiB
Python
"""`hermes checkpoints` CLI subcommand.
|
|
|
|
Gives users direct visibility and control over the filesystem checkpoint
|
|
store at ``~/.hermes/checkpoints/``. Actions:
|
|
|
|
hermes checkpoints # same as `status`
|
|
hermes checkpoints status # total size, project count, breakdown
|
|
hermes checkpoints list # per-project checkpoint counts + workdir
|
|
hermes checkpoints prune [opts] # force a sweep (ignores the 24h marker)
|
|
hermes checkpoints clear [-f] # nuke the entire base (asks first)
|
|
hermes checkpoints clear-legacy # delete just the legacy-* archives
|
|
|
|
Examples::
|
|
|
|
hermes checkpoints
|
|
hermes checkpoints prune --retention-days 3 --max-size-mb 200
|
|
hermes checkpoints clear -f
|
|
|
|
None of these require the agent to be running. Safe to call any time.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any, Dict
|
|
|
|
|
|
def _fmt_bytes(n: int) -> str:
|
|
units = ("B", "KB", "MB", "GB", "TB")
|
|
size = float(n or 0)
|
|
for unit in units:
|
|
if size < 1024 or unit == units[-1]:
|
|
if unit == "B":
|
|
return f"{int(size)} {unit}"
|
|
return f"{size:.1f} {unit}"
|
|
size /= 1024
|
|
return f"{size:.1f} TB"
|
|
|
|
|
|
def _fmt_ts(ts: Any) -> str:
|
|
try:
|
|
return datetime.fromtimestamp(float(ts)).strftime("%Y-%m-%d %H:%M")
|
|
except (TypeError, ValueError):
|
|
return "—"
|
|
|
|
|
|
def _fmt_age(ts: Any) -> str:
|
|
try:
|
|
age = time.time() - float(ts)
|
|
except (TypeError, ValueError):
|
|
return "—"
|
|
if age < 0:
|
|
return "now"
|
|
if age < 60:
|
|
return f"{int(age)}s ago"
|
|
if age < 3600:
|
|
return f"{int(age / 60)}m ago"
|
|
if age < 86400:
|
|
return f"{int(age / 3600)}h ago"
|
|
return f"{int(age / 86400)}d ago"
|
|
|
|
|
|
def cmd_status(args: argparse.Namespace) -> int:
|
|
from tools.checkpoint_manager import store_status
|
|
|
|
info = store_status()
|
|
base = info["base"]
|
|
print(f"Checkpoint base: {base}")
|
|
print(f"Total size: {_fmt_bytes(info['total_size_bytes'])}")
|
|
print(f" store/ {_fmt_bytes(info['store_size_bytes'])}")
|
|
print(f" legacy-* {_fmt_bytes(info['legacy_size_bytes'])}")
|
|
print(f"Projects: {info['project_count']}")
|
|
|
|
projects = sorted(
|
|
info["projects"],
|
|
key=lambda p: (p.get("last_touch") or 0),
|
|
reverse=True,
|
|
)
|
|
if projects:
|
|
print()
|
|
print(f" {'WORKDIR':<60} {'COMMITS':>7} {'LAST TOUCH':>12} STATE")
|
|
for p in projects[: args.limit if hasattr(args, "limit") and args.limit else 20]:
|
|
wd = p.get("workdir") or "(unknown)"
|
|
if len(wd) > 60:
|
|
wd = "…" + wd[-59:]
|
|
exists = p.get("exists")
|
|
state = "live" if exists else "orphan"
|
|
commits = p.get("commits", 0)
|
|
last = _fmt_age(p.get("last_touch"))
|
|
print(f" {wd:<60} {commits:>7} {last:>12} {state}")
|
|
|
|
legacy = info.get("legacy_archives", [])
|
|
if legacy:
|
|
print()
|
|
print(f"Legacy archives ({len(legacy)}):")
|
|
for arch in sorted(legacy, key=lambda a: a.get("mtime", 0), reverse=True):
|
|
print(f" {arch['name']:<40} {_fmt_bytes(arch['size_bytes']):>10}")
|
|
print()
|
|
print("Clear with: hermes checkpoints clear-legacy")
|
|
return 0
|
|
|
|
|
|
def cmd_list(args: argparse.Namespace) -> int:
|
|
# `list` is just a terser status — already covered.
|
|
return cmd_status(args)
|
|
|
|
|
|
def cmd_prune(args: argparse.Namespace) -> int:
|
|
from tools.checkpoint_manager import prune_checkpoints
|
|
|
|
retention_days = args.retention_days
|
|
max_size_mb = args.max_size_mb
|
|
|
|
print("Pruning checkpoint store…")
|
|
print(f" retention_days: {retention_days}")
|
|
print(f" delete_orphans: {not args.keep_orphans}")
|
|
print(f" max_total_size_mb: {max_size_mb}")
|
|
print()
|
|
|
|
result = prune_checkpoints(
|
|
retention_days=retention_days,
|
|
delete_orphans=not args.keep_orphans,
|
|
max_total_size_mb=max_size_mb,
|
|
)
|
|
print(f"Scanned: {result['scanned']}")
|
|
print(f"Deleted orphan: {result['deleted_orphan']}")
|
|
print(f"Deleted stale: {result['deleted_stale']}")
|
|
print(f"Errors: {result['errors']}")
|
|
print(f"Bytes reclaimed: {_fmt_bytes(result['bytes_freed'])}")
|
|
return 0
|
|
|
|
|
|
def _confirm(prompt: str) -> bool:
|
|
try:
|
|
resp = input(f"{prompt} [y/N]: ").strip().lower()
|
|
except (EOFError, KeyboardInterrupt):
|
|
print()
|
|
return False
|
|
return resp in ("y", "yes")
|
|
|
|
|
|
def cmd_clear(args: argparse.Namespace) -> int:
|
|
from tools.checkpoint_manager import CHECKPOINT_BASE, clear_all, store_status
|
|
|
|
info = store_status()
|
|
if info["total_size_bytes"] == 0 and not Path(CHECKPOINT_BASE).exists():
|
|
print("Nothing to clear — checkpoint base does not exist.")
|
|
return 0
|
|
|
|
print(f"This will delete the ENTIRE checkpoint base at {info['base']}")
|
|
print(f" size: {_fmt_bytes(info['total_size_bytes'])}")
|
|
print(f" projects: {info['project_count']}")
|
|
print(f" legacy dirs: {len(info.get('legacy_archives', []))}")
|
|
print()
|
|
print("All /rollback history for every working directory will be lost.")
|
|
if not args.force and not _confirm("Proceed?"):
|
|
print("Aborted.")
|
|
return 1
|
|
|
|
result = clear_all()
|
|
if result["deleted"]:
|
|
print(f"Cleared. Reclaimed {_fmt_bytes(result['bytes_freed'])}.")
|
|
return 0
|
|
print("Could not clear checkpoint base (see logs).")
|
|
return 2
|
|
|
|
|
|
def cmd_clear_legacy(args: argparse.Namespace) -> int:
|
|
from tools.checkpoint_manager import clear_legacy, store_status
|
|
|
|
info = store_status()
|
|
legacy = info.get("legacy_archives", [])
|
|
if not legacy:
|
|
print("No legacy archives to clear.")
|
|
return 0
|
|
|
|
total = sum(a.get("size_bytes", 0) for a in legacy)
|
|
print(f"Found {len(legacy)} legacy archive(s), total {_fmt_bytes(total)}:")
|
|
for arch in legacy:
|
|
print(f" {arch['name']:<40} {_fmt_bytes(arch['size_bytes']):>10}")
|
|
print()
|
|
print("Legacy archives hold pre-v2 per-project shadow repos, moved aside")
|
|
print("during the single-store migration. Delete when you're confident")
|
|
print("you don't need the old /rollback history.")
|
|
if not args.force and not _confirm("Delete all legacy archives?"):
|
|
print("Aborted.")
|
|
return 1
|
|
|
|
result = clear_legacy()
|
|
print(f"Deleted {result['deleted']} archive(s), reclaimed {_fmt_bytes(result['bytes_freed'])}.")
|
|
return 0
|
|
|
|
|
|
def register_cli(parser: argparse.ArgumentParser) -> None:
|
|
"""Wire subcommands onto the ``hermes checkpoints`` parser."""
|
|
parser.set_defaults(func=cmd_status) # bare `hermes checkpoints` → status
|
|
subs = parser.add_subparsers(dest="checkpoints_command", metavar="COMMAND")
|
|
|
|
p_status = subs.add_parser(
|
|
"status",
|
|
help="Show total size, project count, and per-project breakdown",
|
|
)
|
|
p_status.add_argument("--limit", type=int, default=20,
|
|
help="Max projects to list (default 20)")
|
|
p_status.set_defaults(func=cmd_status)
|
|
|
|
p_list = subs.add_parser(
|
|
"list",
|
|
help="Alias for 'status'",
|
|
)
|
|
p_list.add_argument("--limit", type=int, default=20)
|
|
p_list.set_defaults(func=cmd_list)
|
|
|
|
p_prune = subs.add_parser(
|
|
"prune",
|
|
help="Delete orphan/stale checkpoints and GC the store",
|
|
)
|
|
p_prune.add_argument("--retention-days", type=int, default=7,
|
|
help="Drop projects whose last_touch is older than N days (default 7)")
|
|
p_prune.add_argument("--max-size-mb", type=int, default=500,
|
|
help="After orphan/stale prune, drop oldest commits "
|
|
"per project until total size <= this (default 500)")
|
|
p_prune.add_argument("--keep-orphans", action="store_true",
|
|
help="Skip deleting projects whose workdir no longer exists")
|
|
p_prune.set_defaults(func=cmd_prune)
|
|
|
|
p_clear = subs.add_parser(
|
|
"clear",
|
|
help="Delete the entire checkpoint base (all /rollback history)",
|
|
)
|
|
p_clear.add_argument("-f", "--force", action="store_true",
|
|
help="Skip confirmation prompt")
|
|
p_clear.set_defaults(func=cmd_clear)
|
|
|
|
p_legacy = subs.add_parser(
|
|
"clear-legacy",
|
|
help="Delete only the legacy-<ts>/ archives from v1 migration",
|
|
)
|
|
p_legacy.add_argument("-f", "--force", action="store_true",
|
|
help="Skip confirmation prompt")
|
|
p_legacy.set_defaults(func=cmd_clear_legacy)
|