mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat: fix SQLite safety in hermes backup + add --quick snapshots + /snapshot command (#8971)
Three changes consolidated into the existing backup system: 1. Fix: hermes backup now uses sqlite3.Connection.backup() for .db files instead of raw file copy. Raw copy of a WAL-mode database can produce a corrupted backup — the backup() API handles this correctly. 2. hermes backup --quick: fast snapshot of just critical state files (config.yaml, state.db, .env, auth.json, cron/jobs.json, etc.) stored in ~/.hermes/state-snapshots/. Auto-prunes to 20 snapshots. 3. /snapshot slash command (alias /snap): in-session interface for quick state snapshots. create/list/restore/prune subcommands. Restore by ID or number. Powered by the same backup module. No new modules — everything lives in hermes_cli/backup.py alongside the existing full backup/import code. No hooks in run_agent.py — purely on-demand, zero runtime overhead. Closes the use case from PRs #8406 and #7813 with ~200 lines of new logic instead of a 1090-line content-addressed storage engine.
This commit is contained in:
parent
82901695ff
commit
381810ad50
5 changed files with 548 additions and 7 deletions
|
|
@ -8,14 +8,22 @@ Backup and import commands for hermes CLI.
|
|||
HERMES_HOME root.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sqlite3
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from hermes_constants import get_default_hermes_root, display_hermes_home
|
||||
from hermes_constants import get_default_hermes_root, get_hermes_home, display_hermes_home
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -63,6 +71,33 @@ def _should_exclude(rel_path: Path) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SQLite safe copy
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _safe_copy_db(src: Path, dst: Path) -> bool:
|
||||
"""Copy a SQLite database safely using the backup() API.
|
||||
|
||||
Handles WAL mode — produces a consistent snapshot even while
|
||||
the DB is being written to. Falls back to raw copy on failure.
|
||||
"""
|
||||
try:
|
||||
conn = sqlite3.connect(f"file:{src}?mode=ro", uri=True)
|
||||
backup_conn = sqlite3.connect(str(dst))
|
||||
conn.backup(backup_conn)
|
||||
backup_conn.close()
|
||||
conn.close()
|
||||
return True
|
||||
except Exception as exc:
|
||||
logger.warning("SQLite safe copy failed for %s: %s", src, exc)
|
||||
try:
|
||||
shutil.copy2(src, dst)
|
||||
return True
|
||||
except Exception as exc2:
|
||||
logger.error("Raw copy also failed for %s: %s", src, exc2)
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -151,8 +186,21 @@ def run_backup(args) -> None:
|
|||
with zipfile.ZipFile(out_path, "w", zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
|
||||
for i, (abs_path, rel_path) in enumerate(files_to_add, 1):
|
||||
try:
|
||||
zf.write(abs_path, arcname=str(rel_path))
|
||||
total_bytes += abs_path.stat().st_size
|
||||
# Safe copy for SQLite databases (handles WAL mode)
|
||||
if abs_path.suffix == ".db":
|
||||
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
|
||||
tmp_db = Path(tmp.name)
|
||||
if _safe_copy_db(abs_path, tmp_db):
|
||||
zf.write(tmp_db, arcname=str(rel_path))
|
||||
total_bytes += tmp_db.stat().st_size
|
||||
tmp_db.unlink(missing_ok=True)
|
||||
else:
|
||||
tmp_db.unlink(missing_ok=True)
|
||||
errors.append(f" {rel_path}: SQLite safe copy failed")
|
||||
continue
|
||||
else:
|
||||
zf.write(abs_path, arcname=str(rel_path))
|
||||
total_bytes += abs_path.stat().st_size
|
||||
except (PermissionError, OSError) as exc:
|
||||
errors.append(f" {rel_path}: {exc}")
|
||||
continue
|
||||
|
|
@ -397,3 +445,211 @@ def run_import(args) -> None:
|
|||
print(f" hermes -p {pname} gateway install")
|
||||
|
||||
print("Done. Your Hermes configuration has been restored.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Quick state snapshots (used by /snapshot slash command and hermes backup --quick)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Critical state files to include in quick snapshots (relative to HERMES_HOME).
|
||||
# Everything else is either regeneratable (logs, cache) or managed separately
|
||||
# (skills, repo, sessions/).
|
||||
_QUICK_STATE_FILES = (
|
||||
"state.db",
|
||||
"config.yaml",
|
||||
".env",
|
||||
"auth.json",
|
||||
"cron/jobs.json",
|
||||
"gateway_state.json",
|
||||
"channel_directory.json",
|
||||
"processes.json",
|
||||
)
|
||||
|
||||
_QUICK_SNAPSHOTS_DIR = "state-snapshots"
|
||||
_QUICK_DEFAULT_KEEP = 20
|
||||
|
||||
|
||||
def _quick_snapshot_root(hermes_home: Optional[Path] = None) -> Path:
|
||||
home = hermes_home or get_hermes_home()
|
||||
return home / _QUICK_SNAPSHOTS_DIR
|
||||
|
||||
|
||||
def create_quick_snapshot(
|
||||
label: Optional[str] = None,
|
||||
hermes_home: Optional[Path] = None,
|
||||
) -> Optional[str]:
|
||||
"""Create a quick state snapshot of critical files.
|
||||
|
||||
Copies STATE_FILES to a timestamped directory under state-snapshots/.
|
||||
Auto-prunes old snapshots beyond the keep limit.
|
||||
|
||||
Returns:
|
||||
Snapshot ID (timestamp-based), or None if no files found.
|
||||
"""
|
||||
home = hermes_home or get_hermes_home()
|
||||
root = _quick_snapshot_root(home)
|
||||
|
||||
ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
|
||||
snap_id = f"{ts}-{label}" if label else ts
|
||||
snap_dir = root / snap_id
|
||||
snap_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
manifest: Dict[str, int] = {} # rel_path -> file size
|
||||
|
||||
for rel in _QUICK_STATE_FILES:
|
||||
src = home / rel
|
||||
if not src.exists() or not src.is_file():
|
||||
continue
|
||||
|
||||
dst = snap_dir / rel
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
if src.suffix == ".db":
|
||||
if not _safe_copy_db(src, dst):
|
||||
continue
|
||||
else:
|
||||
shutil.copy2(src, dst)
|
||||
manifest[rel] = dst.stat().st_size
|
||||
except (OSError, PermissionError) as exc:
|
||||
logger.warning("Could not snapshot %s: %s", rel, exc)
|
||||
|
||||
if not manifest:
|
||||
shutil.rmtree(snap_dir, ignore_errors=True)
|
||||
return None
|
||||
|
||||
# Write manifest
|
||||
meta = {
|
||||
"id": snap_id,
|
||||
"timestamp": ts,
|
||||
"label": label,
|
||||
"file_count": len(manifest),
|
||||
"total_size": sum(manifest.values()),
|
||||
"files": manifest,
|
||||
}
|
||||
with open(snap_dir / "manifest.json", "w") as f:
|
||||
json.dump(meta, f, indent=2)
|
||||
|
||||
# Auto-prune
|
||||
_prune_quick_snapshots(root, keep=_QUICK_DEFAULT_KEEP)
|
||||
|
||||
logger.info("State snapshot created: %s (%d files)", snap_id, len(manifest))
|
||||
return snap_id
|
||||
|
||||
|
||||
def list_quick_snapshots(
|
||||
limit: int = 20,
|
||||
hermes_home: Optional[Path] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""List existing quick state snapshots, most recent first."""
|
||||
root = _quick_snapshot_root(hermes_home)
|
||||
if not root.exists():
|
||||
return []
|
||||
|
||||
results = []
|
||||
for d in sorted(root.iterdir(), reverse=True):
|
||||
if not d.is_dir():
|
||||
continue
|
||||
manifest_path = d / "manifest.json"
|
||||
if manifest_path.exists():
|
||||
try:
|
||||
with open(manifest_path) as f:
|
||||
results.append(json.load(f))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
results.append({"id": d.name, "file_count": 0, "total_size": 0})
|
||||
if len(results) >= limit:
|
||||
break
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def restore_quick_snapshot(
|
||||
snapshot_id: str,
|
||||
hermes_home: Optional[Path] = None,
|
||||
) -> bool:
|
||||
"""Restore state from a quick snapshot.
|
||||
|
||||
Overwrites current state files with the snapshot's copies.
|
||||
Returns True if at least one file was restored.
|
||||
"""
|
||||
home = hermes_home or get_hermes_home()
|
||||
root = _quick_snapshot_root(home)
|
||||
snap_dir = root / snapshot_id
|
||||
|
||||
if not snap_dir.is_dir():
|
||||
return False
|
||||
|
||||
manifest_path = snap_dir / "manifest.json"
|
||||
if not manifest_path.exists():
|
||||
return False
|
||||
|
||||
with open(manifest_path) as f:
|
||||
meta = json.load(f)
|
||||
|
||||
restored = 0
|
||||
for rel in meta.get("files", {}):
|
||||
src = snap_dir / rel
|
||||
if not src.exists():
|
||||
continue
|
||||
|
||||
dst = home / rel
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
if dst.suffix == ".db":
|
||||
# Atomic-ish replace for databases
|
||||
tmp = dst.parent / f".{dst.name}.snap_restore"
|
||||
shutil.copy2(src, tmp)
|
||||
dst.unlink(missing_ok=True)
|
||||
shutil.move(str(tmp), str(dst))
|
||||
else:
|
||||
shutil.copy2(src, dst)
|
||||
restored += 1
|
||||
except (OSError, PermissionError) as exc:
|
||||
logger.error("Failed to restore %s: %s", rel, exc)
|
||||
|
||||
logger.info("Restored %d files from snapshot %s", restored, snapshot_id)
|
||||
return restored > 0
|
||||
|
||||
|
||||
def _prune_quick_snapshots(root: Path, keep: int = _QUICK_DEFAULT_KEEP) -> int:
|
||||
"""Remove oldest quick snapshots beyond the keep limit. Returns count deleted."""
|
||||
if not root.exists():
|
||||
return 0
|
||||
|
||||
dirs = sorted(
|
||||
(d for d in root.iterdir() if d.is_dir()),
|
||||
key=lambda d: d.name,
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
deleted = 0
|
||||
for d in dirs[keep:]:
|
||||
try:
|
||||
shutil.rmtree(d)
|
||||
deleted += 1
|
||||
except OSError as exc:
|
||||
logger.warning("Failed to prune snapshot %s: %s", d.name, exc)
|
||||
|
||||
return deleted
|
||||
|
||||
|
||||
def prune_quick_snapshots(
|
||||
keep: int = _QUICK_DEFAULT_KEEP,
|
||||
hermes_home: Optional[Path] = None,
|
||||
) -> int:
|
||||
"""Manually prune quick snapshots. Returns count deleted."""
|
||||
return _prune_quick_snapshots(_quick_snapshot_root(hermes_home), keep=keep)
|
||||
|
||||
|
||||
def run_quick_backup(args) -> None:
|
||||
"""CLI entry point for hermes backup --quick."""
|
||||
label = getattr(args, "label", None)
|
||||
snap_id = create_quick_snapshot(label=label)
|
||||
if snap_id:
|
||||
print(f"State snapshot created: {snap_id}")
|
||||
snaps = list_quick_snapshots()
|
||||
print(f" {len(snaps)} snapshot(s) stored in {display_hermes_home()}/state-snapshots/")
|
||||
print(f" Restore with: /snapshot restore {snap_id}")
|
||||
else:
|
||||
print("No state files found to snapshot.")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue