Merge pull request #48941 from kshitijk4poor/salvage-48887-backup-exclude-dirs

fix(backup): exclude regeneratable dep/cache dirs so backups don't balloon
This commit is contained in:
kshitij 2026-06-19 14:45:39 +05:30 committed by GitHub
commit db57a1a035
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 89 additions and 1 deletions

View file

@ -34,14 +34,38 @@ logger = logging.getLogger(__name__)
# ``hermes-agent`` is special-cased to root level only in ``_should_exclude``
# so that skill directories like ``skills/autonomous-ai-agents/hermes-agent/``
# are not accidentally excluded.
#
# The dependency/cache entries below matter for more than tidiness: without
# them a single plugin venv, MCP-server install, or pip/uv cache living under
# HERMES_HOME gets walked file-by-file, ballooning a backup to hundreds of
# thousands of entries that crawl for hours — the exact "backup stuck for
# days / 426543 files" symptom users hit. The dependency/test-env names mostly
# mirror ``agent.skill_utils.EXCLUDED_SKILL_DIRS`` (the project's canonical
# "regeneratable dir" set); ``.cache`` is an additional backup-only entry, as
# it names a broad regeneratable cache convention (pip/uv/etc.) that the skill
# scanner doesn't need to prune but a backup walk does. We deliberately do NOT
# exclude ``.archive`` here because the curator's ``skills/.archive/`` holds
# restorable user skills that must survive a backup.
_EXCLUDED_DIRS = {
"hermes-agent", # the codebase repo — re-clone instead
"__pycache__", # bytecode caches — regenerated on import
".git", # nested git dirs (profiles shouldn't have these, but safety)
"node_modules", # js deps if website/ somehow leaks in
"node_modules", # js deps — reinstalled on demand
"backups", # prior auto-backups — don't nest backups exponentially
"checkpoints", # session-local trajectory caches — regenerated per-session,
# session-hash-keyed so they don't port to another machine anyway
# Python dependency trees (plugin / MCP-server venvs under HERMES_HOME) —
# regenerated by reinstalling; never irreplaceable state.
".venv",
"venv",
"site-packages",
# Tool / build caches — all regeneratable.
".cache",
".tox",
".nox",
".pytest_cache",
".mypy_cache",
".ruff_cache",
}
# File-name suffixes to skip

View file

@ -153,6 +153,39 @@ class TestShouldExclude:
assert not _should_exclude(Path("skills/autonomous-ai-agents/hermes-agent/SKILL.md"))
assert not _should_exclude(Path("skills/autonomous-ai-agents/hermes-agent/sub/item.txt"))
@pytest.mark.parametrize(
    "rel",
    [
        # Python dependency trees.
        "plugins/my-plugin/.venv/lib/python3.12/site-packages/x/__init__.py",
        "plugins/my-plugin/venv/bin/python",
        "mcp/server/site-packages/pkg/mod.py",
        # Generic download / build caches, at the root and nested.
        ".cache/uv/wheels/abc.whl",
        "plugins/p/.cache/pip/http/deadbeef",
        # Test-environment and linter caches.
        ".tox/py312/log.txt",
        ".nox/tests/bin/pytest",
        "plugins/p/.pytest_cache/v/cache/lastfailed",
        ".mypy_cache/3.12/agent.meta.json",
        ".ruff_cache/0.4.0/abc",
    ],
)
def test_excludes_regeneratable_dependency_and_cache_dirs(self, rel):
    """Every path that sits inside a dependency tree or tool cache is excluded.

    These directories are what inflate a backup to hundreds of thousands of
    files, so each sample path must be reported as excluded.
    """
    from hermes_cli.backup import _should_exclude

    candidate = Path(rel)
    assert _should_exclude(candidate)
def test_does_not_exclude_curator_archive(self):
    """Archived skills under ``skills/.archive/`` must survive a backup.

    They are restorable user data, so ``.archive`` is intentionally NOT part
    of the exclusion set.
    """
    from hermes_cli.backup import _should_exclude

    archived_skill = Path("skills/.archive/old-skill/SKILL.md")
    assert not _should_exclude(archived_skill)
def test_does_not_exclude_legit_files_resembling_cache_names(self):
    """Ordinary files whose names merely resemble an excluded directory
    name (``venv-notes.md``, ``cache.json``) are kept."""
    from hermes_cli.backup import _should_exclude

    lookalikes = (
        "skills/my-skill/venv-notes.md",
        "memories/cache.json",
    )
    for rel in lookalikes:
        assert not _should_exclude(Path(rel))
# ---------------------------------------------------------------------------
# Backup tests
# ---------------------------------------------------------------------------
@ -272,6 +305,37 @@ class TestBackup:
agent_files = [n for n in names if "hermes-agent" in n]
assert agent_files == [], f"hermes-agent files leaked into backup: {agent_files}"
def test_excludes_dependency_and_cache_trees(self, tmp_path, monkeypatch):
    """Regression guard for the ballooning-backup bug.

    Builds a HERMES_HOME containing a plugin venv (with ``site-packages``)
    and a ``.cache`` tree, runs a backup, and checks that neither tree shows
    up in the archive while real data (a skill file, ``config.yaml``) does.
    """
    home_dir = tmp_path / ".hermes"
    home_dir.mkdir()
    _make_hermes_tree(home_dir)

    # Plant the heavy, regeneratable trees that used to inflate the backup.
    dep_dir = home_dir.joinpath("plugins", "heavy", ".venv", "lib", "site-packages", "dep")
    dep_dir.mkdir(parents=True)
    (dep_dir / "__init__.py").write_text("# dep\n")

    wheel_dir = home_dir.joinpath(".cache", "uv", "wheels")
    wheel_dir.mkdir(parents=True)
    (wheel_dir / "abc.whl").write_bytes(b"\x00")

    monkeypatch.setenv("HERMES_HOME", str(home_dir))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)

    archive_path = tmp_path / "backup.zip"

    # Imported only after the environment is patched, as in the other tests.
    from hermes_cli.backup import run_backup

    run_backup(Namespace(output=str(archive_path)))

    with zipfile.ZipFile(archive_path, "r") as archive:
        names = archive.namelist()

    forbidden = (".venv", "site-packages", ".cache")
    leaked = [entry for entry in names if any(marker in entry for marker in forbidden)]
    assert leaked == [], f"regeneratable trees leaked into backup: {leaked}"

    # Real data still present.
    assert "skills/my-skill/SKILL.md" in names
    assert "config.yaml" in names
def test_includes_nested_hermes_agent_in_skills(self, tmp_path, monkeypatch):
"""Backup includes skills/.../hermes-agent/ but NOT root hermes-agent/."""
hermes_home = tmp_path / ".hermes"