fix(profiles): exclude infrastructure artifacts when cloning with --clone-all

When the source profile is the default (~/.hermes), shutil.copytree()
was copying multi-GB infrastructure alongside the ~40 MB of actual
profile data: hermes-agent/ (repo checkout + 3 GB venv), .worktrees/,
profiles/ (sibling profiles — recursive!), bin/ (installed binaries),
node_modules/ (hundreds of MB).

Add _CLONE_ALL_DEFAULT_EXCLUDE_ROOT frozenset with these five entries
and pass an ignore callback to copytree().  Exclusions are gated on
the source actually being the default profile (is_default_source) so
named-profile sources are never affected.

Also exclude at any depth: __pycache__/, *.pyc, *.pyo, *.sock, *.tmp.
Profile data (config.yaml, .env, auth.json, state.db, sessions/,
skills/, logs/) is preserved intact — clone-all means 'complete
snapshot minus infrastructure'.

Mirrors the approach already used by _default_export_ignore() and
_DEFAULT_EXPORT_EXCLUDE_ROOT (the export-side exclusion set, which is
broader because it produces a portable archive, not a live clone).

Co-authored-by: MustafaKara7 <karamusti912@gmail.com>
Co-authored-by: fahdad <30740087+fahdad@users.noreply.github.com>
Fixes #5022
Based on PRs #5025, #5026, and #21728
This commit is contained in:
donrhmexe 2026-05-09 15:39:27 +05:30 committed by kshitij
parent 93e25ceb13
commit f7e514d4ad
2 changed files with 123 additions and 14 deletions

View file

@ -64,13 +64,39 @@ _CLONE_SUBDIR_FILES = [
"memories/USER.md",
]
# Runtime files stripped after --clone-all (shouldn't carry over).
# Kept as a post-copy step rather than in the ignore filter because they
# are created dynamically during normal use and may be absent at copy time.
_CLONE_ALL_STRIP: list[str] = [
    "gateway.pid",
    "gateway_state.json",
    "processes.json",
]
# Root-level infrastructure directories that --clone-all skips when the
# source is the default profile (``~/.hermes``).  Named profiles never hold
# these directories at their root, so the exclusion is gated on the source
# being the default profile; that way a named-profile source cannot have
# user data silently dropped.
#
# ``_DEFAULT_EXPORT_EXCLUDE_ROOT`` (defined below) is the broader
# export-side list: export also drops state.db / logs / caches because the
# archive is a portable snapshot, whereas clone-all keeps them so the cloned
# profile keeps working immediately.
_CLONE_ALL_DEFAULT_EXCLUDE_ROOT: frozenset[str] = frozenset(
    (
        # git repo checkout (~84 MB source + ~3 GB venv)
        "hermes-agent",
        # git worktrees
        ".worktrees",
        # sibling named profiles (recursive copy never intended)
        "profiles",
        # installed binaries (tirith etc., ~10 MB), shared per-host
        "bin",
        # npm packages (hundreds of MB)
        "node_modules",
    )
)
# Marker file written by `hermes profile create --no-skills`. When present in
# a profile's root, callers of seed_profile_skills() (fresh-create, `hermes
# update`'s all-profile sync, the web dashboard) skip bundled-skill seeding
@ -89,23 +115,48 @@ def has_bundled_skills_opt_out(profile_dir: Path) -> bool:
def _clone_all_copytree_ignore(source_dir: Path):
    """Build the ``shutil.copytree`` ignore callback used by --clone-all.

    Two categories of entries are excluded:

    1. Root-level entries in ``_CLONE_ALL_DEFAULT_EXCLUDE_ROOT`` — known
       Hermes infrastructure directories that only the default profile
       (``~/.hermes``) ever contains.  Gated on ``source_dir`` actually
       being the default profile so a named-profile source never has its
       own data silently dropped.
    2. Universal exclusions at any depth — Python bytecode caches that
       are stale or regenerable (``__pycache__``, ``*.pyc``, ``*.pyo``)
       and runtime sockets / temp files (``*.sock``, ``*.tmp``).

    The export-side ignore (``_default_export_ignore``) uses the same
    two-tier pattern with the broader ``_DEFAULT_EXPORT_EXCLUDE_ROOT`` set
    because the export archive is a portable snapshot rather than a live
    clone.

    Args:
        source_dir: Root of the profile directory being copied.

    Returns:
        A callable ``(directory, names) -> list of names to skip``,
        suitable for the ``ignore=`` argument of ``shutil.copytree``.
    """
    source_resolved = source_dir.resolve()
    is_default_source = source_resolved == _get_default_hermes_home().resolve()
    universal_suffixes = (".pyc", ".pyo", ".sock", ".tmp")

    def _ignore(directory: str, names: List[str]) -> List[str]:
        # Whether *directory* is the source root does not depend on the
        # entry, so resolve it once per call instead of once per name.
        at_source_root = False
        if is_default_source:
            try:
                at_source_root = Path(directory).resolve() == source_resolved
            except (OSError, ValueError):
                # ``resolve()`` can fail on unusual FS layouts (broken
                # symlinks, missing parents).  Fail open — better to
                # over-copy than silently drop user data.
                at_source_root = False

        return [
            entry
            for entry in names
            # Universal exclusions apply at any depth.
            if entry == "__pycache__"
            or entry.endswith(universal_suffixes)
            # Root-level exclusions only apply at the default profile root.
            or (at_source_root and entry in _CLONE_ALL_DEFAULT_EXCLUDE_ROOT)
        ]

    return _ignore

View file

@ -244,6 +244,64 @@ class TestCreateProfile:
assert (profile_dir / "memories" / "note.md").read_text() == "remember this"
assert not (profile_dir / "profiles").exists()
def test_clone_all_excludes_default_infrastructure(self, profile_env):
    """--clone-all from the default profile skips infrastructure only.

    Root-level ``hermes-agent``, ``.worktrees``, ``profiles``, ``bin`` and
    ``node_modules`` must be excluded, plus ``__pycache__`` / ``*.pyc`` /
    ``*.pyo`` / ``*.sock`` / ``*.tmp`` at any depth.  Profile data (config,
    env, skills, sessions, logs, state.db) must be preserved — clone-all
    means "complete snapshot minus infrastructure."
    """
    tmp_path = profile_env
    default_home = tmp_path / ".hermes"
    skill_dir = default_home / "skills" / "my-skill"

    # Simulate infrastructure dirs that only the default profile has.
    (default_home / "hermes-agent" / ".git").mkdir(parents=True)
    (default_home / "hermes-agent" / "venv" / "bin").mkdir(parents=True)
    (default_home / "hermes-agent" / "README.md").write_text("repo")
    (default_home / ".worktrees" / "some-tree").mkdir(parents=True)
    (default_home / "profiles" / "other").mkdir(parents=True)
    (default_home / "profiles" / "other" / "config.yaml").write_text("x")
    (default_home / "bin").mkdir(exist_ok=True)
    (default_home / "bin" / "tool").write_text("binary")
    # A real file inside node_modules, not a directory named like one.
    (default_home / "node_modules").mkdir(parents=True, exist_ok=True)
    (default_home / "node_modules" / ".package-lock.json").write_text("{}")

    # Bytecode + runtime/temp files, at the root and at nested depth
    # (universal exclusion).
    (skill_dir / "__pycache__").mkdir(parents=True)
    (skill_dir / "__pycache__" / "module.cpython-311.pyc").write_text("stale")
    (skill_dir / "module.pyc").write_text("stale")
    (skill_dir / "module.pyo").write_text("stale")
    (skill_dir / "nested.sock").write_text("socket")
    (skill_dir / "nested.tmp").write_text("tmp")
    (default_home / "data.sock").write_text("socket")
    (default_home / "data.tmp").write_text("tmp")

    # Profile data that SHOULD be copied.
    (skill_dir / "SKILL.md").write_text("skill")
    (default_home / "config.yaml").write_text("model: gpt-4")
    (default_home / ".env").write_text("KEY=val")
    (default_home / "state.db").write_text("sessions-data")
    (default_home / "sessions").mkdir(exist_ok=True)
    (default_home / "logs").mkdir(exist_ok=True)
    (default_home / "logs" / "gateway.log").write_text("log")

    profile_dir = create_profile("cloned", clone_all=True, no_alias=True)
    cloned_skill = profile_dir / "skills" / "my-skill"

    # Infrastructure must be excluded.
    assert not (profile_dir / "hermes-agent").exists()
    assert not (profile_dir / ".worktrees").exists()
    assert not (profile_dir / "profiles").exists()
    assert not (profile_dir / "bin").exists()
    assert not (profile_dir / "node_modules").exists()

    # Universal exclusions at any depth.
    assert not (profile_dir / "data.sock").exists()
    assert not (profile_dir / "data.tmp").exists()
    assert not (cloned_skill / "__pycache__").exists()
    assert not (cloned_skill / "module.pyc").exists()
    assert not (cloned_skill / "module.pyo").exists()
    assert not (cloned_skill / "nested.sock").exists()
    assert not (cloned_skill / "nested.tmp").exists()

    # All profile data must be present.
    assert (cloned_skill / "SKILL.md").read_text() == "skill"
    assert (profile_dir / "config.yaml").read_text() == "model: gpt-4"
    assert (profile_dir / ".env").read_text() == "KEY=val"
    assert (profile_dir / "state.db").read_text() == "sessions-data"
    assert (profile_dir / "sessions").exists()
    assert (profile_dir / "logs" / "gateway.log").read_text() == "log"
def test_clone_config_missing_files_skipped(self, profile_env):
"""Clone config gracefully skips files that don't exist in source."""
profile_dir = create_profile("coder", clone_config=True, no_alias=True)