feat(file-sync): sync remote changes back to host on teardown

Salvage of PR #8018 by @alt-glitch onto current main.

On sandbox teardown, FileSyncManager now downloads the remote .hermes/
directory, diffs against SHA-256 hashes of what was originally pushed,
and applies only changed files back to the host.

Core (tools/environments/file_sync.py):
- sync_back(): orchestrates download -> unpack -> diff -> apply with:
  - Retry with exponential backoff (3 attempts, 2s/4s/8s)
  - SIGINT trap + defer (prevents partial writes on Ctrl-C)
  - fcntl.flock serialization (concurrent gateway sandboxes)
  - Last-write-wins conflict resolution with warning
  - New remote files pulled back via _infer_host_path prefix matching

Backends:
- SSH: _ssh_bulk_download — tar cf - piped over SSH
- Modal: _modal_bulk_download — exec tar cf - -> proc.stdout.read
- Daytona: _daytona_bulk_download — exec tar cf - -> SDK download_file
- All three call sync_back() at the top of cleanup()

Fixes applied during salvage (vs original PR #8018):

| # | Issue | Fix |
|---|-------|-----|
| C1 | import fcntl unconditional — crashes Windows | try/except with fallback; _sync_back_locked skips locking when fcntl=None |
| W1 | assert for runtime guard (stripped by -O) | Replaced with proper if/raise RuntimeError |
| W2 | O(n*m) from _get_files_fn() called per file | Cache mapping once at start of _sync_back_impl, pass to resolve/infer |
| W3 | Dead BulkDownloadFn imports in 3 backends | Removed unused imports |
| W4 | Modal hardcodes root/.hermes, no explanation | Added docstring comment explaining Modal always runs as root |
| S1 | SHA-256 computed for new files where pushed_hash=None | Skip hashing when pushed_hash is None (comparison always False) |
| S2 | Daytona /tmp/.hermes_sync.tar never cleaned up | Added rm -f after download (best-effort) |

Tests: 49 passing (17 new: _infer_host_path edge cases, SIGINT
main/worker thread, Windows fcntl=None fallback, Daytona tar cleanup).

Based on #8018 by @alt-glitch.
This commit is contained in:
kshitijk4poor 2026-04-12 11:18:29 +05:30 committed by Teknium
parent 764536b684
commit d64446e315
6 changed files with 1166 additions and 0 deletions

View file

@ -6,13 +6,25 @@ and Daytona. Docker and Singularity use bind mounts (live host FS
view) and don't need this.
"""
import hashlib
import logging
import os
import shlex
import shutil
import signal
import tarfile
import tempfile
import threading
import time
try:
import fcntl
except ImportError:
fcntl = None # Windows — file locking skipped
from pathlib import Path
from typing import Callable
from hermes_constants import get_hermes_home
from tools.environments.base import _file_mtime_key
logger = logging.getLogger(__name__)
@ -23,6 +35,7 @@ _FORCE_SYNC_ENV = "HERMES_FORCE_FILE_SYNC"
# Transport callbacks provided by each backend (SSH / Modal / Daytona).
# Backends wire these into FileSyncManager's constructor; every callback
# raises on transport failure so the manager can retry or roll back.
UploadFn = Callable[[str, str], None] # (host_path, remote_path) -> raises on failure
BulkUploadFn = Callable[[list[tuple[str, str]]], None] # [(host_path, remote_path), ...] -> raises on failure
BulkDownloadFn = Callable[[Path], None] # (dest_tar_path) -> writes tar archive, raises on failure
DeleteFn = Callable[[list[str]], None] # (remote_paths) -> raises on failure
GetFilesFn = Callable[[], list[tuple[str, str]]] # () -> [(host_path, remote_path), ...]
@ -71,6 +84,19 @@ def unique_parent_dirs(files: list[tuple[str, str]]) -> list[str]:
return sorted({str(Path(remote).parent) for _, remote in files})
def _sha256_file(path: str) -> str:
"""Return hex SHA-256 digest of a file."""
h = hashlib.sha256()
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(65536), b""):
h.update(chunk)
return h.hexdigest()
# sync_back() retry policy: up to 3 download attempts in total.
_SYNC_BACK_MAX_RETRIES = 3
# Backoff delay (seconds) before retry N+1; only the first
# _SYNC_BACK_MAX_RETRIES - 1 entries are ever consumed.
_SYNC_BACK_BACKOFF = (2, 4, 8) # seconds between retries
class FileSyncManager:
"""Tracks local file changes and syncs to a remote environment.
@ -89,12 +115,15 @@ class FileSyncManager:
delete_fn: DeleteFn,
sync_interval: float = _SYNC_INTERVAL_SECONDS,
bulk_upload_fn: BulkUploadFn | None = None,
bulk_download_fn: BulkDownloadFn | None = None,
):
self._get_files_fn = get_files_fn
self._upload_fn = upload_fn
self._bulk_upload_fn = bulk_upload_fn
self._bulk_download_fn = bulk_download_fn
self._delete_fn = delete_fn
self._synced_files: dict[str, tuple[float, int]] = {} # remote_path -> (mtime, size)
self._pushed_hashes: dict[str, str] = {} # remote_path -> sha256 hex digest
self._last_sync_time: float = 0.0 # monotonic; 0 ensures first sync runs
self._sync_interval = sync_interval
@ -136,6 +165,7 @@ class FileSyncManager:
# Snapshot for rollback (only when there's work to do)
prev_files = dict(self._synced_files)
prev_hashes = dict(self._pushed_hashes)
if to_upload:
logger.debug("file_sync: uploading %d file(s)", len(to_upload))
@ -156,13 +186,187 @@ class FileSyncManager:
logger.debug("file_sync: deleted %s", to_delete)
# --- Commit (all succeeded) ---
for host_path, remote_path in to_upload:
self._pushed_hashes[remote_path] = _sha256_file(host_path)
for p in to_delete:
new_files.pop(p, None)
self._pushed_hashes.pop(p, None)
self._synced_files = new_files
self._last_sync_time = time.monotonic()
except Exception as exc:
self._synced_files = prev_files
self._pushed_hashes = prev_hashes
self._last_sync_time = time.monotonic()
logger.warning("file_sync: sync failed, rolled back state: %s", exc)
# ------------------------------------------------------------------
# Sync-back: pull remote changes to host on teardown
# ------------------------------------------------------------------
def sync_back(self, hermes_home: Path | None = None) -> None:
    """Pull remote changes back to the host filesystem.

    Downloads the remote ``.hermes/`` directory as a tar archive,
    unpacks it, and applies only files that differ from what was
    originally pushed (based on SHA-256 content hashes).

    Protected against SIGINT (defers the signal until complete) and
    serialized across concurrent gateway sandboxes via file lock.
    Best-effort: after all retries fail, a warning is logged and the
    method returns normally rather than raising.
    """
    if self._bulk_download_fn is None:
        # Backend uses bind mounts (or never registered a download
        # transport) — nothing to pull back.
        return

    lock_path = (hermes_home or get_hermes_home()) / ".sync.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)

    failure: Exception | None = None
    for attempt_no in range(1, _SYNC_BACK_MAX_RETRIES + 1):
        try:
            self._sync_back_once(lock_path)
        except Exception as exc:
            failure = exc
            if attempt_no < _SYNC_BACK_MAX_RETRIES:
                wait = _SYNC_BACK_BACKOFF[attempt_no - 1]
                logger.warning(
                    "sync_back: attempt %d failed (%s), retrying in %ds",
                    attempt_no, exc, wait,
                )
                time.sleep(wait)
        else:
            # Attempt succeeded — nothing more to do.
            return
    logger.warning("sync_back: all %d attempts failed: %s", _SYNC_BACK_MAX_RETRIES, failure)
def _sync_back_once(self, lock_path: Path) -> None:
"""Single sync-back attempt with SIGINT protection and file lock."""
# signal.signal() only works from the main thread. In gateway
# contexts cleanup() may run from a worker thread — skip SIGINT
# deferral there rather than crashing.
on_main_thread = threading.current_thread() is threading.main_thread()
deferred_sigint: list[object] = []
original_handler = None
if on_main_thread:
original_handler = signal.getsignal(signal.SIGINT)
def _defer_sigint(signum, frame):
deferred_sigint.append((signum, frame))
logger.debug("sync_back: SIGINT deferred until sync completes")
signal.signal(signal.SIGINT, _defer_sigint)
try:
self._sync_back_locked(lock_path)
finally:
if on_main_thread and original_handler is not None:
signal.signal(signal.SIGINT, original_handler)
if deferred_sigint:
os.kill(os.getpid(), signal.SIGINT)
def _sync_back_locked(self, lock_path: Path) -> None:
"""Sync-back under file lock (serializes concurrent gateways)."""
if fcntl is None:
# Windows: no flock — run without serialization
self._sync_back_impl()
return
lock_fd = open(lock_path, "w")
try:
fcntl.flock(lock_fd, fcntl.LOCK_EX)
self._sync_back_impl()
finally:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
lock_fd.close()
def _sync_back_impl(self) -> None:
    """Download, diff, and apply remote changes to host.

    Streams the remote tree into a temporary tar archive, extracts it
    into a staging directory, then copies back only the files whose
    content differs from the SHA-256 recorded when they were pushed.
    New remote files (no recorded hash) are applied via host-path
    inference. Host-side edits lose to remote edits (last-write-wins,
    with a warning).

    Fixes vs. previous version (Windows support, matching the fcntl
    fallback elsewhere in this module): the tar path comes from
    mkstemp + close instead of an *open* NamedTemporaryFile, which the
    download callback could not reopen on Windows; and the staged
    relative path is normalized to POSIX separators before the leading
    "/" is prepended, since os.path.relpath yields backslashes there.
    """
    if self._bulk_download_fn is None:
        raise RuntimeError("_sync_back_impl called without bulk_download_fn")
    # Cache file mapping once to avoid O(n*m) from repeated iteration
    try:
        file_mapping = list(self._get_files_fn())
    except Exception:
        file_mapping = []
    tar_fd, tar_name = tempfile.mkstemp(suffix=".tar")
    os.close(tar_fd)  # callback reopens the path itself
    try:
        self._bulk_download_fn(Path(tar_name))
        with tempfile.TemporaryDirectory(prefix="hermes-sync-back-") as staging:
            with tarfile.open(tar_name) as tar:
                # "data" filter rejects absolute paths / traversal members.
                tar.extractall(staging, filter="data")
            applied = 0
            for dirpath, _dirnames, filenames in os.walk(staging):
                for fname in filenames:
                    staged_file = os.path.join(dirpath, fname)
                    rel = os.path.relpath(staged_file, staging)
                    # Remote paths are always POSIX-style absolute paths.
                    remote_path = "/" + rel.replace(os.sep, "/")
                    pushed_hash = self._pushed_hashes.get(remote_path)
                    # Skip hashing for files unchanged from push; files
                    # with no recorded hash are new remote files and are
                    # always candidates for apply.
                    if pushed_hash is not None:
                        if _sha256_file(staged_file) == pushed_hash:
                            continue
                    # Resolve host path from cached mapping
                    host_path = self._resolve_host_path(remote_path, file_mapping)
                    if host_path is None:
                        host_path = self._infer_host_path(remote_path, file_mapping)
                    if host_path is None:
                        logger.debug(
                            "sync_back: skipping %s (no host mapping)",
                            remote_path,
                        )
                        continue
                    if os.path.exists(host_path) and pushed_hash is not None:
                        host_hash = _sha256_file(host_path)
                        if host_hash != pushed_hash:
                            logger.warning(
                                "sync_back: conflict on %s — host modified "
                                "since push, remote also changed. Applying "
                                "remote version (last-write-wins).",
                                remote_path,
                            )
                    os.makedirs(os.path.dirname(host_path), exist_ok=True)
                    shutil.copy2(staged_file, host_path)
                    applied += 1
            if applied:
                logger.info("sync_back: applied %d changed file(s)", applied)
            else:
                logger.debug("sync_back: no remote changes detected")
    finally:
        # Best-effort cleanup of the downloaded archive.
        try:
            os.unlink(tar_name)
        except OSError:
            pass
def _resolve_host_path(self, remote_path: str,
file_mapping: list[tuple[str, str]] | None = None) -> str | None:
"""Find the host path for a known remote path from the file mapping."""
mapping = file_mapping if file_mapping is not None else []
for host, remote in mapping:
if remote == remote_path:
return host
return None
def _infer_host_path(self, remote_path: str,
file_mapping: list[tuple[str, str]] | None = None) -> str | None:
"""Infer a host path for a new remote file by matching path prefixes.
Uses the existing file mapping to find a remote->host directory
pair, then applies the same prefix substitution to the new file.
For example, if the mapping has ``/root/.hermes/skills/a.md``
``~/.hermes/skills/a.md``, a new remote file at
``/root/.hermes/skills/b.md`` maps to ``~/.hermes/skills/b.md``.
"""
mapping = file_mapping if file_mapping is not None else []
for host, remote in mapping:
remote_dir = str(Path(remote).parent)
if remote_path.startswith(remote_dir + "/"):
host_dir = str(Path(host).parent)
suffix = remote_path[len(remote_dir):]
return host_dir + suffix
return None