fix: harden sync_back — PID-suffix temp path, size cap, lifecycle guards

Follow-ups on top of kshitijk4poor's cherry-picked salvage of PR #8018:

tools/environments/daytona.py
  - PID-suffix /tmp/.hermes_sync.<pid>.tar so concurrent sync_back calls
    against the same sandbox don't collide on the remote temp path
  - Move sync_back() inside the cleanup lock and after the _sandbox-None
    guard, with its own try/except. Previously a no-op cleanup (sandbox
    already cleared) still fired sync_back → 3-attempt retry storm against
    a sandbox that is already None (~6s of sleep). Now short-circuits cleanly.

tools/environments/file_sync.py
  - Add _SYNC_BACK_MAX_BYTES (2 GiB) defensive cap: refuse to extract a
    tar larger than the limit. Protects against runaway sandboxes
    producing arbitrary-size archives.
  - Add 'nothing previously pushed' guard at the top of sync_back(). If
    _pushed_hashes and _synced_files are both empty, the FileSyncManager
    was never initialized from the host side — there is nothing coherent
    to sync back. Skips the retry/backoff machinery on uninitialized
    managers and eliminates test-suite slowdown from pre-existing cleanup
    tests that don't mock the sync layer.

tests/tools/test_file_sync_back.py
  - Update _make_manager helper to seed a _pushed_hashes entry by default
    so sync_back() exercises its real path. A seed_pushed_state=False
    opt-out is available for noop-path tests.
  - Add TestSyncBackSizeCap with positive and negative coverage of the
    new cap.

tests/tools/test_sync_back_backends.py
  - Update Daytona bulk download test to assert the PID-suffixed path
    pattern instead of the fixed /tmp/.hermes_sync.tar.
This commit is contained in:
Teknium 2026-04-16 17:10:30 -07:00 committed by Teknium
parent d64446e315
commit 7fd508979e
4 changed files with 113 additions and 13 deletions

View file

def _make_manager(
    tmp_path: Path,
    file_mapping: list[tuple[str, str]] | None = None,
    bulk_download_fn=None,
    seed_pushed_state: bool = True,
) -> FileSyncManager:
    """Create a FileSyncManager wired for testing.

    *file_mapping* is a list of (host_path, remote_path) tuples that
    ``get_files_fn`` returns. If *None* an empty list is used.

    When *seed_pushed_state* is True (default), populate ``_pushed_hashes``
    from the mapping so sync_back doesn't early-return on the "nothing
    previously pushed" guard. Set False to test the noop path.
    """
    mapping = file_mapping or []
    mgr = FileSyncManager(
        get_files_fn=lambda: mapping,
        upload_fn=MagicMock(),
        delete_fn=MagicMock(),
        bulk_download_fn=bulk_download_fn,
    )
    if seed_pushed_state:
        # Seed _pushed_hashes so sync_back's "nothing previously pushed"
        # guard does not early-return. Populate from the mapping when we
        # can; otherwise drop a sentinel entry.
        for host_path, remote_path in mapping:
            if os.path.exists(host_path):
                mgr._pushed_hashes[remote_path] = _sha256_file(host_path)
            else:
                # Host file doesn't exist yet — any fixed 64-hex value works,
                # the guard only checks that the dict is non-empty.
                mgr._pushed_hashes[remote_path] = "0" * 64
        if not mgr._pushed_hashes:
            mgr._pushed_hashes["/_sentinel"] = "0" * 64
    return mgr
# ---------------------------------------------------------------------------
@ -410,3 +427,47 @@ class TestSyncBackSIGINT:
assert not exc, f"sync_back raised: {exc}"
# signal.signal should NOT have been called from the worker thread
assert len(signal_called) == 0
class TestSyncBackSizeCap:
    """Oversized remote tars must be rejected instead of extracted."""

    def test_sync_back_refuses_oversized_tar(self, tmp_path, caplog):
        """A tar above _SYNC_BACK_MAX_BYTES is skipped and a warning logged."""
        # The download_fn writes a tiny tar; patching the cap down to one
        # byte makes it "oversized" without building a 2 GiB fixture.
        host_path = _write_file(tmp_path / "host_skill.md", b"original")
        fetch = _make_download_fn({"root/.hermes/skill.md": b"remote_version"})
        manager = _make_manager(
            tmp_path,
            file_mapping=[(host_path, "/root/.hermes/skill.md")],
            bulk_download_fn=fetch,
        )
        with caplog.at_level(
            logging.WARNING, logger="tools.environments.file_sync"
        ), patch("tools.environments.file_sync._SYNC_BACK_MAX_BYTES", 1):
            manager.sync_back(hermes_home=tmp_path / ".hermes")
        # Extraction was skipped, so the host copy keeps its original bytes...
        assert Path(host_path).read_bytes() == b"original"
        # ...and the operator got a warning that mentions the cap.
        assert any("cap" in record.message for record in caplog.records)

    def test_sync_back_applies_when_under_cap(self, tmp_path):
        """Sanity check: a tar under the default (2 GiB) cap extracts normally."""
        host_path = _write_file(tmp_path / "host_skill.md", b"original")
        fetch = _make_download_fn({"root/.hermes/skill.md": b"remote_version"})
        manager = _make_manager(
            tmp_path,
            file_mapping=[(host_path, "/root/.hermes/skill.md")],
            bulk_download_fn=fetch,
        )
        manager.sync_back(hermes_home=tmp_path / ".hermes")
        assert Path(host_path).read_bytes() == b"remote_version"

View file

@ -354,15 +354,21 @@ class TestDaytonaBulkDownload:
assert env._sandbox.process.exec.call_count == 2
tar_cmd = env._sandbox.process.exec.call_args_list[0][0][0]
assert "tar cf" in tar_cmd
assert "/tmp/.hermes_sync.tar" in tar_cmd
# PID-suffixed temp path avoids collisions on sync_back retry
assert "/tmp/.hermes_sync." in tar_cmd
assert ".tar" in tar_cmd
assert ".hermes" in tar_cmd
cleanup_cmd = env._sandbox.process.exec.call_args_list[1][0][0]
assert "rm -f /tmp/.hermes_sync.tar" in cleanup_cmd
assert "rm -f" in cleanup_cmd
assert "/tmp/.hermes_sync." in cleanup_cmd
env._sandbox.fs.download_file.assert_called_once_with(
"/tmp/.hermes_sync.tar", str(dest)
)
# download_file called once with the same PID-suffixed path
env._sandbox.fs.download_file.assert_called_once()
download_args = env._sandbox.fs.download_file.call_args[0]
assert download_args[0].startswith("/tmp/.hermes_sync.")
assert download_args[0].endswith(".tar")
assert download_args[1] == str(dest)
def test_daytona_bulk_download_uses_remote_home(self, tmp_path):
"""The tar command should use the env's _remote_home."""

View file

@ -7,6 +7,7 @@ and resumed on next creation, preserving the filesystem across sessions.
import logging
import math
import os
import shlex
import threading
from pathlib import Path
@ -170,13 +171,16 @@ class DaytonaEnvironment(BaseEnvironment):
def _daytona_bulk_download(self, dest: Path) -> None:
"""Download remote .hermes/ as a tar archive."""
rel_base = f"{self._remote_home}/.hermes".lstrip("/")
# PID-suffixed remote temp path avoids collisions if sync_back fires
# concurrently for the same sandbox (e.g. retry after partial failure).
remote_tar = f"/tmp/.hermes_sync.{os.getpid()}.tar"
self._sandbox.process.exec(
f"tar cf /tmp/.hermes_sync.tar -C / {shlex.quote(rel_base)}"
f"tar cf {shlex.quote(remote_tar)} -C / {shlex.quote(rel_base)}"
)
self._sandbox.fs.download_file("/tmp/.hermes_sync.tar", str(dest))
self._sandbox.fs.download_file(remote_tar, str(dest))
# Clean up remote temp file
try:
self._sandbox.process.exec("rm -f /tmp/.hermes_sync.tar")
self._sandbox.process.exec(f"rm -f {shlex.quote(remote_tar)}")
except Exception:
pass # best-effort cleanup
@ -227,13 +231,21 @@ class DaytonaEnvironment(BaseEnvironment):
return _ThreadedProcessHandle(exec_fn, cancel_fn=cancel)
def cleanup(self):
# NOTE(review): this span is stripped-diff residue — the next three lines
# are the PRE-change body (sync_back fired before any lock/None guard)
# and the `with self._lock:` version below is its replacement. Only one
# of the two exists in the real post-commit file.
if self._sync_manager:
logger.info("Daytona: syncing files from sandbox...")
self._sync_manager.sync_back()
# Post-change body: take the lock first and bail out when the sandbox is
# already gone, so a second cleanup() call is a cheap no-op.
with self._lock:
if self._sandbox is None:
return
# Sync remote changes back to host before teardown. Running
# inside the lock (and after the _sandbox is None guard) avoids
# firing sync_back on an already-cleaned-up env, which would
# trigger a 3-attempt retry storm against a nil sandbox.
if self._sync_manager:
logger.info("Daytona: syncing files from sandbox...")
try:
self._sync_manager.sync_back()
# Swallow sync failures so teardown still proceeds; log for operators.
except Exception as e:
logger.warning("Daytona: sync_back failed: %s", e)
try:
if self._persistent:
self._sandbox.stop()
# NOTE(review): method continues past the visible hunk — the matching
# except/teardown tail is not shown here.

View file

@ -95,6 +95,7 @@ def _sha256_file(path: str) -> str:
# Module-private tuning knobs for FileSyncManager.sync_back: retry count,
# per-retry backoff schedule, and the defensive tar-size cap.
_SYNC_BACK_MAX_RETRIES = 3
_SYNC_BACK_BACKOFF = (2, 4, 8) # seconds between retries
_SYNC_BACK_MAX_BYTES = 2 * 1024 * 1024 * 1024 # 2 GiB — refuse to extract larger tars
class FileSyncManager:
@ -219,6 +220,13 @@ class FileSyncManager:
if self._bulk_download_fn is None:
return
# Nothing was ever committed through this manager — the initial
# push failed or never ran. Skip sync_back to avoid retry storms
# against an uninitialized remote .hermes/ directory.
if not self._pushed_hashes and not self._synced_files:
logger.debug("sync_back: no prior push state — skipping")
return
lock_path = (hermes_home or get_hermes_home()) / ".sync.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
@ -292,6 +300,19 @@ class FileSyncManager:
with tempfile.NamedTemporaryFile(suffix=".tar") as tf:
self._bulk_download_fn(Path(tf.name))
# Defensive size cap: a misbehaving sandbox could produce an
# arbitrarily large tar. Refuse to extract if it exceeds the cap.
try:
tar_size = os.path.getsize(tf.name)
except OSError:
tar_size = 0
if tar_size > _SYNC_BACK_MAX_BYTES:
logger.warning(
"sync_back: remote tar is %d bytes (cap %d) — skipping extraction",
tar_size, _SYNC_BACK_MAX_BYTES,
)
return
with tempfile.TemporaryDirectory(prefix="hermes-sync-back-") as staging:
with tarfile.open(tf.name) as tar:
tar.extractall(staging, filter="data")