mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: harden sync_back — PID-suffix temp path, size cap, lifecycle guards
Follow-ups on top of kshitijk4poor's cherry-picked salvage of PR #8018: tools/environments/daytona.py - PID-suffix /tmp/.hermes_sync.<pid>.tar so concurrent sync_back calls against the same sandbox don't collide on the remote temp path - Move sync_back() inside the cleanup lock and after the _sandbox-None guard, with its own try/except. Previously a no-op cleanup (sandbox already cleared) still fired sync_back → 3-attempt retry storm against a nil sandbox (~6s of sleep). Now short-circuits cleanly. tools/environments/file_sync.py - Add _SYNC_BACK_MAX_BYTES (2 GiB) defensive cap: refuse to extract a tar larger than the limit. Protects against runaway sandboxes producing arbitrary-size archives. - Add 'nothing previously pushed' guard at the top of sync_back(). If _pushed_hashes and _synced_files are both empty, the FileSyncManager was never initialized from the host side — there is nothing coherent to sync back. Skips the retry/backoff machinery on uninitialized managers and eliminates test-suite slowdown from pre-existing cleanup tests that don't mock the sync layer. tests/tools/test_file_sync_back.py - Update _make_manager helper to seed a _pushed_hashes entry by default so sync_back() exercises its real path. A seed_pushed_state=False opt-out is available for noop-path tests. - Add TestSyncBackSizeCap with positive and negative coverage of the new cap. tests/tools/test_sync_back_backends.py - Update Daytona bulk download test to assert the PID-suffixed path pattern instead of the fixed /tmp/.hermes_sync.tar.
This commit is contained in:
parent
d64446e315
commit
7fd508979e
4 changed files with 113 additions and 13 deletions
|
|
@ -57,19 +57,36 @@ def _make_manager(
|
|||
tmp_path: Path,
|
||||
file_mapping: list[tuple[str, str]] | None = None,
|
||||
bulk_download_fn=None,
|
||||
seed_pushed_state: bool = True,
|
||||
) -> FileSyncManager:
|
||||
"""Create a FileSyncManager wired for testing.
|
||||
|
||||
*file_mapping* is a list of (host_path, remote_path) tuples that
|
||||
``get_files_fn`` returns. If *None* an empty list is used.
|
||||
|
||||
When *seed_pushed_state* is True (default), populate ``_pushed_hashes``
|
||||
from the mapping so sync_back doesn't early-return on the "nothing
|
||||
previously pushed" guard. Set False to test the noop path.
|
||||
"""
|
||||
mapping = file_mapping or []
|
||||
return FileSyncManager(
|
||||
mgr = FileSyncManager(
|
||||
get_files_fn=lambda: mapping,
|
||||
upload_fn=MagicMock(),
|
||||
delete_fn=MagicMock(),
|
||||
bulk_download_fn=bulk_download_fn,
|
||||
)
|
||||
if seed_pushed_state:
|
||||
# Seed _pushed_hashes so sync_back's "nothing previously pushed"
|
||||
# guard does not early-return. Populate from the mapping when we
|
||||
# can; otherwise drop a sentinel entry.
|
||||
for host_path, remote_path in mapping:
|
||||
if os.path.exists(host_path):
|
||||
mgr._pushed_hashes[remote_path] = _sha256_file(host_path)
|
||||
else:
|
||||
mgr._pushed_hashes[remote_path] = "0" * 64
|
||||
if not mgr._pushed_hashes:
|
||||
mgr._pushed_hashes["/_sentinel"] = "0" * 64
|
||||
return mgr
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -410,3 +427,47 @@ class TestSyncBackSIGINT:
|
|||
assert not exc, f"sync_back raised: {exc}"
|
||||
# signal.signal should NOT have been called from the worker thread
|
||||
assert len(signal_called) == 0
|
||||
|
||||
|
||||
class TestSyncBackSizeCap:
    """The size cap refuses to extract tars above the configured limit."""

    def test_sync_back_refuses_oversized_tar(self, tmp_path, caplog):
        """A tar larger than _SYNC_BACK_MAX_BYTES should be skipped with a warning."""
        # A tiny real tar is enough: rather than generating a 2 GiB archive,
        # shrink the cap itself via patch so any non-empty tar trips it.
        host_path = _write_file(tmp_path / "host_skill.md", b"original")
        remote_files = {"root/.hermes/skill.md": b"remote_version"}
        fetch = _make_download_fn(remote_files)

        manager = _make_manager(
            tmp_path,
            file_mapping=[(host_path, "/root/.hermes/skill.md")],
            bulk_download_fn=fetch,
        )

        # With the cap lowered to a single byte, every real tar is "too big".
        with caplog.at_level(logging.WARNING, logger="tools.environments.file_sync"), patch("tools.environments.file_sync._SYNC_BACK_MAX_BYTES", 1):
            manager.sync_back(hermes_home=tmp_path / ".hermes")

        # Extraction was refused, so the host copy keeps its original bytes...
        assert Path(host_path).read_bytes() == b"original"
        # ...and the refusal was surfaced as a warning mentioning the cap.
        assert any("cap" in record.message for record in caplog.records)

    def test_sync_back_applies_when_under_cap(self, tmp_path):
        """A tar under the cap should extract normally (sanity check)."""
        host_path = _write_file(tmp_path / "host_skill.md", b"original")
        remote_files = {"root/.hermes/skill.md": b"remote_version"}
        fetch = _make_download_fn(remote_files)

        manager = _make_manager(
            tmp_path,
            file_mapping=[(host_path, "/root/.hermes/skill.md")],
            bulk_download_fn=fetch,
        )

        # The default 2 GiB cap dwarfs this tiny archive, so sync_back
        # should extract it and overwrite the host file.
        manager.sync_back(hermes_home=tmp_path / ".hermes")
        assert Path(host_path).read_bytes() == b"remote_version"
|
||||
|
|
|
|||
|
|
@ -354,15 +354,21 @@ class TestDaytonaBulkDownload:
|
|||
assert env._sandbox.process.exec.call_count == 2
|
||||
tar_cmd = env._sandbox.process.exec.call_args_list[0][0][0]
|
||||
assert "tar cf" in tar_cmd
|
||||
assert "/tmp/.hermes_sync.tar" in tar_cmd
|
||||
# PID-suffixed temp path avoids collisions on sync_back retry
|
||||
assert "/tmp/.hermes_sync." in tar_cmd
|
||||
assert ".tar" in tar_cmd
|
||||
assert ".hermes" in tar_cmd
|
||||
|
||||
cleanup_cmd = env._sandbox.process.exec.call_args_list[1][0][0]
|
||||
assert "rm -f /tmp/.hermes_sync.tar" in cleanup_cmd
|
||||
assert "rm -f" in cleanup_cmd
|
||||
assert "/tmp/.hermes_sync." in cleanup_cmd
|
||||
|
||||
env._sandbox.fs.download_file.assert_called_once_with(
|
||||
"/tmp/.hermes_sync.tar", str(dest)
|
||||
)
|
||||
# download_file called once with the same PID-suffixed path
|
||||
env._sandbox.fs.download_file.assert_called_once()
|
||||
download_args = env._sandbox.fs.download_file.call_args[0]
|
||||
assert download_args[0].startswith("/tmp/.hermes_sync.")
|
||||
assert download_args[0].endswith(".tar")
|
||||
assert download_args[1] == str(dest)
|
||||
|
||||
def test_daytona_bulk_download_uses_remote_home(self, tmp_path):
|
||||
"""The tar command should use the env's _remote_home."""
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ and resumed on next creation, preserving the filesystem across sessions.
|
|||
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import shlex
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
|
@ -170,13 +171,16 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
def _daytona_bulk_download(self, dest: Path) -> None:
|
||||
"""Download remote .hermes/ as a tar archive."""
|
||||
rel_base = f"{self._remote_home}/.hermes".lstrip("/")
|
||||
# PID-suffixed remote temp path avoids collisions if sync_back fires
|
||||
# concurrently for the same sandbox (e.g. retry after partial failure).
|
||||
remote_tar = f"/tmp/.hermes_sync.{os.getpid()}.tar"
|
||||
self._sandbox.process.exec(
|
||||
f"tar cf /tmp/.hermes_sync.tar -C / {shlex.quote(rel_base)}"
|
||||
f"tar cf {shlex.quote(remote_tar)} -C / {shlex.quote(rel_base)}"
|
||||
)
|
||||
self._sandbox.fs.download_file("/tmp/.hermes_sync.tar", str(dest))
|
||||
self._sandbox.fs.download_file(remote_tar, str(dest))
|
||||
# Clean up remote temp file
|
||||
try:
|
||||
self._sandbox.process.exec("rm -f /tmp/.hermes_sync.tar")
|
||||
self._sandbox.process.exec(f"rm -f {shlex.quote(remote_tar)}")
|
||||
except Exception:
|
||||
pass # best-effort cleanup
|
||||
|
||||
|
|
@ -227,13 +231,21 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
return _ThreadedProcessHandle(exec_fn, cancel_fn=cancel)
|
||||
|
||||
def cleanup(self):
|
||||
if self._sync_manager:
|
||||
logger.info("Daytona: syncing files from sandbox...")
|
||||
self._sync_manager.sync_back()
|
||||
|
||||
with self._lock:
|
||||
if self._sandbox is None:
|
||||
return
|
||||
|
||||
# Sync remote changes back to host before teardown. Running
|
||||
# inside the lock (and after the _sandbox is None guard) avoids
|
||||
# firing sync_back on an already-cleaned-up env, which would
|
||||
# trigger a 3-attempt retry storm against a nil sandbox.
|
||||
if self._sync_manager:
|
||||
logger.info("Daytona: syncing files from sandbox...")
|
||||
try:
|
||||
self._sync_manager.sync_back()
|
||||
except Exception as e:
|
||||
logger.warning("Daytona: sync_back failed: %s", e)
|
||||
|
||||
try:
|
||||
if self._persistent:
|
||||
self._sandbox.stop()
|
||||
|
|
|
|||
|
|
@ -95,6 +95,7 @@ def _sha256_file(path: str) -> str:
|
|||
|
||||
_SYNC_BACK_MAX_RETRIES = 3
|
||||
_SYNC_BACK_BACKOFF = (2, 4, 8) # seconds between retries
|
||||
_SYNC_BACK_MAX_BYTES = 2 * 1024 * 1024 * 1024 # 2 GiB — refuse to extract larger tars
|
||||
|
||||
|
||||
class FileSyncManager:
|
||||
|
|
@ -219,6 +220,13 @@ class FileSyncManager:
|
|||
if self._bulk_download_fn is None:
|
||||
return
|
||||
|
||||
# Nothing was ever committed through this manager — the initial
|
||||
# push failed or never ran. Skip sync_back to avoid retry storms
|
||||
# against an uninitialized remote .hermes/ directory.
|
||||
if not self._pushed_hashes and not self._synced_files:
|
||||
logger.debug("sync_back: no prior push state — skipping")
|
||||
return
|
||||
|
||||
lock_path = (hermes_home or get_hermes_home()) / ".sync.lock"
|
||||
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
|
@ -292,6 +300,19 @@ class FileSyncManager:
|
|||
with tempfile.NamedTemporaryFile(suffix=".tar") as tf:
|
||||
self._bulk_download_fn(Path(tf.name))
|
||||
|
||||
# Defensive size cap: a misbehaving sandbox could produce an
|
||||
# arbitrarily large tar. Refuse to extract if it exceeds the cap.
|
||||
try:
|
||||
tar_size = os.path.getsize(tf.name)
|
||||
except OSError:
|
||||
tar_size = 0
|
||||
if tar_size > _SYNC_BACK_MAX_BYTES:
|
||||
logger.warning(
|
||||
"sync_back: remote tar is %d bytes (cap %d) — skipping extraction",
|
||||
tar_size, _SYNC_BACK_MAX_BYTES,
|
||||
)
|
||||
return
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="hermes-sync-back-") as staging:
|
||||
with tarfile.open(tf.name) as tar:
|
||||
tar.extractall(staging, filter="data")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue