hermes-agent/tools/environments/daytona.py
kshitijk4poor d64446e315 feat(file-sync): sync remote changes back to host on teardown
Salvage of PR #8018 by @alt-glitch onto current main.

On sandbox teardown, FileSyncManager now downloads the remote .hermes/
directory, diffs against SHA-256 hashes of what was originally pushed,
and applies only changed files back to the host.

Core (tools/environments/file_sync.py):
- sync_back(): orchestrates download -> unpack -> diff -> apply with:
  - Retry with exponential backoff (3 attempts, 2s/4s/8s)
  - SIGINT trap + defer (prevents partial writes on Ctrl-C)
  - fcntl.flock serialization (concurrent gateway sandboxes)
  - Last-write-wins conflict resolution with warning
  - New remote files pulled back via _infer_host_path prefix matching

Backends:
- SSH: _ssh_bulk_download — tar cf - piped over SSH
- Modal: _modal_bulk_download — exec tar cf - -> proc.stdout.read
- Daytona: _daytona_bulk_download — exec tar cf -> SDK download_file
- All three call sync_back() at the top of cleanup()

Fixes applied during salvage (vs original PR #8018):

| # | Issue | Fix |
|---|-------|-----|
| C1 | import fcntl unconditional — crashes Windows | try/except with fallback; _sync_back_locked skips locking when fcntl=None |
| W1 | assert for runtime guard (stripped by -O) | Replaced with proper if/raise RuntimeError |
| W2 | O(n*m) from _get_files_fn() called per file | Cache mapping once at start of _sync_back_impl, pass to resolve/infer |
| W3 | Dead BulkDownloadFn imports in 3 backends | Removed unused imports |
| W4 | Modal hardcodes root/.hermes, no explanation | Added docstring comment explaining Modal always runs as root |
| S1 | SHA-256 computed for new files where pushed_hash=None | Skip hashing when pushed_hash is None (comparison always False) |
| S2 | Daytona /tmp/.hermes_sync.tar never cleaned up | Added rm -f after download (best-effort) |

Tests: 49 passing (17 new: _infer_host_path edge cases, SIGINT
main/worker thread, Windows fcntl=None fallback, Daytona tar cleanup).

Based on #8018 by @alt-glitch.
2026-04-16 19:39:21 -07:00

247 lines
8.7 KiB
Python

"""Daytona cloud execution environment.
Uses the Daytona Python SDK to run commands in cloud sandboxes.
Supports persistent sandboxes: when enabled, sandboxes are stopped on cleanup
and resumed on next creation, preserving the filesystem across sessions.
"""
import logging
import math
import shlex
import threading
from pathlib import Path
from tools.environments.base import (
BaseEnvironment,
_ThreadedProcessHandle,
)
from tools.environments.file_sync import (
FileSyncManager,
iter_sync_files,
quoted_mkdir_command,
quoted_rm_command,
unique_parent_dirs,
)
logger = logging.getLogger(__name__)
class DaytonaEnvironment(BaseEnvironment):
    """Daytona cloud sandbox execution backend.

    Spawn-per-call via _ThreadedProcessHandle wrapping blocking SDK calls.
    cancel_fn wired to sandbox.stop() for interrupt support.
    Shell timeout wrapper preserved (SDK timeout unreliable).
    """

    # Commands are delivered to the remote shell as a heredoc (interpreted
    # by BaseEnvironment — see _stdin_mode handling there).
    _stdin_mode = "heredoc"

    def __init__(
        self,
        image: str,
        cwd: str = "/home/daytona",
        timeout: int = 60,
        cpu: int = 1,
        memory: int = 5120,
        disk: int = 10240,
        persistent_filesystem: bool = True,
        task_id: str = "default",
    ):
        """Create or resume a Daytona sandbox for *task_id*.

        Args:
            image: Container image the sandbox is created from.
            cwd: Working directory inside the sandbox. "~" or the default
                "/home/daytona" are remapped to the detected remote $HOME.
            timeout: Default command timeout in seconds.
            cpu: vCPU count.
            memory: Memory in MiB (rounded up to whole GiB for the SDK).
            disk: Disk in MiB (rounded up to GiB; capped at the 10 GB
                platform limit with a warning).
            persistent_filesystem: When True, cleanup() stops the sandbox
                instead of deleting it, so the filesystem survives and the
                sandbox is resumed by name on the next construction.
            task_id: Stable identifier used both as a label and in the
                sandbox name ("hermes-<task_id>") for later lookup.
        """
        requested_cwd = cwd
        super().__init__(cwd=cwd, timeout=timeout)
        # Imported lazily so this module can be imported without the
        # Daytona SDK installed.
        from daytona import (
            Daytona,
            CreateSandboxFromImageParams,
            DaytonaError,
            Resources,
            SandboxState,
        )
        self._persistent = persistent_filesystem
        self._task_id = task_id
        self._SandboxState = SandboxState
        self._daytona = Daytona()
        self._sandbox = None
        self._lock = threading.Lock()
        # SDK takes whole GiB; round MiB up so we never under-provision.
        memory_gib = max(1, math.ceil(memory / 1024))
        disk_gib = max(1, math.ceil(disk / 1024))
        if disk_gib > 10:
            logger.warning(
                "Daytona: requested disk (%dGB) exceeds platform limit (10GB). "
                "Capping to 10GB.", disk_gib,
            )
            disk_gib = 10
        resources = Resources(cpu=cpu, memory=memory_gib, disk=disk_gib)
        labels = {"hermes_task_id": task_id}
        sandbox_name = f"hermes-{task_id}"
        if self._persistent:
            # 1) Preferred path: resume an existing sandbox by stable name.
            try:
                self._sandbox = self._daytona.get(sandbox_name)
                self._sandbox.start()
                logger.info("Daytona: resumed sandbox %s for task %s",
                            self._sandbox.id, task_id)
            except DaytonaError:
                # No sandbox with that name yet — fall through to lookup/create.
                self._sandbox = None
            except Exception as e:
                logger.warning("Daytona: failed to resume sandbox for task %s: %s",
                               task_id, e)
                self._sandbox = None
            # 2) Fallback: older sandboxes were labelled but not named.
            if self._sandbox is None:
                try:
                    page = self._daytona.list(labels=labels, page=1, limit=1)
                    if page.items:
                        self._sandbox = page.items[0]
                        self._sandbox.start()
                        logger.info("Daytona: resumed legacy sandbox %s for task %s",
                                    self._sandbox.id, task_id)
                except Exception as e:
                    logger.debug("Daytona: no legacy sandbox found for task %s: %s",
                                 task_id, e)
                    self._sandbox = None
        # 3) Nothing resumed (or persistence disabled): create a fresh sandbox.
        if self._sandbox is None:
            self._sandbox = self._daytona.create(
                CreateSandboxFromImageParams(
                    image=image,
                    name=sandbox_name,
                    labels=labels,
                    auto_stop_interval=0,
                    resources=resources,
                )
            )
            logger.info("Daytona: created sandbox %s for task %s",
                        self._sandbox.id, task_id)
        # Detect remote home dir (image may not run as the default user);
        # remap the default/"~" cwd onto it.
        self._remote_home = "/root"
        try:
            home = self._sandbox.process.exec("echo $HOME").result.strip()
            if home:
                self._remote_home = home
                if requested_cwd in ("~", "/home/daytona"):
                    self.cwd = home
        except Exception:
            pass  # best-effort detection; keep the /root fallback
        logger.info("Daytona: resolved home to %s, cwd to %s", self._remote_home, self.cwd)
        self._sync_manager = FileSyncManager(
            get_files_fn=lambda: iter_sync_files(f"{self._remote_home}/.hermes"),
            upload_fn=self._daytona_upload,
            delete_fn=self._daytona_delete,
            bulk_upload_fn=self._daytona_bulk_upload,
            bulk_download_fn=self._daytona_bulk_download,
        )
        self._sync_manager.sync(force=True)
        self.init_session()

    def _daytona_upload(self, host_path: str, remote_path: str) -> None:
        """Upload a single file via Daytona SDK."""
        parent = str(Path(remote_path).parent)
        # Quote the path: an unquoted parent breaks on spaces/metacharacters
        # (and keeps this consistent with quoted_mkdir_command in bulk upload).
        self._sandbox.process.exec(f"mkdir -p {shlex.quote(parent)}")
        self._sandbox.fs.upload_file(host_path, remote_path)

    def _daytona_bulk_upload(self, files: list[tuple[str, str]]) -> None:
        """Upload many files in a single HTTP call via Daytona SDK.

        Uses ``sandbox.fs.upload_files()`` which batches all files into one
        multipart POST, avoiding per-file TLS/HTTP overhead (~580 files
        goes from ~5 min to <2 s).

        Args:
            files: (host_path, remote_path) pairs; parent directories are
                created remotely in one mkdir call before uploading.
        """
        from daytona.common.filesystem import FileUpload
        if not files:
            return
        parents = unique_parent_dirs(files)
        if parents:
            self._sandbox.process.exec(quoted_mkdir_command(parents))
        uploads = [
            FileUpload(source=host_path, destination=remote_path)
            for host_path, remote_path in files
        ]
        self._sandbox.fs.upload_files(uploads)

    def _daytona_bulk_download(self, dest: Path) -> None:
        """Download remote .hermes/ as a tar archive into *dest* on the host."""
        # tar from / with a relative path so archive entries unpack cleanly.
        rel_base = f"{self._remote_home}/.hermes".lstrip("/")
        self._sandbox.process.exec(
            f"tar cf /tmp/.hermes_sync.tar -C / {shlex.quote(rel_base)}"
        )
        self._sandbox.fs.download_file("/tmp/.hermes_sync.tar", str(dest))
        # Clean up remote temp file
        try:
            self._sandbox.process.exec("rm -f /tmp/.hermes_sync.tar")
        except Exception:
            pass  # best-effort cleanup

    def _daytona_delete(self, remote_paths: list[str]) -> None:
        """Batch-delete remote files via SDK exec."""
        self._sandbox.process.exec(quoted_rm_command(remote_paths))

    # ------------------------------------------------------------------
    # Sandbox lifecycle
    # ------------------------------------------------------------------
    def _ensure_sandbox_ready(self) -> None:
        """Restart sandbox if it was stopped (e.g., by a previous interrupt)."""
        self._sandbox.refresh_data()
        if self._sandbox.state in (self._SandboxState.STOPPED, self._SandboxState.ARCHIVED):
            self._sandbox.start()
            logger.info("Daytona: restarted sandbox %s", self._sandbox.id)

    def _before_execute(self) -> None:
        """Ensure sandbox is ready, then sync files via FileSyncManager."""
        with self._lock:
            self._ensure_sandbox_ready()
        self._sync_manager.sync()

    def _run_bash(self, cmd_string: str, *, login: bool = False,
                  timeout: int = 120,
                  stdin_data: str | None = None):
        """Return a _ThreadedProcessHandle wrapping a blocking Daytona SDK call.

        Args:
            cmd_string: Shell command to run (stdin is presumably already
                embedded as a heredoc by the base class — note stdin_data is
                unused here).
            login: Run under ``bash -l`` so login profiles are sourced.
            timeout: Seconds passed to the SDK exec call.
            stdin_data: Accepted for interface compatibility; not used.
        """
        sandbox = self._sandbox
        lock = self._lock

        def cancel():
            # Stopping the sandbox is the only way to interrupt a blocking
            # SDK exec; _ensure_sandbox_ready() restarts it on next use.
            with lock:
                try:
                    sandbox.stop()
                except Exception:
                    pass

        if login:
            shell_cmd = f"bash -l -c {shlex.quote(cmd_string)}"
        else:
            shell_cmd = f"bash -c {shlex.quote(cmd_string)}"

        def exec_fn() -> tuple[str, int]:
            response = sandbox.process.exec(shell_cmd, timeout=timeout)
            return (response.result or "", response.exit_code)

        return _ThreadedProcessHandle(exec_fn, cancel_fn=cancel)

    def cleanup(self):
        """Sync remote changes back to host, then stop or delete the sandbox.

        Persistent mode stops the sandbox (filesystem preserved for resume);
        otherwise the sandbox is deleted. Sync-back failures are logged but
        never abort teardown — otherwise a failed sync would leak a running
        sandbox.
        """
        # getattr: __init__ may have failed before _sync_manager was assigned.
        sync_manager = getattr(self, "_sync_manager", None)
        if sync_manager:
            logger.info("Daytona: syncing files from sandbox...")
            try:
                sync_manager.sync_back()
            except Exception as e:
                logger.warning("Daytona: sync-back failed: %s", e)
        with self._lock:
            if self._sandbox is None:
                return
            try:
                if self._persistent:
                    self._sandbox.stop()
                    logger.info("Daytona: stopped sandbox %s (filesystem preserved)",
                                self._sandbox.id)
                else:
                    self._daytona.delete(self._sandbox)
                    logger.info("Daytona: deleted sandbox %s", self._sandbox.id)
            except Exception as e:
                logger.warning("Daytona: cleanup failed: %s", e)
            self._sandbox = None