mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor
This commit is contained in:
commit
41d3d7afb7
18 changed files with 2877 additions and 484 deletions
|
|
@ -7,6 +7,7 @@ and resumed on next creation, preserving the filesystem across sessions.
|
|||
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import shlex
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
|
@ -134,6 +135,7 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
upload_fn=self._daytona_upload,
|
||||
delete_fn=self._daytona_delete,
|
||||
bulk_upload_fn=self._daytona_bulk_upload,
|
||||
bulk_download_fn=self._daytona_bulk_download,
|
||||
)
|
||||
self._sync_manager.sync(force=True)
|
||||
self.init_session()
|
||||
|
|
@ -166,6 +168,22 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
]
|
||||
self._sandbox.fs.upload_files(uploads)
|
||||
|
||||
def _daytona_bulk_download(self, dest: Path) -> None:
    """Download the sandbox's .hermes/ tree as a tar archive into *dest*."""
    # Archive relative to / using the full path so entries keep their
    # absolute layout, matching the keys used for pushed-hash lookups.
    archive_root = f"{self._remote_home}/.hermes".lstrip("/")
    # PID suffix keeps concurrent sync_back invocations for the same
    # sandbox (e.g. a retry after partial failure) from clobbering
    # each other's temp archive.
    tmp_tar = f"/tmp/.hermes_sync.{os.getpid()}.tar"
    create_cmd = f"tar cf {shlex.quote(tmp_tar)} -C / {shlex.quote(archive_root)}"
    self._sandbox.process.exec(create_cmd)
    self._sandbox.fs.download_file(tmp_tar, str(dest))
    # Best-effort removal of the remote temp archive.
    try:
        self._sandbox.process.exec(f"rm -f {shlex.quote(tmp_tar)}")
    except Exception:
        pass
|
||||
|
||||
def _daytona_delete(self, remote_paths: list[str]) -> None:
    """Remove *remote_paths* inside the sandbox with a single exec call."""
    rm_cmd = quoted_rm_command(remote_paths)
    self._sandbox.process.exec(rm_cmd)
|
||||
|
|
@ -216,6 +234,18 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
with self._lock:
|
||||
if self._sandbox is None:
|
||||
return
|
||||
|
||||
# Sync remote changes back to host before teardown. Running
|
||||
# inside the lock (and after the _sandbox is None guard) avoids
|
||||
# firing sync_back on an already-cleaned-up env, which would
|
||||
# trigger a 3-attempt retry storm against a nil sandbox.
|
||||
if self._sync_manager:
|
||||
logger.info("Daytona: syncing files from sandbox...")
|
||||
try:
|
||||
self._sync_manager.sync_back()
|
||||
except Exception as e:
|
||||
logger.warning("Daytona: sync_back failed: %s", e)
|
||||
|
||||
try:
|
||||
if self._persistent:
|
||||
self._sandbox.stop()
|
||||
|
|
|
|||
|
|
@ -6,13 +6,25 @@ and Daytona. Docker and Singularity use bind mounts (live host FS
|
|||
view) and don't need this.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
import shutil
|
||||
import signal
|
||||
import tarfile
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
|
||||
try:
|
||||
import fcntl
|
||||
except ImportError:
|
||||
fcntl = None # Windows — file locking skipped
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
from tools.environments.base import _file_mtime_key
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -23,6 +35,7 @@ _FORCE_SYNC_ENV = "HERMES_FORCE_FILE_SYNC"
|
|||
# Transport callbacks provided by each backend
|
||||
UploadFn = Callable[[str, str], None] # (host_path, remote_path) -> raises on failure
|
||||
BulkUploadFn = Callable[[list[tuple[str, str]]], None] # [(host_path, remote_path), ...] -> raises on failure
|
||||
BulkDownloadFn = Callable[[Path], None] # (dest_tar_path) -> writes tar archive, raises on failure
|
||||
DeleteFn = Callable[[list[str]], None] # (remote_paths) -> raises on failure
|
||||
GetFilesFn = Callable[[], list[tuple[str, str]]] # () -> [(host_path, remote_path), ...]
|
||||
|
||||
|
|
@ -71,6 +84,20 @@ def unique_parent_dirs(files: list[tuple[str, str]]) -> list[str]:
|
|||
return sorted({str(Path(remote).parent) for _, remote in files})
|
||||
|
||||
|
||||
def _sha256_file(path: str) -> str:
|
||||
"""Return hex SHA-256 digest of a file."""
|
||||
h = hashlib.sha256()
|
||||
with open(path, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(65536), b""):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
_SYNC_BACK_MAX_RETRIES = 3
|
||||
_SYNC_BACK_BACKOFF = (2, 4, 8) # seconds between retries
|
||||
_SYNC_BACK_MAX_BYTES = 2 * 1024 * 1024 * 1024 # 2 GiB — refuse to extract larger tars
|
||||
|
||||
|
||||
class FileSyncManager:
|
||||
"""Tracks local file changes and syncs to a remote environment.
|
||||
|
||||
|
|
@ -89,12 +116,15 @@ class FileSyncManager:
|
|||
delete_fn: DeleteFn,
|
||||
sync_interval: float = _SYNC_INTERVAL_SECONDS,
|
||||
bulk_upload_fn: BulkUploadFn | None = None,
|
||||
bulk_download_fn: BulkDownloadFn | None = None,
|
||||
):
|
||||
self._get_files_fn = get_files_fn
|
||||
self._upload_fn = upload_fn
|
||||
self._bulk_upload_fn = bulk_upload_fn
|
||||
self._bulk_download_fn = bulk_download_fn
|
||||
self._delete_fn = delete_fn
|
||||
self._synced_files: dict[str, tuple[float, int]] = {} # remote_path -> (mtime, size)
|
||||
self._pushed_hashes: dict[str, str] = {} # remote_path -> sha256 hex digest
|
||||
self._last_sync_time: float = 0.0 # monotonic; 0 ensures first sync runs
|
||||
self._sync_interval = sync_interval
|
||||
|
||||
|
|
@ -136,6 +166,7 @@ class FileSyncManager:
|
|||
|
||||
# Snapshot for rollback (only when there's work to do)
|
||||
prev_files = dict(self._synced_files)
|
||||
prev_hashes = dict(self._pushed_hashes)
|
||||
|
||||
if to_upload:
|
||||
logger.debug("file_sync: uploading %d file(s)", len(to_upload))
|
||||
|
|
@ -156,13 +187,207 @@ class FileSyncManager:
|
|||
logger.debug("file_sync: deleted %s", to_delete)
|
||||
|
||||
# --- Commit (all succeeded) ---
|
||||
for host_path, remote_path in to_upload:
|
||||
self._pushed_hashes[remote_path] = _sha256_file(host_path)
|
||||
|
||||
for p in to_delete:
|
||||
new_files.pop(p, None)
|
||||
self._pushed_hashes.pop(p, None)
|
||||
|
||||
self._synced_files = new_files
|
||||
self._last_sync_time = time.monotonic()
|
||||
|
||||
except Exception as exc:
|
||||
self._synced_files = prev_files
|
||||
self._pushed_hashes = prev_hashes
|
||||
self._last_sync_time = time.monotonic()
|
||||
logger.warning("file_sync: sync failed, rolled back state: %s", exc)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Sync-back: pull remote changes to host on teardown
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def sync_back(self, hermes_home: Path | None = None) -> None:
|
||||
"""Pull remote changes back to the host filesystem.
|
||||
|
||||
Downloads the remote ``.hermes/`` directory as a tar archive,
|
||||
unpacks it, and applies only files that differ from what was
|
||||
originally pushed (based on SHA-256 content hashes).
|
||||
|
||||
Protected against SIGINT (defers the signal until complete) and
|
||||
serialized across concurrent gateway sandboxes via file lock.
|
||||
"""
|
||||
if self._bulk_download_fn is None:
|
||||
return
|
||||
|
||||
# Nothing was ever committed through this manager — the initial
|
||||
# push failed or never ran. Skip sync_back to avoid retry storms
|
||||
# against an uninitialized remote .hermes/ directory.
|
||||
if not self._pushed_hashes and not self._synced_files:
|
||||
logger.debug("sync_back: no prior push state — skipping")
|
||||
return
|
||||
|
||||
lock_path = (hermes_home or get_hermes_home()) / ".sync.lock"
|
||||
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
last_exc: Exception | None = None
|
||||
for attempt in range(_SYNC_BACK_MAX_RETRIES):
|
||||
try:
|
||||
self._sync_back_once(lock_path)
|
||||
return
|
||||
except Exception as exc:
|
||||
last_exc = exc
|
||||
if attempt < _SYNC_BACK_MAX_RETRIES - 1:
|
||||
delay = _SYNC_BACK_BACKOFF[attempt]
|
||||
logger.warning(
|
||||
"sync_back: attempt %d failed (%s), retrying in %ds",
|
||||
attempt + 1, exc, delay,
|
||||
)
|
||||
time.sleep(delay)
|
||||
|
||||
logger.warning("sync_back: all %d attempts failed: %s", _SYNC_BACK_MAX_RETRIES, last_exc)
|
||||
|
||||
def _sync_back_once(self, lock_path: Path) -> None:
|
||||
"""Single sync-back attempt with SIGINT protection and file lock."""
|
||||
# signal.signal() only works from the main thread. In gateway
|
||||
# contexts cleanup() may run from a worker thread — skip SIGINT
|
||||
# deferral there rather than crashing.
|
||||
on_main_thread = threading.current_thread() is threading.main_thread()
|
||||
|
||||
deferred_sigint: list[object] = []
|
||||
original_handler = None
|
||||
if on_main_thread:
|
||||
original_handler = signal.getsignal(signal.SIGINT)
|
||||
|
||||
def _defer_sigint(signum, frame):
|
||||
deferred_sigint.append((signum, frame))
|
||||
logger.debug("sync_back: SIGINT deferred until sync completes")
|
||||
|
||||
signal.signal(signal.SIGINT, _defer_sigint)
|
||||
try:
|
||||
self._sync_back_locked(lock_path)
|
||||
finally:
|
||||
if on_main_thread and original_handler is not None:
|
||||
signal.signal(signal.SIGINT, original_handler)
|
||||
if deferred_sigint:
|
||||
os.kill(os.getpid(), signal.SIGINT)
|
||||
|
||||
def _sync_back_locked(self, lock_path: Path) -> None:
|
||||
"""Sync-back under file lock (serializes concurrent gateways)."""
|
||||
if fcntl is None:
|
||||
# Windows: no flock — run without serialization
|
||||
self._sync_back_impl()
|
||||
return
|
||||
lock_fd = open(lock_path, "w")
|
||||
try:
|
||||
fcntl.flock(lock_fd, fcntl.LOCK_EX)
|
||||
self._sync_back_impl()
|
||||
finally:
|
||||
fcntl.flock(lock_fd, fcntl.LOCK_UN)
|
||||
lock_fd.close()
|
||||
|
||||
def _sync_back_impl(self) -> None:
    """Download, diff, and apply remote changes to host.

    Steps:
      1. Pull the remote ``.hermes/`` tree as one tar archive.
      2. Refuse to extract archives larger than ``_SYNC_BACK_MAX_BYTES``.
      3. Extract into a staging directory, then copy to the host only
         files whose content differs from what was pushed (compared by
         SHA-256 against ``_pushed_hashes``).

    Raises:
        RuntimeError: if called while ``bulk_download_fn`` is unset.
    """
    if self._bulk_download_fn is None:
        raise RuntimeError("_sync_back_impl called without bulk_download_fn")

    # Cache file mapping once to avoid O(n*m) from repeated iteration
    # over the provider for every staged file below.
    try:
        file_mapping = list(self._get_files_fn())
    except Exception:
        # Mapping provider failed — fall back to an empty mapping so
        # unknown files are skipped rather than crashing the sync.
        file_mapping = []

    with tempfile.NamedTemporaryFile(suffix=".tar") as tf:
        self._bulk_download_fn(Path(tf.name))

        # Defensive size cap: a misbehaving sandbox could produce an
        # arbitrarily large tar. Refuse to extract if it exceeds the cap.
        try:
            tar_size = os.path.getsize(tf.name)
        except OSError:
            tar_size = 0  # size unreadable — treat as within the cap
        if tar_size > _SYNC_BACK_MAX_BYTES:
            logger.warning(
                "sync_back: remote tar is %d bytes (cap %d) — skipping extraction",
                tar_size, _SYNC_BACK_MAX_BYTES,
            )
            return

        with tempfile.TemporaryDirectory(prefix="hermes-sync-back-") as staging:
            # filter="data" rejects unsafe members (absolute paths,
            # traversal, device nodes) during extraction.
            with tarfile.open(tf.name) as tar:
                tar.extractall(staging, filter="data")

            applied = 0
            for dirpath, _dirnames, filenames in os.walk(staging):
                for fname in filenames:
                    staged_file = os.path.join(dirpath, fname)
                    rel = os.path.relpath(staged_file, staging)
                    # Archive entries are rooted at /, so the staged
                    # relative path maps 1:1 to the remote absolute path.
                    remote_path = "/" + rel

                    pushed_hash = self._pushed_hashes.get(remote_path)

                    # Skip hashing for files unchanged from push
                    if pushed_hash is not None:
                        remote_hash = _sha256_file(staged_file)
                        if remote_hash == pushed_hash:
                            continue  # content unchanged since our push
                    else:
                        remote_hash = None  # new remote file

                    # Resolve host path from cached mapping
                    host_path = self._resolve_host_path(remote_path, file_mapping)
                    if host_path is None:
                        # No exact match — try prefix-based inference for
                        # files created remotely after the initial push.
                        host_path = self._infer_host_path(remote_path, file_mapping)
                    if host_path is None:
                        logger.debug(
                            "sync_back: skipping %s (no host mapping)",
                            remote_path,
                        )
                        continue

                    # Conflict detection: host changed since push AND the
                    # remote copy also changed — remote wins, warn loudly.
                    if os.path.exists(host_path) and pushed_hash is not None:
                        host_hash = _sha256_file(host_path)
                        if host_hash != pushed_hash:
                            logger.warning(
                                "sync_back: conflict on %s — host modified "
                                "since push, remote also changed. Applying "
                                "remote version (last-write-wins).",
                                remote_path,
                            )

                    os.makedirs(os.path.dirname(host_path), exist_ok=True)
                    # copy2 preserves mtime/mode metadata from staging.
                    shutil.copy2(staged_file, host_path)
                    applied += 1

            if applied:
                logger.info("sync_back: applied %d changed file(s)", applied)
            else:
                logger.debug("sync_back: no remote changes detected")
|
||||
|
||||
def _resolve_host_path(self, remote_path: str,
|
||||
file_mapping: list[tuple[str, str]] | None = None) -> str | None:
|
||||
"""Find the host path for a known remote path from the file mapping."""
|
||||
mapping = file_mapping if file_mapping is not None else []
|
||||
for host, remote in mapping:
|
||||
if remote == remote_path:
|
||||
return host
|
||||
return None
|
||||
|
||||
def _infer_host_path(self, remote_path: str,
|
||||
file_mapping: list[tuple[str, str]] | None = None) -> str | None:
|
||||
"""Infer a host path for a new remote file by matching path prefixes.
|
||||
|
||||
Uses the existing file mapping to find a remote->host directory
|
||||
pair, then applies the same prefix substitution to the new file.
|
||||
For example, if the mapping has ``/root/.hermes/skills/a.md`` →
|
||||
``~/.hermes/skills/a.md``, a new remote file at
|
||||
``/root/.hermes/skills/b.md`` maps to ``~/.hermes/skills/b.md``.
|
||||
"""
|
||||
mapping = file_mapping if file_mapping is not None else []
|
||||
for host, remote in mapping:
|
||||
remote_dir = str(Path(remote).parent)
|
||||
if remote_path.startswith(remote_dir + "/"):
|
||||
host_dir = str(Path(host).parent)
|
||||
suffix = remote_path[len(remote_dir):]
|
||||
return host_dir + suffix
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -269,6 +269,7 @@ class ModalEnvironment(BaseEnvironment):
|
|||
upload_fn=self._modal_upload,
|
||||
delete_fn=self._modal_delete,
|
||||
bulk_upload_fn=self._modal_bulk_upload,
|
||||
bulk_download_fn=self._modal_bulk_download,
|
||||
)
|
||||
self._sync_manager.sync(force=True)
|
||||
self.init_session()
|
||||
|
|
@ -347,6 +348,27 @@ class ModalEnvironment(BaseEnvironment):
|
|||
|
||||
self._worker.run_coroutine(_bulk(), timeout=120)
|
||||
|
||||
def _modal_bulk_download(self, dest: Path) -> None:
    """Download remote .hermes/ as a tar archive.

    Modal sandboxes always run as root, so /root/.hermes is hardcoded
    (consistent with iter_sync_files call on line 269).
    """
    async def _fetch_tar():
        proc = await self._sandbox.exec.aio("bash", "-c", "tar cf - -C / root/.hermes")
        payload = await proc.stdout.read.aio()
        status = await proc.wait.aio()
        if status != 0:
            raise RuntimeError(f"Modal bulk download failed (exit {status})")
        return payload

    result = self._worker.run_coroutine(_fetch_tar(), timeout=120)
    # NOTE(review): stdout may come back as str when the SDK decodes it
    # as text — re-encode so write_bytes always receives bytes.
    blob = result.encode() if isinstance(result, str) else result
    dest.write_bytes(blob)
|
||||
|
||||
def _modal_delete(self, remote_paths: list[str]) -> None:
|
||||
"""Batch-delete remote files via exec."""
|
||||
rm_cmd = quoted_rm_command(remote_paths)
|
||||
|
|
@ -404,6 +426,10 @@ class ModalEnvironment(BaseEnvironment):
|
|||
if self._sandbox is None:
|
||||
return
|
||||
|
||||
if self._sync_manager:
|
||||
logger.info("Modal: syncing files from sandbox...")
|
||||
self._sync_manager.sync_back()
|
||||
|
||||
if self._persistent:
|
||||
try:
|
||||
async def _snapshot():
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ class SSHEnvironment(BaseEnvironment):
|
|||
upload_fn=self._scp_upload,
|
||||
delete_fn=self._ssh_delete,
|
||||
bulk_upload_fn=self._ssh_bulk_upload,
|
||||
bulk_download_fn=self._ssh_bulk_download,
|
||||
)
|
||||
self._sync_manager.sync(force=True)
|
||||
|
||||
|
|
@ -216,6 +217,18 @@ class SSHEnvironment(BaseEnvironment):
|
|||
|
||||
logger.debug("SSH: bulk-uploaded %d file(s) via tar pipe", len(files))
|
||||
|
||||
def _ssh_bulk_download(self, dest: Path) -> None:
|
||||
"""Download remote .hermes/ as a tar archive."""
|
||||
# Tar from / with the full path so archive entries preserve absolute
|
||||
# paths (e.g. home/user/.hermes/skills/f.py), matching _pushed_hashes keys.
|
||||
rel_base = f"{self._remote_home}/.hermes".lstrip("/")
|
||||
ssh_cmd = self._build_ssh_command()
|
||||
ssh_cmd.append(f"tar cf - -C / {shlex.quote(rel_base)}")
|
||||
with open(dest, "wb") as f:
|
||||
result = subprocess.run(ssh_cmd, stdout=f, stderr=subprocess.PIPE, timeout=120)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"SSH bulk download failed: {result.stderr.decode(errors='replace').strip()}")
|
||||
|
||||
def _ssh_delete(self, remote_paths: list[str]) -> None:
|
||||
"""Batch-delete remote files in one SSH call."""
|
||||
cmd = self._build_ssh_command()
|
||||
|
|
@ -245,6 +258,10 @@ class SSHEnvironment(BaseEnvironment):
|
|||
return _popen_bash(cmd, stdin_data)
|
||||
|
||||
def cleanup(self):
|
||||
if self._sync_manager:
|
||||
logger.info("SSH: syncing files from sandbox...")
|
||||
self._sync_manager.sync_back()
|
||||
|
||||
if self.control_socket.exists():
|
||||
try:
|
||||
cmd = ["ssh", "-o", f"ControlPath={self.control_socket}",
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue