mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-02 02:01:47 +00:00
feat(environments): unified spawn-per-call execution layer
Replace dual execution model (PersistentShellMixin + per-backend oneshot) with spawn-per-call + session snapshot for all backends except ManagedModal. Core changes: - Every command spawns a fresh bash process; session snapshot (env vars, functions, aliases) captured at init and re-sourced before each command - CWD persists via file-based read (local) or in-band stdout markers (remote) - ProcessHandle protocol + _ThreadedProcessHandle adapter for SDK backends - cancel_fn wired for Modal (sandbox.terminate) and Daytona (sandbox.stop) - Shared utilities extracted: _pipe_stdin, _popen_bash, _load_json_store, _save_json_store, _file_mtime_key, _SYNC_INTERVAL_SECONDS - Rate-limited file sync unified in base _before_execute() with _sync_files() hook - execute_oneshot() removed; all 11 call sites in code_execution_tool.py migrated to execute() - Daytona timeout wrapper replaced with SDK-native timeout parameter - persistent_shell.py deleted (291 lines) Backend-specific: - Local: process-group kill via os.killpg, file-based CWD read - Docker: -e env flags only on init_session, not per-command - SSH: shlex.quote transport, ControlMaster connection reuse - Singularity: apptainer exec with instance://, no forced --pwd - Modal: _AsyncWorker + _ThreadedProcessHandle, cancel_fn -> sandbox.terminate - Daytona: SDK-level timeout (not shell wrapper), cancel_fn -> sandbox.stop - ManagedModal: unchanged (gateway owns execution); docstring added explaining why
This commit is contained in:
parent
7d26feb9a3
commit
d684d7ee7e
17 changed files with 1170 additions and 1686 deletions
|
|
@ -5,19 +5,19 @@ wrapper, while preserving Hermes' persistent snapshot behavior across sessions.
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import shlex
|
||||
import threading
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hermes_constants import get_hermes_home
|
||||
from tools.environments.modal_common import (
|
||||
BaseModalExecutionEnvironment,
|
||||
ModalExecStart,
|
||||
PreparedModalExec,
|
||||
from tools.environments.base import (
|
||||
BaseEnvironment,
|
||||
_ThreadedProcessHandle,
|
||||
_file_mtime_key,
|
||||
_load_json_store,
|
||||
_save_json_store,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -26,20 +26,12 @@ _SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"
|
|||
_DIRECT_SNAPSHOT_NAMESPACE = "direct"
|
||||
|
||||
|
||||
def _load_snapshots() -> Dict[str, str]:
|
||||
"""Load snapshot ID mapping from disk."""
|
||||
if _SNAPSHOT_STORE.exists():
|
||||
try:
|
||||
return json.loads(_SNAPSHOT_STORE.read_text())
|
||||
except Exception:
|
||||
pass
|
||||
return {}
|
||||
def _load_snapshots() -> dict:
|
||||
return _load_json_store(_SNAPSHOT_STORE)
|
||||
|
||||
|
||||
def _save_snapshots(data: Dict[str, str]) -> None:
|
||||
"""Persist snapshot ID mapping to disk."""
|
||||
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
|
||||
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
|
||||
def _save_snapshots(data: dict) -> None:
|
||||
_save_json_store(_SNAPSHOT_STORE, data)
|
||||
|
||||
|
||||
def _direct_snapshot_key(task_id: str) -> str:
|
||||
|
|
@ -47,23 +39,18 @@ def _direct_snapshot_key(task_id: str) -> str:
|
|||
|
||||
|
||||
def _get_snapshot_restore_candidate(task_id: str) -> tuple[str | None, bool]:
|
||||
"""Return a snapshot id and whether it came from the legacy key format."""
|
||||
snapshots = _load_snapshots()
|
||||
|
||||
namespaced_key = _direct_snapshot_key(task_id)
|
||||
snapshot_id = snapshots.get(namespaced_key)
|
||||
if isinstance(snapshot_id, str) and snapshot_id:
|
||||
return snapshot_id, False
|
||||
|
||||
legacy_snapshot_id = snapshots.get(task_id)
|
||||
if isinstance(legacy_snapshot_id, str) and legacy_snapshot_id:
|
||||
return legacy_snapshot_id, True
|
||||
|
||||
return None, False
|
||||
|
||||
|
||||
def _store_direct_snapshot(task_id: str, snapshot_id: str) -> None:
|
||||
"""Persist the direct Modal snapshot id under the direct namespace."""
|
||||
snapshots = _load_snapshots()
|
||||
snapshots[_direct_snapshot_key(task_id)] = snapshot_id
|
||||
snapshots.pop(task_id, None)
|
||||
|
|
@ -71,10 +58,8 @@ def _store_direct_snapshot(task_id: str, snapshot_id: str) -> None:
|
|||
|
||||
|
||||
def _delete_direct_snapshot(task_id: str, snapshot_id: str | None = None) -> None:
|
||||
"""Remove direct Modal snapshot entries for a task, including legacy keys."""
|
||||
snapshots = _load_snapshots()
|
||||
updated = False
|
||||
|
||||
for key in (_direct_snapshot_key(task_id), task_id):
|
||||
value = snapshots.get(key)
|
||||
if value is None:
|
||||
|
|
@ -82,13 +67,15 @@ def _delete_direct_snapshot(task_id: str, snapshot_id: str | None = None) -> Non
|
|||
if snapshot_id is None or value == snapshot_id:
|
||||
snapshots.pop(key, None)
|
||||
updated = True
|
||||
|
||||
if updated:
|
||||
_save_snapshots(snapshots)
|
||||
|
||||
|
||||
def _resolve_modal_image(image_spec: Any) -> Any:
|
||||
"""Convert registry references or snapshot ids into Modal image objects."""
|
||||
"""Convert registry references or snapshot ids into Modal image objects.
|
||||
|
||||
Includes add_python support for ubuntu/debian images (absorbed from PR 4511).
|
||||
"""
|
||||
import modal as _modal
|
||||
|
||||
if not isinstance(image_spec, str):
|
||||
|
|
@ -97,12 +84,22 @@ def _resolve_modal_image(image_spec: Any) -> Any:
|
|||
if image_spec.startswith("im-"):
|
||||
return _modal.Image.from_id(image_spec)
|
||||
|
||||
# PR 4511: add python to ubuntu/debian images that don't have it
|
||||
lower = image_spec.lower()
|
||||
add_python = any(base in lower for base in ("ubuntu", "debian"))
|
||||
|
||||
setup_commands = [
|
||||
"RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
|
||||
"python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
|
||||
]
|
||||
if add_python:
|
||||
setup_commands.insert(0,
|
||||
"RUN apt-get update -qq && apt-get install -y -qq python3 python3-venv > /dev/null 2>&1 || true"
|
||||
)
|
||||
|
||||
return _modal.Image.from_registry(
|
||||
image_spec,
|
||||
setup_dockerfile_commands=[
|
||||
"RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
|
||||
"python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
|
||||
],
|
||||
setup_dockerfile_commands=setup_commands,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -138,19 +135,15 @@ class _AsyncWorker:
|
|||
self._thread.join(timeout=10)
|
||||
|
||||
|
||||
@dataclass
|
||||
class _DirectModalExecHandle:
|
||||
thread: threading.Thread
|
||||
result_holder: Dict[str, Any]
|
||||
class ModalEnvironment(BaseEnvironment):
|
||||
"""Modal cloud execution via native Modal sandboxes.
|
||||
|
||||
|
||||
class ModalEnvironment(BaseModalExecutionEnvironment):
|
||||
"""Modal cloud execution via native Modal sandboxes."""
|
||||
Spawn-per-call via _ThreadedProcessHandle wrapping async SDK calls.
|
||||
cancel_fn wired to sandbox.terminate for interrupt support.
|
||||
"""
|
||||
|
||||
_stdin_mode = "heredoc"
|
||||
_poll_interval_seconds = 0.2
|
||||
_interrupt_output = "[Command interrupted - Modal sandbox terminated]"
|
||||
_unexpected_error_prefix = "Modal execution error"
|
||||
_snapshot_timeout = 60 # Modal cold starts can be slow
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -170,6 +163,7 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
|
|||
self._app = None
|
||||
self._worker = _AsyncWorker()
|
||||
self._synced_files: Dict[str, tuple] = {}
|
||||
self._last_sync_time: float = 0
|
||||
|
||||
sandbox_kwargs = dict(modal_sandbox_kwargs or {})
|
||||
|
||||
|
|
@ -199,27 +193,13 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
|
|||
remote_path=mount_entry["container_path"],
|
||||
)
|
||||
)
|
||||
logger.info(
|
||||
"Modal: mounting credential %s -> %s",
|
||||
mount_entry["host_path"],
|
||||
mount_entry["container_path"],
|
||||
)
|
||||
|
||||
# Mount individual skill files (symlinks filtered out).
|
||||
skills_files = iter_skills_files()
|
||||
for entry in skills_files:
|
||||
for entry in iter_skills_files():
|
||||
cred_mounts.append(
|
||||
_modal.Mount.from_local_file(
|
||||
entry["host_path"],
|
||||
remote_path=entry["container_path"],
|
||||
)
|
||||
)
|
||||
if skills_files:
|
||||
logger.info("Modal: mounting %d skill files", len(skills_files))
|
||||
|
||||
# Mount host-side cache files (documents, images, audio,
|
||||
# screenshots). New files arriving mid-session are picked up
|
||||
# by _sync_files() before each command execution.
|
||||
cache_files = iter_cache_files()
|
||||
for entry in cache_files:
|
||||
cred_mounts.append(
|
||||
|
|
@ -228,8 +208,6 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
|
|||
remote_path=entry["container_path"],
|
||||
)
|
||||
)
|
||||
if cache_files:
|
||||
logger.info("Modal: mounting %d cache files", len(cache_files))
|
||||
except Exception as e:
|
||||
logger.debug("Modal: could not load credential file mounts: %s", e)
|
||||
|
||||
|
|
@ -243,8 +221,7 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
|
|||
existing_mounts.extend(cred_mounts)
|
||||
create_kwargs["mounts"] = existing_mounts
|
||||
sandbox = await _modal.Sandbox.create.aio(
|
||||
"sleep",
|
||||
"infinity",
|
||||
"sleep", "infinity",
|
||||
image=image_spec,
|
||||
app=app,
|
||||
timeout=int(create_kwargs.pop("timeout", 3600)),
|
||||
|
|
@ -255,57 +232,41 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
|
|||
try:
|
||||
target_image_spec = restored_snapshot_id or image
|
||||
try:
|
||||
# _resolve_modal_image keeps the Modal bootstrap fix together:
|
||||
# it applies setup_dockerfile_commands with ensurepip before
|
||||
# Modal builds registry images, while snapshot ids restore via
|
||||
# modal.Image.from_id() without rebuilding.
|
||||
effective_image = _resolve_modal_image(target_image_spec)
|
||||
self._app, self._sandbox = self._worker.run_coroutine(
|
||||
_create_sandbox(effective_image),
|
||||
timeout=300,
|
||||
_create_sandbox(effective_image), timeout=300,
|
||||
)
|
||||
except Exception as exc:
|
||||
if not restored_snapshot_id:
|
||||
raise
|
||||
|
||||
logger.warning(
|
||||
"Modal: failed to restore snapshot %s, retrying with base image: %s",
|
||||
restored_snapshot_id[:20],
|
||||
exc,
|
||||
restored_snapshot_id[:20], exc,
|
||||
)
|
||||
_delete_direct_snapshot(self._task_id, restored_snapshot_id)
|
||||
base_image = _resolve_modal_image(image)
|
||||
self._app, self._sandbox = self._worker.run_coroutine(
|
||||
_create_sandbox(base_image),
|
||||
timeout=300,
|
||||
_create_sandbox(base_image), timeout=300,
|
||||
)
|
||||
else:
|
||||
if restored_snapshot_id and restored_from_legacy_key:
|
||||
_store_direct_snapshot(self._task_id, restored_snapshot_id)
|
||||
logger.info(
|
||||
"Modal: migrated legacy snapshot entry for task %s",
|
||||
self._task_id,
|
||||
)
|
||||
except Exception:
|
||||
self._worker.stop()
|
||||
raise
|
||||
|
||||
logger.info("Modal: sandbox created (task=%s)", self._task_id)
|
||||
self.init_session()
|
||||
|
||||
def _push_file_to_sandbox(self, host_path: str, container_path: str) -> bool:
|
||||
"""Push a single file into the sandbox if changed. Returns True if synced."""
|
||||
hp = Path(host_path)
|
||||
try:
|
||||
stat = hp.stat()
|
||||
file_key = (stat.st_mtime, stat.st_size)
|
||||
except OSError:
|
||||
"""Push a single file into the sandbox if changed."""
|
||||
file_key = _file_mtime_key(host_path)
|
||||
if file_key is None:
|
||||
return False
|
||||
|
||||
if self._synced_files.get(container_path) == file_key:
|
||||
return False
|
||||
|
||||
try:
|
||||
content = hp.read_bytes()
|
||||
content = Path(host_path).read_bytes()
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
|
@ -326,85 +287,55 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
|
|||
return True
|
||||
|
||||
def _sync_files(self) -> None:
|
||||
"""Push credential, skill, and cache files into the running sandbox.
|
||||
|
||||
Runs before each command. Uses mtime+size caching so only changed
|
||||
files are pushed (~13μs overhead in the no-op case). Cache files
|
||||
are especially important here — new uploads/screenshots may appear
|
||||
mid-session after sandbox creation.
|
||||
"""
|
||||
"""Push credential, skill, and cache files into the running sandbox."""
|
||||
try:
|
||||
from tools.credential_files import (
|
||||
get_credential_file_mounts,
|
||||
iter_skills_files,
|
||||
iter_cache_files,
|
||||
)
|
||||
|
||||
for entry in get_credential_file_mounts():
|
||||
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
|
||||
logger.debug("Modal: synced credential %s", entry["container_path"])
|
||||
|
||||
self._push_file_to_sandbox(entry["host_path"], entry["container_path"])
|
||||
for entry in iter_skills_files():
|
||||
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
|
||||
logger.debug("Modal: synced skill file %s", entry["container_path"])
|
||||
|
||||
self._push_file_to_sandbox(entry["host_path"], entry["container_path"])
|
||||
for entry in iter_cache_files():
|
||||
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
|
||||
logger.debug("Modal: synced cache file %s", entry["container_path"])
|
||||
self._push_file_to_sandbox(entry["host_path"], entry["container_path"])
|
||||
except Exception as e:
|
||||
logger.debug("Modal: file sync failed: %s", e)
|
||||
|
||||
def _before_execute(self) -> None:
|
||||
self._sync_files()
|
||||
def _run_bash(self, cmd_string: str, *, login: bool = False,
|
||||
timeout: int = 120,
|
||||
stdin_data: str | None = None):
|
||||
"""Return a _ThreadedProcessHandle wrapping an async Modal sandbox exec."""
|
||||
sandbox = self._sandbox
|
||||
worker = self._worker
|
||||
|
||||
def _start_modal_exec(self, prepared: PreparedModalExec) -> ModalExecStart:
|
||||
full_command = f"cd {shlex.quote(prepared.cwd)} && {prepared.command}"
|
||||
result_holder = {"value": None, "error": None}
|
||||
def cancel():
|
||||
worker.run_coroutine(sandbox.terminate.aio(), timeout=15)
|
||||
|
||||
def _run():
|
||||
try:
|
||||
async def _do_execute():
|
||||
process = await self._sandbox.exec.aio(
|
||||
"bash",
|
||||
"-c",
|
||||
full_command,
|
||||
timeout=prepared.timeout,
|
||||
)
|
||||
stdout = await process.stdout.read.aio()
|
||||
stderr = await process.stderr.read.aio()
|
||||
exit_code = await process.wait.aio()
|
||||
if isinstance(stdout, bytes):
|
||||
stdout = stdout.decode("utf-8", errors="replace")
|
||||
if isinstance(stderr, bytes):
|
||||
stderr = stderr.decode("utf-8", errors="replace")
|
||||
output = stdout
|
||||
if stderr:
|
||||
output = f"{stdout}\n{stderr}" if stdout else stderr
|
||||
return self._result(output, exit_code)
|
||||
def exec_fn() -> tuple[str, int]:
|
||||
async def _do():
|
||||
args = ["bash"]
|
||||
if login:
|
||||
args.extend(["-l", "-c", cmd_string])
|
||||
else:
|
||||
args.extend(["-c", cmd_string])
|
||||
process = await sandbox.exec.aio(*args, timeout=timeout)
|
||||
stdout = await process.stdout.read.aio()
|
||||
stderr = await process.stderr.read.aio()
|
||||
exit_code = await process.wait.aio()
|
||||
if isinstance(stdout, bytes):
|
||||
stdout = stdout.decode("utf-8", errors="replace")
|
||||
if isinstance(stderr, bytes):
|
||||
stderr = stderr.decode("utf-8", errors="replace")
|
||||
output = stdout
|
||||
if stderr:
|
||||
output = f"{stdout}\n{stderr}" if stdout else stderr
|
||||
return output, exit_code
|
||||
|
||||
result_holder["value"] = self._worker.run_coroutine(
|
||||
_do_execute(),
|
||||
timeout=prepared.timeout + 30,
|
||||
)
|
||||
except Exception as e:
|
||||
result_holder["error"] = e
|
||||
return worker.run_coroutine(_do(), timeout=timeout + 30)
|
||||
|
||||
t = threading.Thread(target=_run, daemon=True)
|
||||
t.start()
|
||||
return ModalExecStart(handle=_DirectModalExecHandle(thread=t, result_holder=result_holder))
|
||||
|
||||
def _poll_modal_exec(self, handle: _DirectModalExecHandle) -> dict | None:
|
||||
if handle.thread.is_alive():
|
||||
return None
|
||||
if handle.result_holder["error"]:
|
||||
return self._error_result(f"Modal execution error: {handle.result_holder['error']}")
|
||||
return handle.result_holder["value"]
|
||||
|
||||
def _cancel_modal_exec(self, handle: _DirectModalExecHandle) -> None:
|
||||
self._worker.run_coroutine(
|
||||
self._sandbox.terminate.aio(),
|
||||
timeout=15,
|
||||
)
|
||||
return _ThreadedProcessHandle(exec_fn, cancel_fn=cancel)
|
||||
|
||||
def cleanup(self):
|
||||
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
|
||||
|
|
@ -426,17 +357,13 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
|
|||
_store_direct_snapshot(self._task_id, snapshot_id)
|
||||
logger.info(
|
||||
"Modal: saved filesystem snapshot %s for task %s",
|
||||
snapshot_id[:20],
|
||||
self._task_id,
|
||||
snapshot_id[:20], self._task_id,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("Modal: filesystem snapshot failed: %s", e)
|
||||
|
||||
try:
|
||||
self._worker.run_coroutine(
|
||||
self._sandbox.terminate.aio(),
|
||||
timeout=15,
|
||||
)
|
||||
self._worker.run_coroutine(self._sandbox.terminate.aio(), timeout=15)
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue