feat(environments): unified spawn-per-call execution layer

Replace dual execution model (PersistentShellMixin + per-backend oneshot)
with spawn-per-call + session snapshot for all backends except ManagedModal.

Core changes:
- Every command spawns a fresh bash process; session snapshot (env vars,
  functions, aliases) captured at init and re-sourced before each command
- CWD persists via file-based read (local) or in-band stdout markers (remote)
- ProcessHandle protocol + _ThreadedProcessHandle adapter for SDK backends
- cancel_fn wired for Modal (sandbox.terminate) and Daytona (sandbox.stop)
- Shared utilities extracted: _pipe_stdin, _popen_bash, _load_json_store,
  _save_json_store, _file_mtime_key, _SYNC_INTERVAL_SECONDS
- Rate-limited file sync unified in base _before_execute() with _sync_files() hook
- execute_oneshot() removed; all 11 call sites in code_execution_tool.py
  migrated to execute()
- Daytona timeout wrapper replaced with SDK-native timeout parameter
- persistent_shell.py deleted (291 lines)

Backend-specific:
- Local: process-group kill via os.killpg, file-based CWD read
- Docker: -e env flags only on init_session, not per-command
- SSH: shlex.quote transport, ControlMaster connection reuse
- Singularity: apptainer exec with instance://, no forced --pwd
- Modal: _AsyncWorker + _ThreadedProcessHandle, cancel_fn -> sandbox.terminate
- Daytona: SDK-level timeout (not shell wrapper), cancel_fn -> sandbox.stop
- ManagedModal: unchanged (gateway owns execution); docstring added explaining why
This commit is contained in:
alt-glitch 2026-04-08 13:38:04 -07:00 committed by Teknium
parent 7d26feb9a3
commit d684d7ee7e
17 changed files with 1170 additions and 1686 deletions

View file

@ -5,19 +5,19 @@ wrapper, while preserving Hermes' persistent snapshot behavior across sessions.
"""
import asyncio
import json
import logging
import shlex
import threading
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional
from hermes_constants import get_hermes_home
from tools.environments.modal_common import (
BaseModalExecutionEnvironment,
ModalExecStart,
PreparedModalExec,
from tools.environments.base import (
BaseEnvironment,
_ThreadedProcessHandle,
_file_mtime_key,
_load_json_store,
_save_json_store,
)
logger = logging.getLogger(__name__)
@ -26,20 +26,12 @@ _SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"
_DIRECT_SNAPSHOT_NAMESPACE = "direct"
def _load_snapshots() -> Dict[str, str]:
"""Load snapshot ID mapping from disk."""
if _SNAPSHOT_STORE.exists():
try:
return json.loads(_SNAPSHOT_STORE.read_text())
except Exception:
pass
return {}
def _load_snapshots() -> dict:
return _load_json_store(_SNAPSHOT_STORE)
def _save_snapshots(data: Dict[str, str]) -> None:
"""Persist snapshot ID mapping to disk."""
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
def _save_snapshots(data: dict) -> None:
_save_json_store(_SNAPSHOT_STORE, data)
def _direct_snapshot_key(task_id: str) -> str:
@ -47,23 +39,18 @@ def _direct_snapshot_key(task_id: str) -> str:
def _get_snapshot_restore_candidate(task_id: str) -> tuple[str | None, bool]:
"""Return a snapshot id and whether it came from the legacy key format."""
snapshots = _load_snapshots()
namespaced_key = _direct_snapshot_key(task_id)
snapshot_id = snapshots.get(namespaced_key)
if isinstance(snapshot_id, str) and snapshot_id:
return snapshot_id, False
legacy_snapshot_id = snapshots.get(task_id)
if isinstance(legacy_snapshot_id, str) and legacy_snapshot_id:
return legacy_snapshot_id, True
return None, False
def _store_direct_snapshot(task_id: str, snapshot_id: str) -> None:
"""Persist the direct Modal snapshot id under the direct namespace."""
snapshots = _load_snapshots()
snapshots[_direct_snapshot_key(task_id)] = snapshot_id
snapshots.pop(task_id, None)
@ -71,10 +58,8 @@ def _store_direct_snapshot(task_id: str, snapshot_id: str) -> None:
def _delete_direct_snapshot(task_id: str, snapshot_id: str | None = None) -> None:
"""Remove direct Modal snapshot entries for a task, including legacy keys."""
snapshots = _load_snapshots()
updated = False
for key in (_direct_snapshot_key(task_id), task_id):
value = snapshots.get(key)
if value is None:
@ -82,13 +67,15 @@ def _delete_direct_snapshot(task_id: str, snapshot_id: str | None = None) -> Non
if snapshot_id is None or value == snapshot_id:
snapshots.pop(key, None)
updated = True
if updated:
_save_snapshots(snapshots)
def _resolve_modal_image(image_spec: Any) -> Any:
"""Convert registry references or snapshot ids into Modal image objects."""
"""Convert registry references or snapshot ids into Modal image objects.
Includes add_python support for ubuntu/debian images (absorbed from PR 4511).
"""
import modal as _modal
if not isinstance(image_spec, str):
@ -97,12 +84,22 @@ def _resolve_modal_image(image_spec: Any) -> Any:
if image_spec.startswith("im-"):
return _modal.Image.from_id(image_spec)
# PR 4511: add python to ubuntu/debian images that don't have it
lower = image_spec.lower()
add_python = any(base in lower for base in ("ubuntu", "debian"))
setup_commands = [
"RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
"python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
]
if add_python:
setup_commands.insert(0,
"RUN apt-get update -qq && apt-get install -y -qq python3 python3-venv > /dev/null 2>&1 || true"
)
return _modal.Image.from_registry(
image_spec,
setup_dockerfile_commands=[
"RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
"python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
],
setup_dockerfile_commands=setup_commands,
)
@ -138,19 +135,15 @@ class _AsyncWorker:
self._thread.join(timeout=10)
@dataclass
class _DirectModalExecHandle:
thread: threading.Thread
result_holder: Dict[str, Any]
class ModalEnvironment(BaseEnvironment):
"""Modal cloud execution via native Modal sandboxes.
class ModalEnvironment(BaseModalExecutionEnvironment):
"""Modal cloud execution via native Modal sandboxes."""
Spawn-per-call via _ThreadedProcessHandle wrapping async SDK calls.
cancel_fn wired to sandbox.terminate for interrupt support.
"""
_stdin_mode = "heredoc"
_poll_interval_seconds = 0.2
_interrupt_output = "[Command interrupted - Modal sandbox terminated]"
_unexpected_error_prefix = "Modal execution error"
_snapshot_timeout = 60 # Modal cold starts can be slow
def __init__(
self,
@ -170,6 +163,7 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
self._app = None
self._worker = _AsyncWorker()
self._synced_files: Dict[str, tuple] = {}
self._last_sync_time: float = 0
sandbox_kwargs = dict(modal_sandbox_kwargs or {})
@ -199,27 +193,13 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
remote_path=mount_entry["container_path"],
)
)
logger.info(
"Modal: mounting credential %s -> %s",
mount_entry["host_path"],
mount_entry["container_path"],
)
# Mount individual skill files (symlinks filtered out).
skills_files = iter_skills_files()
for entry in skills_files:
for entry in iter_skills_files():
cred_mounts.append(
_modal.Mount.from_local_file(
entry["host_path"],
remote_path=entry["container_path"],
)
)
if skills_files:
logger.info("Modal: mounting %d skill files", len(skills_files))
# Mount host-side cache files (documents, images, audio,
# screenshots). New files arriving mid-session are picked up
# by _sync_files() before each command execution.
cache_files = iter_cache_files()
for entry in cache_files:
cred_mounts.append(
@ -228,8 +208,6 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
remote_path=entry["container_path"],
)
)
if cache_files:
logger.info("Modal: mounting %d cache files", len(cache_files))
except Exception as e:
logger.debug("Modal: could not load credential file mounts: %s", e)
@ -243,8 +221,7 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
existing_mounts.extend(cred_mounts)
create_kwargs["mounts"] = existing_mounts
sandbox = await _modal.Sandbox.create.aio(
"sleep",
"infinity",
"sleep", "infinity",
image=image_spec,
app=app,
timeout=int(create_kwargs.pop("timeout", 3600)),
@ -255,57 +232,41 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
try:
target_image_spec = restored_snapshot_id or image
try:
# _resolve_modal_image keeps the Modal bootstrap fix together:
# it applies setup_dockerfile_commands with ensurepip before
# Modal builds registry images, while snapshot ids restore via
# modal.Image.from_id() without rebuilding.
effective_image = _resolve_modal_image(target_image_spec)
self._app, self._sandbox = self._worker.run_coroutine(
_create_sandbox(effective_image),
timeout=300,
_create_sandbox(effective_image), timeout=300,
)
except Exception as exc:
if not restored_snapshot_id:
raise
logger.warning(
"Modal: failed to restore snapshot %s, retrying with base image: %s",
restored_snapshot_id[:20],
exc,
restored_snapshot_id[:20], exc,
)
_delete_direct_snapshot(self._task_id, restored_snapshot_id)
base_image = _resolve_modal_image(image)
self._app, self._sandbox = self._worker.run_coroutine(
_create_sandbox(base_image),
timeout=300,
_create_sandbox(base_image), timeout=300,
)
else:
if restored_snapshot_id and restored_from_legacy_key:
_store_direct_snapshot(self._task_id, restored_snapshot_id)
logger.info(
"Modal: migrated legacy snapshot entry for task %s",
self._task_id,
)
except Exception:
self._worker.stop()
raise
logger.info("Modal: sandbox created (task=%s)", self._task_id)
self.init_session()
def _push_file_to_sandbox(self, host_path: str, container_path: str) -> bool:
"""Push a single file into the sandbox if changed. Returns True if synced."""
hp = Path(host_path)
try:
stat = hp.stat()
file_key = (stat.st_mtime, stat.st_size)
except OSError:
"""Push a single file into the sandbox if changed."""
file_key = _file_mtime_key(host_path)
if file_key is None:
return False
if self._synced_files.get(container_path) == file_key:
return False
try:
content = hp.read_bytes()
content = Path(host_path).read_bytes()
except Exception:
return False
@ -326,85 +287,55 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
return True
def _sync_files(self) -> None:
"""Push credential, skill, and cache files into the running sandbox.
Runs before each command. Uses mtime+size caching so only changed
files are pushed (~13μs overhead in the no-op case). Cache files
are especially important here — new uploads/screenshots may appear
mid-session after sandbox creation.
"""
"""Push credential, skill, and cache files into the running sandbox."""
try:
from tools.credential_files import (
get_credential_file_mounts,
iter_skills_files,
iter_cache_files,
)
for entry in get_credential_file_mounts():
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
logger.debug("Modal: synced credential %s", entry["container_path"])
self._push_file_to_sandbox(entry["host_path"], entry["container_path"])
for entry in iter_skills_files():
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
logger.debug("Modal: synced skill file %s", entry["container_path"])
self._push_file_to_sandbox(entry["host_path"], entry["container_path"])
for entry in iter_cache_files():
if self._push_file_to_sandbox(entry["host_path"], entry["container_path"]):
logger.debug("Modal: synced cache file %s", entry["container_path"])
self._push_file_to_sandbox(entry["host_path"], entry["container_path"])
except Exception as e:
logger.debug("Modal: file sync failed: %s", e)
def _before_execute(self) -> None:
self._sync_files()
def _run_bash(self, cmd_string: str, *, login: bool = False,
timeout: int = 120,
stdin_data: str | None = None):
"""Return a _ThreadedProcessHandle wrapping an async Modal sandbox exec."""
sandbox = self._sandbox
worker = self._worker
def _start_modal_exec(self, prepared: PreparedModalExec) -> ModalExecStart:
full_command = f"cd {shlex.quote(prepared.cwd)} && {prepared.command}"
result_holder = {"value": None, "error": None}
def cancel():
worker.run_coroutine(sandbox.terminate.aio(), timeout=15)
def _run():
try:
async def _do_execute():
process = await self._sandbox.exec.aio(
"bash",
"-c",
full_command,
timeout=prepared.timeout,
)
stdout = await process.stdout.read.aio()
stderr = await process.stderr.read.aio()
exit_code = await process.wait.aio()
if isinstance(stdout, bytes):
stdout = stdout.decode("utf-8", errors="replace")
if isinstance(stderr, bytes):
stderr = stderr.decode("utf-8", errors="replace")
output = stdout
if stderr:
output = f"{stdout}\n{stderr}" if stdout else stderr
return self._result(output, exit_code)
def exec_fn() -> tuple[str, int]:
async def _do():
args = ["bash"]
if login:
args.extend(["-l", "-c", cmd_string])
else:
args.extend(["-c", cmd_string])
process = await sandbox.exec.aio(*args, timeout=timeout)
stdout = await process.stdout.read.aio()
stderr = await process.stderr.read.aio()
exit_code = await process.wait.aio()
if isinstance(stdout, bytes):
stdout = stdout.decode("utf-8", errors="replace")
if isinstance(stderr, bytes):
stderr = stderr.decode("utf-8", errors="replace")
output = stdout
if stderr:
output = f"{stdout}\n{stderr}" if stdout else stderr
return output, exit_code
result_holder["value"] = self._worker.run_coroutine(
_do_execute(),
timeout=prepared.timeout + 30,
)
except Exception as e:
result_holder["error"] = e
return worker.run_coroutine(_do(), timeout=timeout + 30)
t = threading.Thread(target=_run, daemon=True)
t.start()
return ModalExecStart(handle=_DirectModalExecHandle(thread=t, result_holder=result_holder))
def _poll_modal_exec(self, handle: _DirectModalExecHandle) -> dict | None:
if handle.thread.is_alive():
return None
if handle.result_holder["error"]:
return self._error_result(f"Modal execution error: {handle.result_holder['error']}")
return handle.result_holder["value"]
def _cancel_modal_exec(self, handle: _DirectModalExecHandle) -> None:
self._worker.run_coroutine(
self._sandbox.terminate.aio(),
timeout=15,
)
return _ThreadedProcessHandle(exec_fn, cancel_fn=cancel)
def cleanup(self):
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
@ -426,17 +357,13 @@ class ModalEnvironment(BaseModalExecutionEnvironment):
_store_direct_snapshot(self._task_id, snapshot_id)
logger.info(
"Modal: saved filesystem snapshot %s for task %s",
snapshot_id[:20],
self._task_id,
snapshot_id[:20], self._task_id,
)
except Exception as e:
logger.warning("Modal: filesystem snapshot failed: %s", e)
try:
self._worker.run_coroutine(
self._sandbox.terminate.aio(),
timeout=15,
)
self._worker.run_coroutine(self._sandbox.terminate.aio(), timeout=15)
except Exception:
pass
finally: