"""Singularity/Apptainer persistent container environment. Security-hardened with --containall, --no-home, capability dropping. Supports configurable resource limits and optional filesystem persistence via writable overlay directories that survive across sessions. """ import logging import os import shutil import subprocess import threading import uuid from pathlib import Path from typing import Optional from hermes_constants import get_hermes_home from tools.environments.base import ( BaseEnvironment, _load_json_store, _popen_bash, _save_json_store, ) logger = logging.getLogger(__name__) _SNAPSHOT_STORE = get_hermes_home() / "singularity_snapshots.json" def _find_singularity_executable() -> str: """Locate the apptainer or singularity CLI binary.""" if shutil.which("apptainer"): return "apptainer" if shutil.which("singularity"): return "singularity" raise RuntimeError( "Neither 'apptainer' nor 'singularity' was found in PATH. " "Install Apptainer (https://apptainer.org/docs/admin/main/installation.html) " "or Singularity and ensure the CLI is available." ) def _ensure_singularity_available() -> str: """Preflight check: resolve the executable and verify it responds.""" exe = _find_singularity_executable() try: result = subprocess.run( [exe, "version"], capture_output=True, text=True, timeout=10, ) except FileNotFoundError: raise RuntimeError( f"Singularity backend selected but '{exe}' could not be executed." ) except subprocess.TimeoutExpired: raise RuntimeError(f"'{exe} version' timed out.") if result.returncode != 0: stderr = result.stderr.strip()[:200] raise RuntimeError(f"'{exe} version' failed (exit code {result.returncode}): {stderr}") return exe def _load_snapshots() -> dict: return _load_json_store(_SNAPSHOT_STORE) def _save_snapshots(data: dict) -> None: _save_json_store(_SNAPSHOT_STORE, data) def _get_scratch_dir() -> Path: custom_scratch = os.getenv("TERMINAL_SCRATCH_DIR") if custom_scratch: scratch_path = Path(custom_scratch) scratch_path.mkdir(parents=True, exist_ok=True) return scratch_path from tools.environments.base import get_sandbox_dir sandbox = get_sandbox_dir() / "singularity" scratch = Path("/scratch") if scratch.exists() and os.access(scratch, os.W_OK): user_scratch = scratch / os.getenv("USER", "hermes") / "hermes-agent" user_scratch.mkdir(parents=True, exist_ok=True) logger.info("Using /scratch for sandboxes: %s", user_scratch) return user_scratch sandbox.mkdir(parents=True, exist_ok=True) return sandbox def _get_apptainer_cache_dir() -> Path: cache_dir = os.getenv("APPTAINER_CACHEDIR") if cache_dir: cache_path = Path(cache_dir) cache_path.mkdir(parents=True, exist_ok=True) return cache_path scratch = _get_scratch_dir() cache_path = scratch / ".apptainer" cache_path.mkdir(parents=True, exist_ok=True) return cache_path _sif_build_lock = threading.Lock() def _get_or_build_sif(image: str, executable: str = "apptainer") -> str: if image.endswith('.sif') and Path(image).exists(): return image if not image.startswith('docker://'): return image image_name = image.replace('docker://', '').replace('/', '-').replace(':', '-') cache_dir = _get_apptainer_cache_dir() sif_path = cache_dir / f"{image_name}.sif" if sif_path.exists(): return str(sif_path) with _sif_build_lock: if sif_path.exists(): return str(sif_path) logger.info("Building SIF image (one-time setup)...") logger.info(" Source: %s", image) logger.info(" Target: %s", sif_path) tmp_dir = cache_dir / "tmp" tmp_dir.mkdir(parents=True, exist_ok=True) env = os.environ.copy() env["APPTAINER_TMPDIR"] = str(tmp_dir) env["APPTAINER_CACHEDIR"] = str(cache_dir) try: result = subprocess.run( [executable, "build", str(sif_path), image], capture_output=True, text=True, timeout=600, env=env, ) if result.returncode != 0: logger.warning("SIF build failed, falling back to docker:// URL") logger.warning(" Error: %s", result.stderr[:500]) return image logger.info("SIF image built successfully") return str(sif_path) except subprocess.TimeoutExpired: logger.warning("SIF build timed out, falling back to docker:// URL") if sif_path.exists(): sif_path.unlink() return image except Exception as e: logger.warning("SIF build error: %s, falling back to docker:// URL", e) return image class SingularityEnvironment(BaseEnvironment): """Hardened Singularity/Apptainer container with resource limits and persistence. Spawn-per-call: every execute() spawns a fresh ``apptainer exec ... bash -c`` process. Session snapshot preserves env vars across calls. CWD persists via in-band stdout markers. """ def __init__( self, image: str, cwd: str = "~", timeout: int = 60, cpu: float = 0, memory: int = 0, disk: int = 0, persistent_filesystem: bool = False, task_id: str = "default", ): super().__init__(cwd=cwd, timeout=timeout) self.executable = _ensure_singularity_available() self.image = _get_or_build_sif(image, self.executable) self.instance_id = f"hermes_{uuid.uuid4().hex[:12]}" self._instance_started = False self._persistent = persistent_filesystem self._task_id = task_id self._overlay_dir: Optional[Path] = None self._cpu = cpu self._memory = memory if self._persistent: overlay_base = _get_scratch_dir() / "hermes-overlays" overlay_base.mkdir(parents=True, exist_ok=True) self._overlay_dir = overlay_base / f"overlay-{task_id}" self._overlay_dir.mkdir(parents=True, exist_ok=True) self._start_instance() self.init_session() def _start_instance(self): cmd = [self.executable, "instance", "start"] cmd.extend(["--containall", "--no-home"]) if self._persistent and self._overlay_dir: cmd.extend(["--overlay", str(self._overlay_dir)]) else: cmd.append("--writable-tmpfs") try: from tools.credential_files import get_credential_file_mounts, get_skills_directory_mount for mount_entry in get_credential_file_mounts(): cmd.extend(["--bind", f"{mount_entry['host_path']}:{mount_entry['container_path']}:ro"]) for skills_mount in get_skills_directory_mount(): cmd.extend(["--bind", f"{skills_mount['host_path']}:{skills_mount['container_path']}:ro"]) except Exception as e: logger.debug("Singularity: could not load credential/skills mounts: %s", e) if self._memory > 0: cmd.extend(["--memory", f"{self._memory}M"]) if self._cpu > 0: cmd.extend(["--cpus", str(self._cpu)]) cmd.extend([str(self.image), self.instance_id]) try: result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) if result.returncode != 0: raise RuntimeError(f"Failed to start instance: {result.stderr}") self._instance_started = True logger.info("Singularity instance %s started (persistent=%s)", self.instance_id, self._persistent) except subprocess.TimeoutExpired: raise RuntimeError("Instance start timed out") def _run_bash(self, cmd_string: str, *, login: bool = False, timeout: int = 120, stdin_data: str | None = None) -> subprocess.Popen: """Spawn a bash process inside the Singularity instance.""" if not self._instance_started: raise RuntimeError("Singularity instance not started") cmd = [self.executable, "exec", f"instance://{self.instance_id}"] if login: cmd.extend(["bash", "-l", "-c", cmd_string]) else: cmd.extend(["bash", "-c", cmd_string]) return _popen_bash(cmd, stdin_data) def cleanup(self): """Stop the instance. If persistent, the overlay dir survives.""" if self._instance_started: try: subprocess.run( [self.executable, "instance", "stop", self.instance_id], capture_output=True, text=True, timeout=30, ) logger.info("Singularity instance %s stopped", self.instance_id) except Exception as e: logger.warning("Failed to stop Singularity instance %s: %s", self.instance_id, e) self._instance_started = False if self._persistent and self._overlay_dir: snapshots = _load_snapshots() snapshots[self._task_id] = str(self._overlay_dir) _save_snapshots(snapshots)