mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 01:21:43 +00:00
feat: devex help, add Makefile, ruff, pre-commit, and modernize CI
This commit is contained in:
parent
172a38c344
commit
f4d7e6a29e
111 changed files with 11655 additions and 10200 deletions
|
|
@ -1,8 +1,8 @@
|
|||
"""Base class for all Hermes execution environment backends."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import os
|
||||
import subprocess
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
|
@ -34,9 +34,9 @@ class BaseEnvironment(ABC):
|
|||
self.env = env or {}
|
||||
|
||||
@abstractmethod
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
def execute(
|
||||
self, command: str, cwd: str = "", *, timeout: int | None = None, stdin_data: str | None = None
|
||||
) -> dict:
|
||||
"""Execute a command, return {"output": str, "returncode": int}."""
|
||||
...
|
||||
|
||||
|
|
@ -62,10 +62,10 @@ class BaseEnvironment(ABC):
|
|||
def _prepare_command(self, command: str) -> str:
|
||||
"""Transform sudo commands if SUDO_PASSWORD is available."""
|
||||
from tools.terminal_tool import _transform_sudo_command
|
||||
|
||||
return _transform_sudo_command(command)
|
||||
|
||||
def _build_run_kwargs(self, timeout: int | None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
def _build_run_kwargs(self, timeout: int | None, stdin_data: str | None = None) -> dict:
|
||||
"""Build common subprocess.run kwargs for non-interactive execution."""
|
||||
kw = {
|
||||
"text": True,
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ import shlex
|
|||
import threading
|
||||
import uuid
|
||||
import warnings
|
||||
from typing import Optional
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
|
@ -32,8 +31,8 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
cwd: str = "/home/daytona",
|
||||
timeout: int = 60,
|
||||
cpu: int = 1,
|
||||
memory: int = 5120, # MB (hermes convention)
|
||||
disk: int = 10240, # MB (Daytona platform max is 10GB)
|
||||
memory: int = 5120, # MB (hermes convention)
|
||||
disk: int = 10240, # MB (Daytona platform max is 10GB)
|
||||
persistent_filesystem: bool = True,
|
||||
task_id: str = "default",
|
||||
):
|
||||
|
|
@ -41,8 +40,8 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
super().__init__(cwd=cwd, timeout=timeout)
|
||||
|
||||
from daytona import (
|
||||
Daytona,
|
||||
CreateSandboxFromImageParams,
|
||||
Daytona,
|
||||
DaytonaError,
|
||||
Resources,
|
||||
SandboxState,
|
||||
|
|
@ -73,13 +72,11 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
try:
|
||||
self._sandbox = self._daytona.find_one(labels=labels)
|
||||
self._sandbox.start()
|
||||
logger.info("Daytona: resumed sandbox %s for task %s",
|
||||
self._sandbox.id, task_id)
|
||||
logger.info("Daytona: resumed sandbox %s for task %s", self._sandbox.id, task_id)
|
||||
except DaytonaError:
|
||||
self._sandbox = None
|
||||
except Exception as e:
|
||||
logger.warning("Daytona: failed to resume sandbox for task %s: %s",
|
||||
task_id, e)
|
||||
logger.warning("Daytona: failed to resume sandbox for task %s: %s", task_id, e)
|
||||
self._sandbox = None
|
||||
|
||||
# Create a fresh sandbox if we don't have one
|
||||
|
|
@ -92,8 +89,7 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
resources=resources,
|
||||
)
|
||||
)
|
||||
logger.info("Daytona: created sandbox %s for task %s",
|
||||
self._sandbox.id, task_id)
|
||||
logger.info("Daytona: created sandbox %s for task %s", self._sandbox.id, task_id)
|
||||
|
||||
# Resolve cwd: detect actual home dir inside the sandbox
|
||||
if self._requested_cwd in ("~", "/home/daytona"):
|
||||
|
|
@ -112,7 +108,7 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
self._sandbox.start()
|
||||
logger.info("Daytona: restarted sandbox %s", self._sandbox.id)
|
||||
|
||||
def _exec_in_thread(self, exec_command: str, cwd: Optional[str], timeout: int) -> dict:
|
||||
def _exec_in_thread(self, exec_command: str, cwd: str | None, timeout: int) -> dict:
|
||||
"""Run exec in a background thread with interrupt polling.
|
||||
|
||||
The Daytona SDK's exec(timeout=...) parameter is unreliable (the
|
||||
|
|
@ -130,7 +126,8 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
def _run():
|
||||
try:
|
||||
response = self._sandbox.process.exec(
|
||||
timed_command, cwd=cwd,
|
||||
timed_command,
|
||||
cwd=cwd,
|
||||
)
|
||||
result_holder["value"] = {
|
||||
"output": response.result or "",
|
||||
|
|
@ -169,9 +166,9 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
return {"error": result_holder["error"]}
|
||||
return result_holder["value"]
|
||||
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: Optional[int] = None,
|
||||
stdin_data: Optional[str] = None) -> dict:
|
||||
def execute(
|
||||
self, command: str, cwd: str = "", *, timeout: int | None = None, stdin_data: str | None = None
|
||||
) -> dict:
|
||||
with self._lock:
|
||||
self._ensure_sandbox_ready()
|
||||
|
||||
|
|
@ -189,6 +186,7 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
|
||||
if "error" in result:
|
||||
from daytona import DaytonaError
|
||||
|
||||
err = result["error"]
|
||||
if isinstance(err, DaytonaError):
|
||||
with self._lock:
|
||||
|
|
@ -210,8 +208,7 @@ class DaytonaEnvironment(BaseEnvironment):
|
|||
try:
|
||||
if self._persistent:
|
||||
self._sandbox.stop()
|
||||
logger.info("Daytona: stopped sandbox %s (filesystem preserved)",
|
||||
self._sandbox.id)
|
||||
logger.info("Daytona: stopped sandbox %s (filesystem preserved)", self._sandbox.id)
|
||||
else:
|
||||
self._daytona.delete(self._sandbox)
|
||||
logger.info("Daytona: deleted sandbox %s", self._sandbox.id)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ import subprocess
|
|||
import sys
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
|
@ -19,7 +18,6 @@ from tools.interrupt import is_interrupted
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
# Security flags applied to every container.
|
||||
# The container itself is the security boundary (isolated from host).
|
||||
# We drop all capabilities then add back the minimum needed:
|
||||
|
|
@ -28,19 +26,28 @@ logger = logging.getLogger(__name__)
|
|||
# Block privilege escalation and limit PIDs.
|
||||
# /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds).
|
||||
_SECURITY_ARGS = [
|
||||
"--cap-drop", "ALL",
|
||||
"--cap-add", "DAC_OVERRIDE",
|
||||
"--cap-add", "CHOWN",
|
||||
"--cap-add", "FOWNER",
|
||||
"--security-opt", "no-new-privileges",
|
||||
"--pids-limit", "256",
|
||||
"--tmpfs", "/tmp:rw,nosuid,size=512m",
|
||||
"--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m",
|
||||
"--tmpfs", "/run:rw,noexec,nosuid,size=64m",
|
||||
"--cap-drop",
|
||||
"ALL",
|
||||
"--cap-add",
|
||||
"DAC_OVERRIDE",
|
||||
"--cap-add",
|
||||
"CHOWN",
|
||||
"--cap-add",
|
||||
"FOWNER",
|
||||
"--security-opt",
|
||||
"no-new-privileges",
|
||||
"--pids-limit",
|
||||
"256",
|
||||
"--tmpfs",
|
||||
"/tmp:rw,nosuid,size=512m",
|
||||
"--tmpfs",
|
||||
"/var/tmp:rw,noexec,nosuid,size=256m",
|
||||
"--tmpfs",
|
||||
"/run:rw,noexec,nosuid,size=64m",
|
||||
]
|
||||
|
||||
|
||||
_storage_opt_ok: Optional[bool] = None # cached result across instances
|
||||
_storage_opt_ok: bool | None = None # cached result across instances
|
||||
|
||||
|
||||
class DockerEnvironment(BaseEnvironment):
|
||||
|
|
@ -74,7 +81,7 @@ class DockerEnvironment(BaseEnvironment):
|
|||
self._base_image = image
|
||||
self._persistent = persistent_filesystem
|
||||
self._task_id = task_id
|
||||
self._container_id: Optional[str] = None
|
||||
self._container_id: str | None = None
|
||||
logger.info(f"DockerEnvironment volumes: {volumes}")
|
||||
# Ensure volumes is a list (config.yaml could be malformed)
|
||||
if volumes is not None and not isinstance(volumes, list):
|
||||
|
|
@ -105,8 +112,8 @@ class DockerEnvironment(BaseEnvironment):
|
|||
# mode uses tmpfs (ephemeral, fast, gone on cleanup).
|
||||
from tools.environments.base import get_sandbox_dir
|
||||
|
||||
self._workspace_dir: Optional[str] = None
|
||||
self._home_dir: Optional[str] = None
|
||||
self._workspace_dir: str | None = None
|
||||
self._home_dir: str | None = None
|
||||
if self._persistent:
|
||||
sandbox = get_sandbox_dir() / "docker" / task_id
|
||||
self._workspace_dir = str(sandbox / "workspace")
|
||||
|
|
@ -114,14 +121,19 @@ class DockerEnvironment(BaseEnvironment):
|
|||
os.makedirs(self._workspace_dir, exist_ok=True)
|
||||
os.makedirs(self._home_dir, exist_ok=True)
|
||||
writable_args = [
|
||||
"-v", f"{self._workspace_dir}:/workspace",
|
||||
"-v", f"{self._home_dir}:/root",
|
||||
"-v",
|
||||
f"{self._workspace_dir}:/workspace",
|
||||
"-v",
|
||||
f"{self._home_dir}:/root",
|
||||
]
|
||||
else:
|
||||
writable_args = [
|
||||
"--tmpfs", "/workspace:rw,exec,size=10g",
|
||||
"--tmpfs", "/home:rw,exec,size=1g",
|
||||
"--tmpfs", "/root:rw,exec,size=1g",
|
||||
"--tmpfs",
|
||||
"/workspace:rw,exec,size=10g",
|
||||
"--tmpfs",
|
||||
"/home:rw,exec,size=1g",
|
||||
"--tmpfs",
|
||||
"/root:rw,exec,size=1g",
|
||||
]
|
||||
|
||||
# All containers get security hardening (capabilities dropped, no privilege
|
||||
|
|
@ -129,7 +141,7 @@ class DockerEnvironment(BaseEnvironment):
|
|||
# can install packages as needed.
|
||||
# User-configured volume mounts (from config.yaml docker_volumes)
|
||||
volume_args = []
|
||||
for vol in (volumes or []):
|
||||
for vol in volumes or []:
|
||||
if not isinstance(vol, str):
|
||||
logger.warning(f"Docker volume entry is not a string: {vol!r}")
|
||||
continue
|
||||
|
|
@ -146,7 +158,9 @@ class DockerEnvironment(BaseEnvironment):
|
|||
logger.info(f"Docker run_args: {all_run_args}")
|
||||
|
||||
self._inner = _Docker(
|
||||
image=image, cwd=cwd, timeout=timeout,
|
||||
image=image,
|
||||
cwd=cwd,
|
||||
timeout=timeout,
|
||||
run_args=all_run_args,
|
||||
)
|
||||
self._container_id = self._inner.container_id
|
||||
|
|
@ -154,7 +168,7 @@ class DockerEnvironment(BaseEnvironment):
|
|||
@staticmethod
|
||||
def _storage_opt_supported() -> bool:
|
||||
"""Check if Docker's storage driver supports --storage-opt size=.
|
||||
|
||||
|
||||
Only overlay2 on XFS with pquota supports per-container disk quotas.
|
||||
Ubuntu (and most distros) default to ext4, where this flag errors out.
|
||||
"""
|
||||
|
|
@ -164,7 +178,9 @@ class DockerEnvironment(BaseEnvironment):
|
|||
try:
|
||||
result = subprocess.run(
|
||||
["docker", "info", "--format", "{{.Driver}}"],
|
||||
capture_output=True, text=True, timeout=10,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
driver = result.stdout.strip().lower()
|
||||
if driver != "overlay2":
|
||||
|
|
@ -174,14 +190,15 @@ class DockerEnvironment(BaseEnvironment):
|
|||
# Probe by attempting a dry-ish run — the fastest reliable check.
|
||||
probe = subprocess.run(
|
||||
["docker", "create", "--storage-opt", "size=1m", "hello-world"],
|
||||
capture_output=True, text=True, timeout=15,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=15,
|
||||
)
|
||||
if probe.returncode == 0:
|
||||
# Clean up the created container
|
||||
container_id = probe.stdout.strip()
|
||||
if container_id:
|
||||
subprocess.run(["docker", "rm", container_id],
|
||||
capture_output=True, timeout=5)
|
||||
subprocess.run(["docker", "rm", container_id], capture_output=True, timeout=5)
|
||||
_storage_opt_ok = True
|
||||
else:
|
||||
_storage_opt_ok = False
|
||||
|
|
@ -190,9 +207,9 @@ class DockerEnvironment(BaseEnvironment):
|
|||
logger.debug("Docker --storage-opt support: %s", _storage_opt_ok)
|
||||
return _storage_opt_ok
|
||||
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
def execute(
|
||||
self, command: str, cwd: str = "", *, timeout: int | None = None, stdin_data: str | None = None
|
||||
) -> dict:
|
||||
exec_command = self._prepare_command(command)
|
||||
work_dir = cwd or self.cwd
|
||||
effective_timeout = timeout or self.timeout
|
||||
|
|
@ -218,7 +235,8 @@ class DockerEnvironment(BaseEnvironment):
|
|||
_output_chunks = []
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdin=subprocess.PIPE if stdin_data else subprocess.DEVNULL,
|
||||
text=True,
|
||||
)
|
||||
|
|
@ -269,6 +287,7 @@ class DockerEnvironment(BaseEnvironment):
|
|||
|
||||
if not self._persistent:
|
||||
import shutil
|
||||
|
||||
for d in (self._workspace_dir, self._home_dir):
|
||||
if d:
|
||||
shutil.rmtree(d, ignore_errors=True)
|
||||
|
|
|
|||
|
|
@ -154,9 +154,9 @@ class LocalEnvironment(BaseEnvironment):
|
|||
def __init__(self, cwd: str = "", timeout: int = 60, env: dict = None):
|
||||
super().__init__(cwd=cwd or os.getcwd(), timeout=timeout, env=env)
|
||||
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
def execute(
|
||||
self, command: str, cwd: str = "", *, timeout: int | None = None, stdin_data: str | None = None
|
||||
) -> dict:
|
||||
from tools.terminal_tool import _interrupt_event
|
||||
|
||||
work_dir = cwd or self.cwd or os.getcwd()
|
||||
|
|
@ -172,11 +172,7 @@ class LocalEnvironment(BaseEnvironment):
|
|||
# Wrap with output fences so we can later extract the real
|
||||
# command output and discard shell init/exit noise.
|
||||
fenced_cmd = (
|
||||
f"printf '{_OUTPUT_FENCE}';"
|
||||
f" {exec_command};"
|
||||
f" __hermes_rc=$?;"
|
||||
f" printf '{_OUTPUT_FENCE}';"
|
||||
f" exit $__hermes_rc"
|
||||
f"printf '{_OUTPUT_FENCE}'; {exec_command}; __hermes_rc=$?; printf '{_OUTPUT_FENCE}'; exit $__hermes_rc"
|
||||
)
|
||||
# Ensure PATH always includes standard dirs — systemd services
|
||||
# and some terminal multiplexers inherit a minimal PATH.
|
||||
|
|
@ -200,12 +196,14 @@ class LocalEnvironment(BaseEnvironment):
|
|||
)
|
||||
|
||||
if stdin_data is not None:
|
||||
|
||||
def _write_stdin():
|
||||
try:
|
||||
proc.stdin.write(stdin_data)
|
||||
proc.stdin.close()
|
||||
except (BrokenPipeError, OSError):
|
||||
pass
|
||||
|
||||
threading.Thread(target=_write_stdin, daemon=True).start()
|
||||
|
||||
_output_chunks: list[str] = []
|
||||
|
|
|
|||
|
|
@ -8,10 +8,9 @@ project files, and config changes survive across sessions.
|
|||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Any
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
|
@ -21,7 +20,7 @@ logger = logging.getLogger(__name__)
|
|||
_SNAPSHOT_STORE = Path.home() / ".hermes" / "modal_snapshots.json"
|
||||
|
||||
|
||||
def _load_snapshots() -> Dict[str, str]:
|
||||
def _load_snapshots() -> dict[str, str]:
|
||||
"""Load snapshot ID mapping from disk."""
|
||||
if _SNAPSHOT_STORE.exists():
|
||||
try:
|
||||
|
|
@ -31,7 +30,7 @@ def _load_snapshots() -> Dict[str, str]:
|
|||
return {}
|
||||
|
||||
|
||||
def _save_snapshots(data: Dict[str, str]) -> None:
|
||||
def _save_snapshots(data: dict[str, str]) -> None:
|
||||
"""Persist snapshot ID mapping to disk."""
|
||||
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
|
||||
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
|
||||
|
|
@ -52,7 +51,7 @@ class ModalEnvironment(BaseEnvironment):
|
|||
image: str,
|
||||
cwd: str = "~",
|
||||
timeout: int = 60,
|
||||
modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
|
||||
modal_sandbox_kwargs: dict[str, Any] | None = None,
|
||||
persistent_filesystem: bool = True,
|
||||
task_id: str = "default",
|
||||
):
|
||||
|
|
@ -61,6 +60,7 @@ class ModalEnvironment(BaseEnvironment):
|
|||
if not ModalEnvironment._patches_applied:
|
||||
try:
|
||||
from environments.patches import apply_patches
|
||||
|
||||
apply_patches()
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
@ -79,6 +79,7 @@ class ModalEnvironment(BaseEnvironment):
|
|||
if snapshot_id:
|
||||
try:
|
||||
import modal
|
||||
|
||||
restored_image = modal.Image.from_id(snapshot_id)
|
||||
logger.info("Modal: restoring from snapshot %s", snapshot_id[:20])
|
||||
except Exception as e:
|
||||
|
|
@ -88,6 +89,7 @@ class ModalEnvironment(BaseEnvironment):
|
|||
effective_image = restored_image if restored_image else image
|
||||
|
||||
from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment
|
||||
|
||||
self._inner = SwerexModalEnvironment(
|
||||
image=effective_image,
|
||||
cwd=cwd,
|
||||
|
|
@ -97,9 +99,9 @@ class ModalEnvironment(BaseEnvironment):
|
|||
modal_sandbox_kwargs=sandbox_kwargs,
|
||||
)
|
||||
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
def execute(
|
||||
self, command: str, cwd: str = "", *, timeout: int | None = None, stdin_data: str | None = None
|
||||
) -> dict:
|
||||
if stdin_data is not None:
|
||||
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
|
||||
while marker in stdin_data:
|
||||
|
|
@ -139,29 +141,29 @@ class ModalEnvironment(BaseEnvironment):
|
|||
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
|
||||
if self._persistent:
|
||||
try:
|
||||
sandbox = getattr(self._inner, 'deployment', None)
|
||||
sandbox = getattr(sandbox, '_sandbox', None) if sandbox else None
|
||||
sandbox = getattr(self._inner, "deployment", None)
|
||||
sandbox = getattr(sandbox, "_sandbox", None) if sandbox else None
|
||||
if sandbox:
|
||||
import asyncio
|
||||
|
||||
async def _snapshot():
|
||||
img = await sandbox.snapshot_filesystem.aio()
|
||||
return img.object_id
|
||||
|
||||
try:
|
||||
snapshot_id = asyncio.run(_snapshot())
|
||||
except RuntimeError:
|
||||
import concurrent.futures
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
||||
snapshot_id = pool.submit(
|
||||
asyncio.run, _snapshot()
|
||||
).result(timeout=60)
|
||||
snapshot_id = pool.submit(asyncio.run, _snapshot()).result(timeout=60)
|
||||
|
||||
snapshots = _load_snapshots()
|
||||
snapshots[self._task_id] = snapshot_id
|
||||
_save_snapshots(snapshots)
|
||||
logger.info("Modal: saved filesystem snapshot %s for task %s",
|
||||
snapshot_id[:20], self._task_id)
|
||||
logger.info("Modal: saved filesystem snapshot %s for task %s", snapshot_id[:20], self._task_id)
|
||||
except Exception as e:
|
||||
logger.warning("Modal: filesystem snapshot failed: %s", e)
|
||||
|
||||
if hasattr(self._inner, 'stop'):
|
||||
if hasattr(self._inner, "stop"):
|
||||
self._inner.stop()
|
||||
|
|
|
|||
|
|
@ -10,11 +10,9 @@ import logging
|
|||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
|
@ -24,7 +22,7 @@ logger = logging.getLogger(__name__)
|
|||
_SNAPSHOT_STORE = Path.home() / ".hermes" / "singularity_snapshots.json"
|
||||
|
||||
|
||||
def _load_snapshots() -> Dict[str, str]:
|
||||
def _load_snapshots() -> dict[str, str]:
|
||||
if _SNAPSHOT_STORE.exists():
|
||||
try:
|
||||
return json.loads(_SNAPSHOT_STORE.read_text())
|
||||
|
|
@ -33,7 +31,7 @@ def _load_snapshots() -> Dict[str, str]:
|
|||
return {}
|
||||
|
||||
|
||||
def _save_snapshots(data: Dict[str, str]) -> None:
|
||||
def _save_snapshots(data: dict[str, str]) -> None:
|
||||
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
|
||||
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
|
||||
|
||||
|
|
@ -42,6 +40,7 @@ def _save_snapshots(data: Dict[str, str]) -> None:
|
|||
# Singularity helpers (scratch dir, SIF cache, SIF building)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _get_scratch_dir() -> Path:
|
||||
"""Get the best directory for Singularity sandboxes.
|
||||
|
||||
|
|
@ -58,6 +57,7 @@ def _get_scratch_dir() -> Path:
|
|||
return scratch_path
|
||||
|
||||
from tools.environments.base import get_sandbox_dir
|
||||
|
||||
sandbox = get_sandbox_dir() / "singularity"
|
||||
|
||||
scratch = Path("/scratch")
|
||||
|
|
@ -93,12 +93,12 @@ def _get_or_build_sif(image: str, executable: str = "apptainer") -> str:
|
|||
Returns the path unchanged if it's already a .sif file.
|
||||
For docker:// URLs, checks the cache and builds if needed.
|
||||
"""
|
||||
if image.endswith('.sif') and Path(image).exists():
|
||||
if image.endswith(".sif") and Path(image).exists():
|
||||
return image
|
||||
if not image.startswith('docker://'):
|
||||
if not image.startswith("docker://"):
|
||||
return image
|
||||
|
||||
image_name = image.replace('docker://', '').replace('/', '-').replace(':', '-')
|
||||
image_name = image.replace("docker://", "").replace("/", "-").replace(":", "-")
|
||||
cache_dir = _get_apptainer_cache_dir()
|
||||
sif_path = cache_dir / f"{image_name}.sif"
|
||||
|
||||
|
|
@ -123,7 +123,10 @@ def _get_or_build_sif(image: str, executable: str = "apptainer") -> str:
|
|||
try:
|
||||
result = subprocess.run(
|
||||
[executable, "build", str(sif_path), image],
|
||||
capture_output=True, text=True, timeout=600, env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600,
|
||||
env=env,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
logger.warning("SIF build failed, falling back to docker:// URL")
|
||||
|
|
@ -145,6 +148,7 @@ def _get_or_build_sif(image: str, executable: str = "apptainer") -> str:
|
|||
# SingularityEnvironment
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
|
||||
class SingularityEnvironment(BaseEnvironment):
|
||||
"""Hardened Singularity/Apptainer container with resource limits and persistence.
|
||||
|
||||
|
|
@ -174,7 +178,7 @@ class SingularityEnvironment(BaseEnvironment):
|
|||
self._instance_started = False
|
||||
self._persistent = persistent_filesystem
|
||||
self._task_id = task_id
|
||||
self._overlay_dir: Optional[Path] = None
|
||||
self._overlay_dir: Path | None = None
|
||||
|
||||
# Resource limits
|
||||
self._cpu = cpu
|
||||
|
|
@ -215,14 +219,13 @@ class SingularityEnvironment(BaseEnvironment):
|
|||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Failed to start instance: {result.stderr}")
|
||||
self._instance_started = True
|
||||
logger.info("Singularity instance %s started (persistent=%s)",
|
||||
self.instance_id, self._persistent)
|
||||
logger.info("Singularity instance %s started (persistent=%s)", self.instance_id, self._persistent)
|
||||
except subprocess.TimeoutExpired:
|
||||
raise RuntimeError("Instance start timed out")
|
||||
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
def execute(
|
||||
self, command: str, cwd: str = "", *, timeout: int | None = None, stdin_data: str | None = None
|
||||
) -> dict:
|
||||
if not self._instance_started:
|
||||
return {"output": "Instance not started", "returncode": -1}
|
||||
|
||||
|
|
@ -235,16 +238,16 @@ class SingularityEnvironment(BaseEnvironment):
|
|||
exec_command = f"cd {work_dir} && {exec_command}"
|
||||
work_dir = "/tmp"
|
||||
|
||||
cmd = [self.executable, "exec", "--pwd", work_dir,
|
||||
f"instance://{self.instance_id}",
|
||||
"bash", "-c", exec_command]
|
||||
cmd = [self.executable, "exec", "--pwd", work_dir, f"instance://{self.instance_id}", "bash", "-c", exec_command]
|
||||
|
||||
try:
|
||||
import time as _time
|
||||
|
||||
_output_chunks = []
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
stdin=subprocess.PIPE if stdin_data else subprocess.DEVNULL,
|
||||
text=True,
|
||||
)
|
||||
|
|
@ -295,7 +298,9 @@ class SingularityEnvironment(BaseEnvironment):
|
|||
try:
|
||||
subprocess.run(
|
||||
[self.executable, "instance", "stop", self.instance_id],
|
||||
capture_output=True, text=True, timeout=30,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
logger.info("Singularity instance %s stopped", self.instance_id)
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@ -24,8 +24,7 @@ class SSHEnvironment(BaseEnvironment):
|
|||
and a remote kill is attempted over the ControlMaster socket.
|
||||
"""
|
||||
|
||||
def __init__(self, host: str, user: str, cwd: str = "~",
|
||||
timeout: int = 60, port: int = 22, key_path: str = ""):
|
||||
def __init__(self, host: str, user: str, cwd: str = "~", timeout: int = 60, port: int = 22, key_path: str = ""):
|
||||
super().__init__(cwd=cwd, timeout=timeout)
|
||||
self.host = host
|
||||
self.user = user
|
||||
|
|
@ -65,12 +64,12 @@ class SSHEnvironment(BaseEnvironment):
|
|||
except subprocess.TimeoutExpired:
|
||||
raise RuntimeError(f"SSH connection to {self.user}@{self.host} timed out")
|
||||
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
def execute(
|
||||
self, command: str, cwd: str = "", *, timeout: int | None = None, stdin_data: str | None = None
|
||||
) -> dict:
|
||||
work_dir = cwd or self.cwd
|
||||
exec_command = self._prepare_command(command)
|
||||
wrapped = f'cd {work_dir} && {exec_command}'
|
||||
wrapped = f"cd {work_dir} && {exec_command}"
|
||||
effective_timeout = timeout or self.timeout
|
||||
|
||||
cmd = self._build_ssh_command()
|
||||
|
|
@ -136,8 +135,7 @@ class SSHEnvironment(BaseEnvironment):
|
|||
def cleanup(self):
|
||||
if self.control_socket.exists():
|
||||
try:
|
||||
cmd = ["ssh", "-o", f"ControlPath={self.control_socket}",
|
||||
"-O", "exit", f"{self.user}@{self.host}"]
|
||||
cmd = ["ssh", "-o", f"ControlPath={self.control_socket}", "-O", "exit", f"{self.user}@{self.host}"]
|
||||
subprocess.run(cmd, capture_output=True, timeout=5)
|
||||
except (OSError, subprocess.SubprocessError):
|
||||
pass
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue