Refactor Terminal and AIAgent cleanup

This commit is contained in:
teknium1 2026-02-21 22:31:43 -08:00
parent 9018e9dd70
commit 9123cfb5dd
17 changed files with 1842 additions and 976 deletions

View file

@ -0,0 +1,13 @@
"""Hermes execution environment backends.
Each backend provides the same interface (BaseEnvironment ABC) for running
shell commands in a specific execution context: local, Docker, Singularity,
SSH, or Modal.
The terminal_tool.py factory (_create_environment) selects the backend
based on the TERMINAL_ENV configuration.
"""
from tools.environments.base import BaseEnvironment
__all__ = ["BaseEnvironment"]

View file

@ -0,0 +1,72 @@
"""Base class for all Hermes execution environment backends."""
from abc import ABC, abstractmethod
import subprocess
class BaseEnvironment(ABC):
"""Common interface for all Hermes execution backends.
Subclasses implement execute() and cleanup(). Shared helpers eliminate
duplicated subprocess boilerplate across backends.
"""
def __init__(self, cwd: str, timeout: int, env: dict = None):
self.cwd = cwd
self.timeout = timeout
self.env = env or {}
@abstractmethod
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
"""Execute a command, return {"output": str, "returncode": int}."""
...
@abstractmethod
def cleanup(self):
"""Release backend resources (container, instance, connection)."""
...
def stop(self):
"""Alias for cleanup (compat with older callers)."""
self.cleanup()
def __del__(self):
try:
self.cleanup()
except Exception:
pass
# ------------------------------------------------------------------
# Shared helpers (eliminate duplication across backends)
# ------------------------------------------------------------------
def _prepare_command(self, command: str) -> str:
"""Transform sudo commands if SUDO_PASSWORD is available."""
from tools.terminal_tool import _transform_sudo_command
return _transform_sudo_command(command)
def _build_run_kwargs(self, timeout: int | None,
stdin_data: str | None = None) -> dict:
"""Build common subprocess.run kwargs for non-interactive execution."""
kw = {
"text": True,
"timeout": timeout or self.timeout,
"encoding": "utf-8",
"errors": "replace",
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
}
if stdin_data is not None:
kw["input"] = stdin_data
else:
kw["stdin"] = subprocess.DEVNULL
return kw
def _timeout_result(self, timeout: int | None) -> dict:
"""Standard return dict when a command times out."""
return {
"output": f"Command timed out after {timeout or self.timeout}s",
"returncode": 124,
}

View file

@ -0,0 +1,47 @@
"""Docker execution environment wrapping mini-swe-agent's DockerEnvironment."""
import os
import subprocess
from tools.environments.base import BaseEnvironment
class DockerEnvironment(BaseEnvironment):
"""Docker container execution via mini-swe-agent.
Wraps the upstream DockerEnvironment and adds non-blocking stdin
and sudo -S support.
"""
def __init__(self, image: str, cwd: str = "/", timeout: int = 60):
super().__init__(cwd=cwd, timeout=timeout)
from minisweagent.environments.docker import DockerEnvironment as _Docker
self._inner = _Docker(image=image, cwd=cwd, timeout=timeout)
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
exec_command = self._prepare_command(command)
work_dir = cwd or self.cwd
effective_timeout = timeout or self.timeout
assert self._inner.container_id, "Container not started"
cmd = [self._inner.config.executable, "exec"]
if stdin_data is not None:
cmd.append("-i")
cmd.extend(["-w", work_dir])
for key in self._inner.config.forward_env:
if (value := os.getenv(key)) is not None:
cmd.extend(["-e", f"{key}={value}"])
for key, value in self._inner.config.env.items():
cmd.extend(["-e", f"{key}={value}"])
cmd.extend([self._inner.container_id, "bash", "-lc", exec_command])
try:
result = subprocess.run(cmd, **self._build_run_kwargs(timeout, stdin_data))
return {"output": result.stdout, "returncode": result.returncode}
except subprocess.TimeoutExpired:
return self._timeout_result(effective_timeout)
def cleanup(self):
self._inner.cleanup()

103
tools/environments/local.py Normal file
View file

@ -0,0 +1,103 @@
"""Local execution environment with interrupt support and non-blocking I/O."""
import os
import signal
import subprocess
import threading
import time
from tools.environments.base import BaseEnvironment
class LocalEnvironment(BaseEnvironment):
"""Run commands directly on the host machine.
Features:
- Popen + polling for interrupt support (user can cancel mid-command)
- Background stdout drain thread to prevent pipe buffer deadlocks
- stdin_data support for piping content (bypasses ARG_MAX limits)
- sudo -S transform via SUDO_PASSWORD env var
"""
def __init__(self, cwd: str = "", timeout: int = 60, env: dict = None):
super().__init__(cwd=cwd or os.getcwd(), timeout=timeout, env=env)
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
from tools.terminal_tool import _interrupt_event
work_dir = cwd or self.cwd or os.getcwd()
effective_timeout = timeout or self.timeout
exec_command = self._prepare_command(command)
try:
proc = subprocess.Popen(
exec_command,
shell=True,
text=True,
cwd=work_dir,
env=os.environ | self.env,
encoding="utf-8",
errors="replace",
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if stdin_data is not None else subprocess.DEVNULL,
preexec_fn=os.setsid,
)
if stdin_data is not None:
def _write_stdin():
try:
proc.stdin.write(stdin_data)
proc.stdin.close()
except (BrokenPipeError, OSError):
pass
threading.Thread(target=_write_stdin, daemon=True).start()
_output_chunks: list[str] = []
def _drain_stdout():
try:
for line in proc.stdout:
_output_chunks.append(line)
except ValueError:
pass
finally:
try:
proc.stdout.close()
except Exception:
pass
reader = threading.Thread(target=_drain_stdout, daemon=True)
reader.start()
deadline = time.monotonic() + effective_timeout
while proc.poll() is None:
if _interrupt_event.is_set():
try:
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
except (ProcessLookupError, PermissionError):
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted — user sent a new message]",
"returncode": 130,
}
if time.monotonic() > deadline:
try:
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
except (ProcessLookupError, PermissionError):
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
time.sleep(0.2)
reader.join(timeout=5)
return {"output": "".join(_output_chunks), "returncode": proc.returncode}
except Exception as e:
return {"output": f"Execution error: {str(e)}", "returncode": 1}
def cleanup(self):
pass

View file

@ -0,0 +1,49 @@
"""Modal cloud execution environment wrapping mini-swe-agent's SwerexModalEnvironment."""
import uuid
from tools.environments.base import BaseEnvironment
class ModalEnvironment(BaseEnvironment):
"""Modal cloud execution via mini-swe-agent.
Wraps SwerexModalEnvironment and adds sudo -S support.
Async-safety patches are applied once before first use so Modal
works inside any event loop (Atropos, gateway, etc.).
"""
_patches_applied = False
def __init__(self, image: str, cwd: str = "/root", timeout: int = 60):
super().__init__(cwd=cwd, timeout=timeout)
if not ModalEnvironment._patches_applied:
try:
from environments.patches import apply_patches
apply_patches()
except ImportError:
pass
ModalEnvironment._patches_applied = True
from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment
self._inner = SwerexModalEnvironment(
image=image, cwd=cwd, timeout=timeout,
startup_timeout=180.0, runtime_timeout=3600.0,
)
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
if stdin_data is not None:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
while marker in stdin_data:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"
exec_command = self._prepare_command(command)
return self._inner.execute(exec_command, cwd=cwd, timeout=timeout)
def cleanup(self):
if hasattr(self._inner, 'stop'):
self._inner.stop()

View file

@ -0,0 +1,174 @@
"""Singularity/Apptainer persistent container environment.
Also contains the Singularity-specific helpers: scratch dir management,
Apptainer cache, and SIF image building.
"""
import logging
import os
import shutil
import subprocess
import tempfile
import threading
import uuid
from pathlib import Path
from tools.environments.base import BaseEnvironment
logger = logging.getLogger(__name__)
# -------------------------------------------------------------------------
# Singularity helpers (scratch dir, SIF cache, SIF building)
# -------------------------------------------------------------------------
def _get_scratch_dir() -> Path:
"""Get the best directory for Singularity sandboxes -- prefers /scratch on HPC."""
custom_scratch = os.getenv("TERMINAL_SCRATCH_DIR")
if custom_scratch:
scratch_path = Path(custom_scratch)
scratch_path.mkdir(parents=True, exist_ok=True)
return scratch_path
scratch = Path("/scratch")
if scratch.exists() and os.access(scratch, os.W_OK):
user_scratch = scratch / os.getenv("USER", "hermes") / "hermes-agent"
user_scratch.mkdir(parents=True, exist_ok=True)
logger.info("Using /scratch for sandboxes: %s", user_scratch)
return user_scratch
logger.debug("/scratch not available, using /tmp for sandboxes")
return Path(tempfile.gettempdir())
def _get_apptainer_cache_dir() -> Path:
"""Get the Apptainer cache directory for SIF images."""
cache_dir = os.getenv("APPTAINER_CACHEDIR")
if cache_dir:
cache_path = Path(cache_dir)
cache_path.mkdir(parents=True, exist_ok=True)
return cache_path
scratch = _get_scratch_dir()
cache_path = scratch / ".apptainer"
cache_path.mkdir(parents=True, exist_ok=True)
return cache_path
_sif_build_lock = threading.Lock()
def _get_or_build_sif(image: str, executable: str = "apptainer") -> str:
"""Get or build a SIF image from a docker:// URL.
Returns the path unchanged if it's already a .sif file.
For docker:// URLs, checks the cache and builds if needed.
"""
if image.endswith('.sif') and Path(image).exists():
return image
if not image.startswith('docker://'):
return image
image_name = image.replace('docker://', '').replace('/', '-').replace(':', '-')
cache_dir = _get_apptainer_cache_dir()
sif_path = cache_dir / f"{image_name}.sif"
if sif_path.exists():
return str(sif_path)
with _sif_build_lock:
if sif_path.exists():
return str(sif_path)
logger.info("Building SIF image (one-time setup)...")
logger.info(" Source: %s", image)
logger.info(" Target: %s", sif_path)
tmp_dir = cache_dir / "tmp"
tmp_dir.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env["APPTAINER_TMPDIR"] = str(tmp_dir)
env["APPTAINER_CACHEDIR"] = str(cache_dir)
try:
result = subprocess.run(
[executable, "build", str(sif_path), image],
capture_output=True, text=True, timeout=600, env=env,
)
if result.returncode != 0:
logger.warning("SIF build failed, falling back to docker:// URL")
logger.warning(" Error: %s", result.stderr[:500])
return image
logger.info("SIF image built successfully")
return str(sif_path)
except subprocess.TimeoutExpired:
logger.warning("SIF build timed out, falling back to docker:// URL")
if sif_path.exists():
sif_path.unlink()
return image
except Exception as e:
logger.warning("SIF build error: %s, falling back to docker:// URL", e)
return image
# -------------------------------------------------------------------------
# SingularityEnvironment
# -------------------------------------------------------------------------
class SingularityEnvironment(BaseEnvironment):
"""Persistent Singularity/Apptainer container environment.
Uses ``apptainer instance`` to create a long-running container that persists
state across all commands within a task.
"""
def __init__(self, image: str, cwd: str = "/root", timeout: int = 60):
super().__init__(cwd=cwd, timeout=timeout)
self.executable = "apptainer" if shutil.which("apptainer") else "singularity"
self.image = _get_or_build_sif(image, self.executable)
self.instance_id = f"hermes_{uuid.uuid4().hex[:12]}"
self._instance_started = False
self._start_instance()
def _start_instance(self):
cmd = [
self.executable, "instance", "start",
"--writable-tmpfs", "--containall",
str(self.image), self.instance_id,
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
raise RuntimeError(f"Failed to start instance: {result.stderr}")
self._instance_started = True
logger.info("Singularity instance %s started", self.instance_id)
except subprocess.TimeoutExpired:
raise RuntimeError("Instance start timed out")
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
if not self._instance_started:
return {"output": "Instance not started", "returncode": -1}
cmd = [self.executable, "exec", "--pwd", cwd or self.cwd,
f"instance://{self.instance_id}",
"bash", "-c", self._prepare_command(command)]
try:
result = subprocess.run(cmd, **self._build_run_kwargs(timeout, stdin_data))
return {"output": result.stdout, "returncode": result.returncode}
except subprocess.TimeoutExpired:
return self._timeout_result(timeout)
def cleanup(self):
if self._instance_started:
try:
subprocess.run(
[self.executable, "instance", "stop", self.instance_id],
capture_output=True, text=True, timeout=30,
)
logger.info("Singularity instance %s stopped", self.instance_id)
except Exception as e:
logger.warning("Failed to stop Singularity instance %s: %s", self.instance_id, e)
self._instance_started = False

91
tools/environments/ssh.py Normal file
View file

@ -0,0 +1,91 @@
"""SSH remote execution environment with ControlMaster connection persistence."""
import logging
import subprocess
import tempfile
from pathlib import Path
from tools.environments.base import BaseEnvironment
logger = logging.getLogger(__name__)
class SSHEnvironment(BaseEnvironment):
"""Run commands on a remote machine over SSH.
Uses SSH ControlMaster for connection persistence so subsequent
commands are fast. Security benefit: the agent cannot modify its
own code since execution happens on a separate machine.
"""
def __init__(self, host: str, user: str, cwd: str = "/tmp",
timeout: int = 60, port: int = 22, key_path: str = ""):
super().__init__(cwd=cwd, timeout=timeout)
self.host = host
self.user = user
self.port = port
self.key_path = key_path
self.control_dir = Path(tempfile.gettempdir()) / "hermes-ssh"
self.control_dir.mkdir(parents=True, exist_ok=True)
self.control_socket = self.control_dir / f"{user}@{host}:{port}.sock"
self._establish_connection()
def _build_ssh_command(self, extra_args: list = None) -> list:
cmd = ["ssh"]
cmd.extend(["-o", f"ControlPath={self.control_socket}"])
cmd.extend(["-o", "ControlMaster=auto"])
cmd.extend(["-o", "ControlPersist=300"])
cmd.extend(["-o", "BatchMode=yes"])
cmd.extend(["-o", "StrictHostKeyChecking=accept-new"])
cmd.extend(["-o", "ConnectTimeout=10"])
if self.port != 22:
cmd.extend(["-p", str(self.port)])
if self.key_path:
cmd.extend(["-i", self.key_path])
if extra_args:
cmd.extend(extra_args)
cmd.append(f"{self.user}@{self.host}")
return cmd
def _establish_connection(self):
cmd = self._build_ssh_command()
cmd.append("echo 'SSH connection established'")
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
if result.returncode != 0:
error_msg = result.stderr.strip() or result.stdout.strip()
raise RuntimeError(f"SSH connection failed: {error_msg}")
except subprocess.TimeoutExpired:
raise RuntimeError(f"SSH connection to {self.user}@{self.host} timed out")
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
work_dir = cwd or self.cwd
exec_command = self._prepare_command(command)
wrapped = f'cd {work_dir} && {exec_command}'
cmd = self._build_ssh_command()
cmd.extend(["bash", "-c", wrapped])
try:
result = subprocess.run(cmd, **self._build_run_kwargs(timeout, stdin_data))
return {"output": result.stdout, "returncode": result.returncode}
except subprocess.TimeoutExpired:
return self._timeout_result(timeout)
except Exception as e:
return {"output": f"SSH execution error: {str(e)}", "returncode": 1}
def cleanup(self):
if self.control_socket.exists():
try:
cmd = ["ssh", "-o", f"ControlPath={self.control_socket}",
"-O", "exit", f"{self.user}@{self.host}"]
subprocess.run(cmd, capture_output=True, timeout=5)
except (OSError, subprocess.SubprocessError):
pass
try:
self.control_socket.unlink()
except OSError:
pass