mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
The Docker backend docs claim "Single persistent container — ONE long- lived container shared across sessions, /new, /reset, and delegate_task subagents. Stopped/removed on shutdown." In practice the code only honored that contract within a single Python process via the in-memory \`_active_environments[task_id]\` cache. Every \`hermes chat\` invocation spawned a fresh \`hermes-<hex>\` container; older containers piled up in \`Exited\` state and accumulated until manual \`docker rm\` (issue #20561). Three root causes, all addressed by this commit: 1. No cross-process container discovery. 2. \`cleanup()\` used fire-and-forget \`subprocess.Popen("... &", shell=True)\` which raced with parent-process exit — when Python exited promptly the detached shell child got killed mid-\`docker stop\`, leaving stopped containers behind. 3. The \`docker rm\` step in cleanup was gated on \`not self._persistent\` (the bind-mount-persistence flag). Default config sets \`container_persistent: true\`, so the default happy path skipped \`rm\` entirely — even when the user explicitly didn't want cross-process reuse, containers leaked. Fix: * Add \`DockerEnvironment.__init__(persist_across_processes=True)\`. When true, init probes \`docker ps -a --filter label=hermes-agent=1 --filter label=hermes-task-id=<task> --filter label=hermes-profile=<profile>\` and reuses a matching container (running → attach; stopped → \`docker start\` → attach; \`docker start\` failure → fall through to a fresh \`docker run\`). Multiple matches prefer the running one, with the stragglers left for the orphan reaper (next commit) to clean up. * Rewrite \`cleanup()\`. Uses \`subprocess.run(..., timeout=30)\` on a daemon \`threading.Thread\`, not the racy \`Popen(... &)\`. The \`_persistent\` guard is dropped on the \`rm\` step — \`rm\` now runs whenever \`persist_across_processes\` is false, regardless of the bind-mount-persistence setting. The leak class is gone in all combinations. * Add \`wait_for_cleanup(timeout)\`. \`tools/terminal_tool.py\`'s atexit hook calls this on every active env, blocking up to 15s for the cleanup thread before interpreter exit. Without this, \`hermes /quit\` raced the daemon-thread teardown and dropped the stop/rm work. * New config \`terminal.docker_persist_across_processes\` (default \`true\` — restores the documented contract). Set \`false\` for hard per-process isolation. Wired through all four config-bridge sites (cli.py env_mappings, gateway/run.py _terminal_env_map, hermes_cli/config.py _config_to_env_sync, tests/conftest.py env-strip list); regression-pinned by \`test_docker_persist_across_processes_is_bridged_everywhere\` matching the existing pattern for docker_run_as_host_user / docker_env. Reuse intentionally does NOT compare image / mounts / resources — only the labels. Operators changing those settings should set \`docker_persist_across_processes: false\` (or \`docker rm -f\` the labeled container) to force a fresh start. This keeps the probe cheap and the failure mode obvious. Coverage: 12 new unit tests in tests/tools/test_docker_environment.py covering reuse paths (running, stopped, fallback, opt-out, duplicate preference) and cleanup behavior (persist-mode no-rm, opt-out always-rm, no-Popen, wait_for_cleanup semantics, partial-init safety). Plus one config-bridge regression pin. Refs #20561
872 lines
35 KiB
Python
872 lines
35 KiB
Python
"""Docker execution environment for sandboxed command execution.
|
|
|
|
Security hardened (cap-drop ALL, no-new-privileges, PID limits),
|
|
configurable resource limits (CPU, memory, disk), and optional filesystem
|
|
persistence via bind mounts.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import uuid
|
|
from typing import Optional
|
|
|
|
from tools.environments.base import BaseEnvironment, _popen_bash
|
|
from tools.environments.local import _HERMES_PROVIDER_ENV_BLOCKLIST
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Common Docker Desktop install paths checked when 'docker' is not in PATH.
|
|
# macOS Intel: /usr/local/bin, macOS Apple Silicon (Homebrew): /opt/homebrew/bin,
|
|
# Docker Desktop app bundle: /Applications/Docker.app/Contents/Resources/bin
|
|
_DOCKER_SEARCH_PATHS = [
|
|
"/usr/local/bin/docker",
|
|
"/opt/homebrew/bin/docker",
|
|
"/Applications/Docker.app/Contents/Resources/bin/docker",
|
|
]
|
|
|
|
_docker_executable: Optional[str] = None # resolved once, cached
|
|
_ENV_VAR_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
|
|
|
|
|
|
def _normalize_forward_env_names(forward_env: list[str] | None) -> list[str]:
|
|
"""Return a deduplicated list of valid environment variable names."""
|
|
normalized: list[str] = []
|
|
seen: set[str] = set()
|
|
|
|
for item in forward_env or []:
|
|
if not isinstance(item, str):
|
|
logger.warning("Ignoring non-string docker_forward_env entry: %r", item)
|
|
continue
|
|
|
|
key = item.strip()
|
|
if not key:
|
|
continue
|
|
if not _ENV_VAR_NAME_RE.match(key):
|
|
logger.warning("Ignoring invalid docker_forward_env entry: %r", item)
|
|
continue
|
|
if key in seen:
|
|
continue
|
|
|
|
seen.add(key)
|
|
normalized.append(key)
|
|
|
|
return normalized
|
|
|
|
|
|
def _normalize_env_dict(env: dict | None) -> dict[str, str]:
|
|
"""Validate and normalize a docker_env dict to {str: str}.
|
|
|
|
Filters out entries with invalid variable names or non-string values.
|
|
"""
|
|
if not env:
|
|
return {}
|
|
if not isinstance(env, dict):
|
|
logger.warning("docker_env is not a dict: %r", env)
|
|
return {}
|
|
|
|
normalized: dict[str, str] = {}
|
|
for key, value in env.items():
|
|
if not isinstance(key, str) or not _ENV_VAR_NAME_RE.match(key.strip()):
|
|
logger.warning("Ignoring invalid docker_env key: %r", key)
|
|
continue
|
|
key = key.strip()
|
|
if not isinstance(value, str):
|
|
# Coerce simple scalar types (int, bool, float) to string;
|
|
# reject complex types.
|
|
if isinstance(value, (int, float, bool)):
|
|
value = str(value)
|
|
else:
|
|
logger.warning("Ignoring non-string docker_env value for %r: %r", key, value)
|
|
continue
|
|
normalized[key] = value
|
|
|
|
return normalized
|
|
|
|
|
|
def _load_hermes_env_vars() -> dict[str, str]:
|
|
"""Load ~/.hermes/.env values without failing Docker command execution."""
|
|
try:
|
|
from hermes_cli.config import load_env
|
|
|
|
return load_env() or {}
|
|
except Exception:
|
|
return {}
|
|
|
|
|
|
# Docker label values must match [a-zA-Z0-9_.-] and stay ≤63 chars to round-trip
|
|
# safely through `docker ps --filter label=key=value`. Profile and task names
|
|
# can technically contain other characters; sanitize defensively.
|
|
_LABEL_VALUE_OK_RE = re.compile(r"[^A-Za-z0-9_.-]")
|
|
|
|
|
|
def _sanitize_label_value(value: str) -> str:
|
|
"""Coerce *value* into a Docker label-safe form (alnum + ``_.-``, ≤63 chars).
|
|
|
|
Empty or all-invalid inputs collapse to ``"unknown"`` so the resulting
|
|
label is always queryable. Used at container-create time; never round-trip
|
|
a sanitized value back into application logic.
|
|
"""
|
|
if not isinstance(value, str) or not value:
|
|
return "unknown"
|
|
cleaned = _LABEL_VALUE_OK_RE.sub("_", value)
|
|
cleaned = cleaned[:63] or "unknown"
|
|
return cleaned
|
|
|
|
|
|
def _get_active_profile_name() -> str:
|
|
"""Return the active Hermes profile name, or ``"default"`` on any error.
|
|
|
|
Resolved at container-create time so a single container is permanently
|
|
tagged with the profile that created it. Profile switches inside the
|
|
same process don't retroactively relabel running containers.
|
|
"""
|
|
try:
|
|
from hermes_cli.profiles import get_active_profile_name
|
|
|
|
return get_active_profile_name() or "default"
|
|
except Exception:
|
|
return "default"
|
|
|
|
|
|
def find_docker() -> Optional[str]:
|
|
"""Locate the docker (or podman) CLI binary.
|
|
|
|
Resolution order:
|
|
1. ``HERMES_DOCKER_BINARY`` env var — explicit override (e.g. ``/usr/bin/podman``)
|
|
2. ``docker`` on PATH via ``shutil.which``
|
|
3. ``podman`` on PATH via ``shutil.which``
|
|
4. Well-known macOS Docker Desktop install locations
|
|
|
|
Returns the absolute path, or ``None`` if neither runtime can be found.
|
|
"""
|
|
global _docker_executable
|
|
if _docker_executable is not None:
|
|
return _docker_executable
|
|
|
|
# 1. Explicit override via env var (e.g. for Podman on immutable distros)
|
|
override = os.getenv("HERMES_DOCKER_BINARY")
|
|
if override and os.path.isfile(override) and os.access(override, os.X_OK):
|
|
_docker_executable = override
|
|
logger.info("Using HERMES_DOCKER_BINARY override: %s", override)
|
|
return override
|
|
|
|
# 2. docker on PATH
|
|
found = shutil.which("docker")
|
|
if found:
|
|
_docker_executable = found
|
|
return found
|
|
|
|
# 3. podman on PATH (drop-in compatible for our use case)
|
|
found = shutil.which("podman")
|
|
if found:
|
|
_docker_executable = found
|
|
logger.info("Using podman as container runtime: %s", found)
|
|
return found
|
|
|
|
# 4. Well-known macOS Docker Desktop locations
|
|
for path in _DOCKER_SEARCH_PATHS:
|
|
if os.path.isfile(path) and os.access(path, os.X_OK):
|
|
_docker_executable = path
|
|
logger.info("Found docker at non-PATH location: %s", path)
|
|
return path
|
|
|
|
return None
|
|
|
|
|
|
# Security flags applied to every container.
|
|
# The container itself is the security boundary (isolated from host).
|
|
# We drop all capabilities then add back the minimum needed:
|
|
# DAC_OVERRIDE - root can write to bind-mounted dirs owned by host user
|
|
# CHOWN/FOWNER - package managers (pip, npm, apt) need to set file ownership
|
|
# SETUID/SETGID - the image's init drops from root to the 'hermes'
|
|
# user (via `s6-setuidgid` in the bundled image, or whatever
|
|
# privilege-drop helper a user image uses), which requires these
|
|
# caps. Combined with `no-new-privileges`, the dropped process
|
|
# still cannot escalate back to root, so the security posture is
|
|
# preserved. Omitted entirely when the container starts as a
|
|
# non-root user via --user, since no privilege drop is needed
|
|
# in that mode.
|
|
# Block privilege escalation and limit PIDs.
|
|
# /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds).
|
|
_BASE_SECURITY_ARGS = [
|
|
"--cap-drop", "ALL",
|
|
"--cap-add", "DAC_OVERRIDE",
|
|
"--cap-add", "CHOWN",
|
|
"--cap-add", "FOWNER",
|
|
"--security-opt", "no-new-privileges",
|
|
"--pids-limit", "256",
|
|
"--tmpfs", "/tmp:rw,nosuid,size=512m",
|
|
"--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m",
|
|
"--tmpfs", "/run:rw,noexec,nosuid,size=64m",
|
|
]
|
|
|
|
# Extra caps needed when the container starts as root and an init/entrypoint
|
|
# must drop privileges (via `s6-setuidgid`, `gosu`, `su`, or similar).
|
|
# Skipped when --user is passed because the container already starts
|
|
# unprivileged and never needs to switch.
|
|
_PRIVDROP_CAP_ARGS = [
|
|
"--cap-add", "SETUID",
|
|
"--cap-add", "SETGID",
|
|
]
|
|
|
|
|
|
def _build_security_args(run_as_host_user: bool) -> list[str]:
|
|
"""Return the security/cap/tmpfs args tailored to the privilege mode."""
|
|
if run_as_host_user:
|
|
return list(_BASE_SECURITY_ARGS)
|
|
return list(_BASE_SECURITY_ARGS) + list(_PRIVDROP_CAP_ARGS)
|
|
|
|
|
|
def _resolve_host_user_spec() -> Optional[str]:
|
|
"""Return ``<uid>:<gid>`` for the current host user, or ``None`` on platforms
|
|
where this is not meaningful (e.g. Windows without posix ids).
|
|
|
|
We intentionally read ``os.getuid()``/``os.getgid()`` directly rather than
|
|
going through ``getpass``/``pwd`` so this stays cheap and never raises on
|
|
nameless UIDs (nss lookups can fail inside sandboxed launchers).
|
|
"""
|
|
get_uid = getattr(os, "getuid", None)
|
|
get_gid = getattr(os, "getgid", None)
|
|
if get_uid is None or get_gid is None:
|
|
return None
|
|
try:
|
|
return f"{get_uid()}:{get_gid()}"
|
|
except Exception: # pragma: no cover - defensive
|
|
return None
|
|
|
|
|
|
_storage_opt_ok: Optional[bool] = None # cached result across instances
|
|
|
|
|
|
def _ensure_docker_available() -> None:
|
|
"""Best-effort check that the docker CLI is available before use.
|
|
|
|
Reuses ``find_docker()`` so this preflight stays consistent with the rest of
|
|
the Docker backend, including known non-PATH Docker Desktop locations.
|
|
"""
|
|
docker_exe = find_docker()
|
|
if not docker_exe:
|
|
logger.error(
|
|
"Docker backend selected but no docker executable was found in PATH "
|
|
"or known install locations. Install Docker Desktop and ensure the "
|
|
"CLI is available."
|
|
)
|
|
raise RuntimeError(
|
|
"Docker executable not found in PATH or known install locations. "
|
|
"Install Docker and ensure the 'docker' command is available."
|
|
)
|
|
|
|
try:
|
|
result = subprocess.run(
|
|
[docker_exe, "version"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5,
|
|
)
|
|
except FileNotFoundError:
|
|
logger.error(
|
|
"Docker backend selected but the resolved docker executable '%s' could "
|
|
"not be executed.",
|
|
docker_exe,
|
|
exc_info=True,
|
|
)
|
|
raise RuntimeError(
|
|
"Docker executable could not be executed. Check your Docker installation."
|
|
)
|
|
except subprocess.TimeoutExpired:
|
|
logger.error(
|
|
"Docker backend selected but '%s version' timed out. "
|
|
"The Docker daemon may not be running.",
|
|
docker_exe,
|
|
exc_info=True,
|
|
)
|
|
raise RuntimeError(
|
|
"Docker daemon is not responding. Ensure Docker is running and try again."
|
|
)
|
|
except Exception:
|
|
logger.error(
|
|
"Unexpected error while checking Docker availability.",
|
|
exc_info=True,
|
|
)
|
|
raise
|
|
else:
|
|
if result.returncode != 0:
|
|
logger.error(
|
|
"Docker backend selected but '%s version' failed "
|
|
"(exit code %d, stderr=%s)",
|
|
docker_exe,
|
|
result.returncode,
|
|
result.stderr.strip(),
|
|
)
|
|
raise RuntimeError(
|
|
"Docker command is available but 'docker version' failed. "
|
|
"Check your Docker installation."
|
|
)
|
|
|
|
|
|
class DockerEnvironment(BaseEnvironment):
|
|
"""Hardened Docker container execution with resource limits and persistence.
|
|
|
|
Security: all capabilities dropped, no privilege escalation, PID limits,
|
|
size-limited tmpfs for scratch dirs. The container itself is the security
|
|
boundary — the filesystem inside is writable so agents can install packages
|
|
(pip, npm, apt) as needed. Writable workspace via tmpfs or bind mounts.
|
|
|
|
Persistence: when enabled, bind mounts preserve /workspace and /root
|
|
across container restarts.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
image: str,
|
|
cwd: str = "/root",
|
|
timeout: int = 60,
|
|
cpu: float = 0,
|
|
memory: int = 0,
|
|
disk: int = 0,
|
|
persistent_filesystem: bool = False,
|
|
task_id: str = "default",
|
|
volumes: list = None,
|
|
forward_env: list[str] | None = None,
|
|
env: dict | None = None,
|
|
network: bool = True,
|
|
host_cwd: str = None,
|
|
auto_mount_cwd: bool = False,
|
|
run_as_host_user: bool = False,
|
|
extra_args: list = None,
|
|
persist_across_processes: bool = True,
|
|
):
|
|
if cwd == "~":
|
|
cwd = "/root"
|
|
super().__init__(cwd=cwd, timeout=timeout)
|
|
self._persistent = persistent_filesystem
|
|
self._persist_across_processes = persist_across_processes
|
|
self._task_id = task_id
|
|
self._forward_env = _normalize_forward_env_names(forward_env)
|
|
self._env = _normalize_env_dict(env)
|
|
self._container_id: Optional[str] = None
|
|
self._labels: dict[str, str] = {}
|
|
logger.info(f"DockerEnvironment volumes: {volumes}")
|
|
# Ensure volumes is a list (config.yaml could be malformed)
|
|
if volumes is not None and not isinstance(volumes, list):
|
|
logger.warning(f"docker_volumes config is not a list: {volumes!r}")
|
|
volumes = []
|
|
|
|
# Fail fast if Docker is not available.
|
|
_ensure_docker_available()
|
|
|
|
# Build resource limit args
|
|
resource_args = []
|
|
if cpu > 0:
|
|
resource_args.extend(["--cpus", str(cpu)])
|
|
if memory > 0:
|
|
resource_args.extend(["--memory", f"{memory}m"])
|
|
if disk > 0 and sys.platform != "darwin":
|
|
if self._storage_opt_supported():
|
|
resource_args.extend(["--storage-opt", f"size={disk}m"])
|
|
else:
|
|
logger.warning(
|
|
"Docker storage driver does not support per-container disk limits "
|
|
"(requires overlay2 on XFS with pquota). Container will run without disk quota."
|
|
)
|
|
if not network:
|
|
resource_args.append("--network=none")
|
|
|
|
# Persistent workspace via bind mounts from a configurable host directory
|
|
# (TERMINAL_SANDBOX_DIR, default ~/.hermes/sandboxes/). Non-persistent
|
|
# mode uses tmpfs (ephemeral, fast, gone on cleanup).
|
|
from tools.environments.base import get_sandbox_dir
|
|
|
|
# User-configured volume mounts (from config.yaml docker_volumes)
|
|
volume_args = []
|
|
workspace_explicitly_mounted = False
|
|
for vol in (volumes or []):
|
|
if not isinstance(vol, str):
|
|
logger.warning(f"Docker volume entry is not a string: {vol!r}")
|
|
continue
|
|
vol = vol.strip()
|
|
if not vol:
|
|
continue
|
|
if ":" in vol:
|
|
volume_args.extend(["-v", vol])
|
|
if ":/workspace" in vol:
|
|
workspace_explicitly_mounted = True
|
|
else:
|
|
logger.warning(f"Docker volume '{vol}' missing colon, skipping")
|
|
|
|
host_cwd_abs = os.path.abspath(os.path.expanduser(host_cwd)) if host_cwd else ""
|
|
bind_host_cwd = (
|
|
auto_mount_cwd
|
|
and bool(host_cwd_abs)
|
|
and os.path.isdir(host_cwd_abs)
|
|
and not workspace_explicitly_mounted
|
|
)
|
|
if auto_mount_cwd and host_cwd and not os.path.isdir(host_cwd_abs):
|
|
logger.debug(f"Skipping docker cwd mount: host_cwd is not a valid directory: {host_cwd}")
|
|
|
|
self._workspace_dir: Optional[str] = None
|
|
self._home_dir: Optional[str] = None
|
|
writable_args = []
|
|
if self._persistent:
|
|
sandbox = get_sandbox_dir() / "docker" / task_id
|
|
self._home_dir = str(sandbox / "home")
|
|
os.makedirs(self._home_dir, exist_ok=True)
|
|
writable_args.extend([
|
|
"-v", f"{self._home_dir}:/root",
|
|
])
|
|
if not bind_host_cwd and not workspace_explicitly_mounted:
|
|
self._workspace_dir = str(sandbox / "workspace")
|
|
os.makedirs(self._workspace_dir, exist_ok=True)
|
|
writable_args.extend([
|
|
"-v", f"{self._workspace_dir}:/workspace",
|
|
])
|
|
else:
|
|
if not bind_host_cwd and not workspace_explicitly_mounted:
|
|
writable_args.extend([
|
|
"--tmpfs", "/workspace:rw,exec,size=10g",
|
|
])
|
|
writable_args.extend([
|
|
"--tmpfs", "/home:rw,exec,size=1g",
|
|
"--tmpfs", "/root:rw,exec,size=1g",
|
|
])
|
|
|
|
if bind_host_cwd:
|
|
logger.info(f"Mounting configured host cwd to /workspace: {host_cwd_abs}")
|
|
volume_args = ["-v", f"{host_cwd_abs}:/workspace", *volume_args]
|
|
elif workspace_explicitly_mounted:
|
|
logger.debug("Skipping docker cwd mount: /workspace already mounted by user config")
|
|
|
|
# Mount credential files (OAuth tokens, etc.) declared by skills.
|
|
# Read-only so the container can authenticate but not modify host creds.
|
|
try:
|
|
from tools.credential_files import (
|
|
get_credential_file_mounts,
|
|
get_skills_directory_mount,
|
|
get_cache_directory_mounts,
|
|
)
|
|
|
|
for mount_entry in get_credential_file_mounts():
|
|
volume_args.extend([
|
|
"-v",
|
|
f"{mount_entry['host_path']}:{mount_entry['container_path']}:ro",
|
|
])
|
|
logger.info(
|
|
"Docker: mounting credential %s -> %s",
|
|
mount_entry["host_path"],
|
|
mount_entry["container_path"],
|
|
)
|
|
|
|
# Mount skill directories (local + external) so skill
|
|
# scripts/templates are available inside the container.
|
|
for skills_mount in get_skills_directory_mount():
|
|
volume_args.extend([
|
|
"-v",
|
|
f"{skills_mount['host_path']}:{skills_mount['container_path']}:ro",
|
|
])
|
|
logger.info(
|
|
"Docker: mounting skills dir %s -> %s",
|
|
skills_mount["host_path"],
|
|
skills_mount["container_path"],
|
|
)
|
|
|
|
# Mount host-side cache directories (documents, images, audio,
|
|
# screenshots) so the agent can access uploaded files and other
|
|
# cached media from inside the container. Read-only — the
|
|
# container reads these but the host gateway manages writes.
|
|
for cache_mount in get_cache_directory_mounts():
|
|
volume_args.extend([
|
|
"-v",
|
|
f"{cache_mount['host_path']}:{cache_mount['container_path']}:ro",
|
|
])
|
|
logger.info(
|
|
"Docker: mounting cache dir %s -> %s",
|
|
cache_mount["host_path"],
|
|
cache_mount["container_path"],
|
|
)
|
|
except Exception as e:
|
|
logger.debug("Docker: could not load credential file mounts: %s", e)
|
|
|
|
# Explicit environment variables (docker_env config) — set at container
|
|
# creation so they're available to all processes (including entrypoint).
|
|
env_args = []
|
|
for key in sorted(self._env):
|
|
env_args.extend(["-e", f"{key}={self._env[key]}"])
|
|
|
|
# Optional: run the container as the host user so files written into
|
|
# bind-mounted dirs (/workspace, /root, docker_volumes entries) are
|
|
# owned by that user on the host instead of by root. Skip cleanly on
|
|
# platforms without POSIX uid/gid (e.g. native Windows Docker).
|
|
user_args: list[str] = []
|
|
if run_as_host_user:
|
|
user_spec = _resolve_host_user_spec()
|
|
if user_spec is not None:
|
|
user_args = ["--user", user_spec]
|
|
logger.info("Docker: running container as host user %s", user_spec)
|
|
else:
|
|
logger.warning(
|
|
"docker_run_as_host_user is enabled but this platform does "
|
|
"not expose POSIX uid/gid; container will start as its "
|
|
"image default user."
|
|
)
|
|
# Fall back to the full cap set — without --user, an image's
|
|
# init may still need s6-setuidgid/gosu/su to drop privileges.
|
|
security_args = _build_security_args(run_as_host_user and bool(user_args))
|
|
|
|
logger.info(f"Docker volume_args: {volume_args}")
|
|
# User-supplied extra docker run flags (docker_extra_args in config.yaml).
|
|
# Appended last so they can override defaults if needed.
|
|
validated_extra = []
|
|
for arg in (extra_args or []):
|
|
if not isinstance(arg, str):
|
|
logger.warning("Ignoring non-string docker_extra_args entry: %r", arg)
|
|
continue
|
|
validated_extra.append(arg)
|
|
|
|
all_run_args = (
|
|
security_args
|
|
+ user_args
|
|
+ writable_args
|
|
+ resource_args
|
|
+ volume_args
|
|
+ env_args
|
|
+ validated_extra
|
|
)
|
|
logger.info(f"Docker run_args: {all_run_args}")
|
|
|
|
# Resolve the docker executable once so it works even when
|
|
# /usr/local/bin is not in PATH (common on macOS gateway/service).
|
|
self._docker_exe = find_docker() or "docker"
|
|
|
|
# Start the container directly via `docker run -d`.
|
|
container_name = f"hermes-{uuid.uuid4().hex[:8]}"
|
|
# Labels make hermes-created containers identifiable to:
|
|
# * the orphan reaper (`hermes-agent=1` for the global sweep filter)
|
|
# * future cross-process reuse (`hermes-task-id`, `hermes-profile`)
|
|
# * operators running `docker ps --filter label=hermes-agent=1`
|
|
# Values are limited to the safe character set defined by
|
|
# _sanitize_label_value(); the active Hermes profile is captured at
|
|
# container-start time and never changes for the container's lifetime.
|
|
profile_name = _sanitize_label_value(_get_active_profile_name())
|
|
task_label = _sanitize_label_value(task_id)
|
|
label_args = [
|
|
"--label", "hermes-agent=1",
|
|
"--label", f"hermes-task-id={task_label}",
|
|
"--label", f"hermes-profile={profile_name}",
|
|
]
|
|
self._labels = {
|
|
"hermes-agent": "1",
|
|
"hermes-task-id": task_label,
|
|
"hermes-profile": profile_name,
|
|
}
|
|
|
|
# Cross-process reuse (issue #20561 — docs claim "ONE long-lived
|
|
# container shared across sessions"). If a prior Hermes process
|
|
# already started a container for this (task_id, profile) and it
|
|
# still exists, attach to it instead of starting a fresh one. This
|
|
# restores the documented contract; opt out via
|
|
# ``terminal.docker_persist_across_processes: false``.
|
|
#
|
|
# Reuse matches on labels only — we deliberately do NOT compare image
|
|
# / mounts / resources. Operators who need a fresh container after
|
|
# changing those settings should set ``docker_persist_across_processes:
|
|
# false`` (or run ``docker rm -f`` against the labeled container) to
|
|
# force a clean start.
|
|
reused = False
|
|
if persist_across_processes:
|
|
existing = self._find_reusable_container(task_label, profile_name)
|
|
if existing is not None:
|
|
container_id, state = existing
|
|
self._container_id = container_id
|
|
if state != "running":
|
|
try:
|
|
subprocess.run(
|
|
[self._docker_exe, "start", container_id],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=30,
|
|
check=True,
|
|
)
|
|
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
|
|
logger.warning(
|
|
"Failed to start existing container %s (state=%s): "
|
|
"%s — falling back to a fresh container.",
|
|
container_id[:12], state, e,
|
|
)
|
|
self._container_id = None
|
|
if self._container_id:
|
|
logger.info(
|
|
"Reusing container %s (task=%s, profile=%s, prior state=%s)",
|
|
container_id[:12], task_label, profile_name, state,
|
|
)
|
|
reused = True
|
|
|
|
if not reused:
|
|
run_cmd = [
|
|
self._docker_exe, "run", "-d",
|
|
"--init", # tini/catatonit as PID 1 — reaps zombie children
|
|
"--name", container_name,
|
|
*label_args,
|
|
"-w", cwd,
|
|
*all_run_args,
|
|
image,
|
|
"sleep", "infinity", # no fixed lifetime — idle reaper handles cleanup
|
|
]
|
|
logger.debug(f"Starting container: {' '.join(run_cmd)}")
|
|
result = subprocess.run(
|
|
run_cmd,
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=120, # image pull may take a while
|
|
check=True,
|
|
)
|
|
self._container_id = result.stdout.strip()
|
|
logger.info(f"Started container {container_name} ({self._container_id[:12]})")
|
|
|
|
# Build the init-time env forwarding args (used only by init_session
|
|
# to inject host env vars into the snapshot; subsequent commands get
|
|
# them from the snapshot file).
|
|
self._init_env_args = self._build_init_env_args()
|
|
|
|
# Initialize session snapshot inside the container
|
|
self.init_session()
|
|
|
|
def _build_init_env_args(self) -> list[str]:
|
|
"""Build -e KEY=VALUE args for injecting host env vars into init_session.
|
|
|
|
These are used once during init_session() so that export -p captures
|
|
them into the snapshot. Subsequent execute() calls don't need -e flags.
|
|
"""
|
|
exec_env: dict[str, str] = dict(self._env)
|
|
|
|
explicit_forward_keys = set(self._forward_env)
|
|
passthrough_keys: set[str] = set()
|
|
try:
|
|
from tools.env_passthrough import get_all_passthrough
|
|
passthrough_keys = set(get_all_passthrough())
|
|
except Exception:
|
|
pass
|
|
# Explicit docker_forward_env entries are an intentional opt-in and must
|
|
# win over the generic Hermes secret blocklist. Only implicit passthrough
|
|
# keys are filtered.
|
|
forward_keys = explicit_forward_keys | (passthrough_keys - _HERMES_PROVIDER_ENV_BLOCKLIST)
|
|
hermes_env = _load_hermes_env_vars() if forward_keys else {}
|
|
for key in sorted(forward_keys):
|
|
value = os.getenv(key)
|
|
if value is None:
|
|
value = hermes_env.get(key)
|
|
if value is not None:
|
|
exec_env[key] = value
|
|
|
|
args = []
|
|
for key in sorted(exec_env):
|
|
args.extend(["-e", f"{key}={exec_env[key]}"])
|
|
return args
|
|
|
|
def _run_bash(self, cmd_string: str, *, login: bool = False,
|
|
timeout: int = 120,
|
|
stdin_data: str | None = None) -> subprocess.Popen:
|
|
"""Spawn a bash process inside the Docker container."""
|
|
assert self._container_id, "Container not started"
|
|
cmd = [self._docker_exe, "exec"]
|
|
if stdin_data is not None:
|
|
cmd.append("-i")
|
|
|
|
# Only inject -e env args during init_session (login=True).
|
|
# Subsequent commands get env vars from the snapshot.
|
|
if login:
|
|
cmd.extend(self._init_env_args)
|
|
|
|
cmd.extend([self._container_id])
|
|
|
|
if login:
|
|
cmd.extend(["bash", "-l", "-c", cmd_string])
|
|
else:
|
|
cmd.extend(["bash", "-c", cmd_string])
|
|
|
|
return _popen_bash(cmd, stdin_data)
|
|
|
|
@staticmethod
|
|
def _storage_opt_supported() -> bool:
|
|
"""Check if Docker's storage driver supports --storage-opt size=.
|
|
|
|
Only overlay2 on XFS with pquota supports per-container disk quotas.
|
|
Ubuntu (and most distros) default to ext4, where this flag errors out.
|
|
"""
|
|
global _storage_opt_ok
|
|
if _storage_opt_ok is not None:
|
|
return _storage_opt_ok
|
|
try:
|
|
docker = find_docker() or "docker"
|
|
result = subprocess.run(
|
|
[docker, "info", "--format", "{{.Driver}}"],
|
|
capture_output=True, text=True, timeout=10,
|
|
)
|
|
driver = result.stdout.strip().lower()
|
|
if driver != "overlay2":
|
|
_storage_opt_ok = False
|
|
return False
|
|
# overlay2 only supports storage-opt on XFS with pquota.
|
|
# Probe by attempting a dry-ish run — the fastest reliable check.
|
|
probe = subprocess.run(
|
|
[docker, "create", "--storage-opt", "size=1m", "hello-world"],
|
|
capture_output=True, text=True, timeout=15,
|
|
)
|
|
if probe.returncode == 0:
|
|
# Clean up the created container
|
|
container_id = probe.stdout.strip()
|
|
if container_id:
|
|
subprocess.run([docker, "rm", container_id],
|
|
capture_output=True, timeout=5)
|
|
_storage_opt_ok = True
|
|
else:
|
|
_storage_opt_ok = False
|
|
except Exception:
|
|
_storage_opt_ok = False
|
|
logger.debug("Docker --storage-opt support: %s", _storage_opt_ok)
|
|
return _storage_opt_ok
|
|
|
|
def _find_reusable_container(self, task_label: str, profile_label: str) -> Optional[tuple[str, str]]:
|
|
"""Look for an existing container labeled for this (task, profile).
|
|
|
|
Returns ``(container_id, state)`` on hit, ``None`` on miss / on any
|
|
failure (including ``docker ps`` itself failing). State is one of the
|
|
values Docker reports via ``{{.State}}`` — e.g. ``running``, ``exited``,
|
|
``created``, ``paused``, ``restarting``, ``dead``. The caller decides
|
|
whether the state warrants ``docker start`` before reuse.
|
|
|
|
Restricted to the docker-stored label set this class creates; never
|
|
matches containers that happened to be named ``hermes-*`` but were
|
|
started by some other tool.
|
|
"""
|
|
try:
|
|
result = subprocess.run(
|
|
[
|
|
self._docker_exe, "ps", "-a",
|
|
"--filter", "label=hermes-agent=1",
|
|
"--filter", f"label=hermes-task-id={task_label}",
|
|
"--filter", f"label=hermes-profile={profile_label}",
|
|
"--format", "{{.ID}}\t{{.State}}",
|
|
],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=10,
|
|
check=False,
|
|
)
|
|
except (subprocess.TimeoutExpired, OSError) as e:
|
|
logger.debug("docker ps probe failed: %s — will start a fresh container", e)
|
|
return None
|
|
if result.returncode != 0:
|
|
logger.debug(
|
|
"docker ps probe returned %d: %s — will start a fresh container",
|
|
result.returncode, result.stderr.strip(),
|
|
)
|
|
return None
|
|
lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()]
|
|
if not lines:
|
|
return None
|
|
# Multiple matches are unusual (one (task, profile) should produce one
|
|
# container) but can happen if a previous Hermes process crashed
|
|
# mid-cleanup. Prefer a running one if present; otherwise pick the
|
|
# first listed. Stale duplicates get reaped by the orphan-reaper in a
|
|
# follow-up commit; we don't try to be heroic about them here.
|
|
running = None
|
|
first = None
|
|
for ln in lines:
|
|
parts = ln.split("\t", 1)
|
|
if len(parts) != 2:
|
|
continue
|
|
cid, state = parts[0], parts[1].lower()
|
|
if first is None:
|
|
first = (cid, state)
|
|
if state == "running" and running is None:
|
|
running = (cid, state)
|
|
return running or first
|
|
|
|
def cleanup(self):
|
|
"""Stop (and optionally remove) the container.
|
|
|
|
Behavior depends on ``persist_across_processes`` (init kwarg):
|
|
|
|
* **True** (default) — only ``docker stop`` so the container is
|
|
available for reuse by the next Hermes process. The orphan-reaper
|
|
eventually removes it if no subsequent process picks it up.
|
|
* **False** — ``docker stop`` followed by ``docker rm -f``, regardless
|
|
of ``persistent_filesystem``. The previous ``rm`` path was gated on
|
|
``not self._persistent`` which meant ``container_persistent: true``
|
|
users (the default) leaked Exited containers forever (issue #20561).
|
|
|
|
Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls,
|
|
not the previous fire-and-forget ``Popen(... &)`` shell construct.
|
|
That pattern raced with parent-process exit and silently dropped
|
|
cleanup work when the parent didn't outlive the detached shell — the
|
|
primary mechanism behind Exited-container accumulation under SIGTERM
|
|
/ ``hermes /quit`` / dead terminals.
|
|
"""
|
|
container_id = self._container_id
|
|
if not container_id:
|
|
# Still drop the bind-mount dirs if any were allocated.
|
|
if not self._persistent:
|
|
for d in (self._workspace_dir, self._home_dir):
|
|
if d:
|
|
shutil.rmtree(d, ignore_errors=True)
|
|
return
|
|
|
|
# Capture state needed by the worker before we null out the attrs —
|
|
# the worker thread can outlive ``self``.
|
|
docker_exe = self._docker_exe
|
|
should_remove = not self._persist_across_processes
|
|
log_id = container_id[:12]
|
|
|
|
def _do_cleanup() -> None:
|
|
try:
|
|
subprocess.run(
|
|
[docker_exe, "stop", "-t", "10", container_id],
|
|
capture_output=True, timeout=30,
|
|
)
|
|
except (subprocess.TimeoutExpired, OSError) as e:
|
|
logger.warning("docker stop %s timed out / failed: %s", log_id, e)
|
|
if should_remove:
|
|
try:
|
|
subprocess.run(
|
|
[docker_exe, "rm", "-f", container_id],
|
|
capture_output=True, timeout=30,
|
|
)
|
|
except (subprocess.TimeoutExpired, OSError) as e:
|
|
logger.warning("docker rm -f %s failed: %s", log_id, e)
|
|
|
|
# Daemon thread: doesn't block interpreter exit (atexit returns
|
|
# promptly), but unlike the old ``Popen(... &)`` shell trick the
|
|
# Python-level join semantics let the thread actually run to
|
|
# completion if the interpreter is still alive. atexit registers
|
|
# ``_atexit_cleanup`` in terminal_tool.py which waits up to ~60s for
|
|
# outstanding cleanups, so most exits complete the work cleanly.
|
|
import threading
|
|
t = threading.Thread(target=_do_cleanup, daemon=True, name=f"hermes-cleanup-{log_id}")
|
|
t.start()
|
|
self._cleanup_thread = t
|
|
self._container_id = None
|
|
|
|
if not self._persistent:
|
|
for d in (self._workspace_dir, self._home_dir):
|
|
if d:
|
|
shutil.rmtree(d, ignore_errors=True)
|
|
|
|
def wait_for_cleanup(self, timeout: float = 30.0) -> bool:
|
|
"""Block up to *timeout* seconds for the cleanup worker thread.
|
|
|
|
Returns ``True`` if the thread finished (or no thread was started),
|
|
``False`` on timeout. The atexit hook in terminal_tool.py calls this
|
|
on every active environment so docker stop/rm actually completes
|
|
before the Python process exits — without this, ``hermes /quit``
|
|
races the interpreter shutdown and leaves stopped containers behind.
|
|
"""
|
|
thread = getattr(self, "_cleanup_thread", None)
|
|
if thread is None or not thread.is_alive():
|
|
return True
|
|
thread.join(timeout=timeout)
|
|
return not thread.is_alive()
|