fix(docker): reuse containers across processes + fix cleanup leaks

The Docker backend docs claim "Single persistent container — ONE long-
lived container shared across sessions, /new, /reset, and delegate_task
subagents. Stopped/removed on shutdown." In practice the code only
honored that contract within a single Python process via the in-memory
\`_active_environments[task_id]\` cache. Every \`hermes chat\` invocation
spawned a fresh \`hermes-<hex>\` container; older containers piled up in
\`Exited\` state and accumulated until manual \`docker rm\` (issue #20561).

Three root causes, all addressed by this commit:

1. No cross-process container discovery.
2. \`cleanup()\` used fire-and-forget \`subprocess.Popen("... &", shell=True)\`
   which raced with parent-process exit — when Python exited promptly the
   detached shell child got killed mid-\`docker stop\`, leaving stopped
   containers behind.
3. The \`docker rm\` step in cleanup was gated on \`not self._persistent\`
   (the bind-mount-persistence flag). Default config sets
   \`container_persistent: true\`, so the default happy path skipped \`rm\`
   entirely — even when the user explicitly didn't want cross-process
   reuse, containers leaked.

Fix:

* Add \`DockerEnvironment.__init__(persist_across_processes=True)\`. When
  true, init probes
  \`docker ps -a --filter label=hermes-agent=1
                  --filter label=hermes-task-id=<task>
                  --filter label=hermes-profile=<profile>\`
  and reuses a matching container (running → attach; stopped →
  \`docker start\` → attach; \`docker start\` failure → fall through to a
  fresh \`docker run\`). Multiple matches prefer the running one, with the
  stragglers left for the orphan reaper (next commit) to clean up.

* Rewrite \`cleanup()\`. Uses \`subprocess.run(..., timeout=30)\` on a
  daemon \`threading.Thread\`, not the racy \`Popen(... &)\`. The
  \`_persistent\` guard is dropped on the \`rm\` step — \`rm\` now runs
  whenever \`persist_across_processes\` is false, regardless of the
  bind-mount-persistence setting. The leak class is gone in all
  combinations.

* Add \`wait_for_cleanup(timeout)\`. \`tools/terminal_tool.py\`'s atexit
  hook calls this on every active env, blocking up to 15s for the
  cleanup thread before interpreter exit. Without this, \`hermes /quit\`
  raced the daemon-thread teardown and dropped the stop/rm work.

* New config \`terminal.docker_persist_across_processes\` (default
  \`true\` — restores the documented contract). Set \`false\` for hard
  per-process isolation. Wired through all four config-bridge sites
  (cli.py env_mappings, gateway/run.py _terminal_env_map,
  hermes_cli/config.py _config_to_env_sync, tests/conftest.py env-strip
  list); regression-pinned by
  \`test_docker_persist_across_processes_is_bridged_everywhere\` matching
  the existing pattern for docker_run_as_host_user / docker_env.

Reuse intentionally does NOT compare image / mounts / resources — only
the labels. Operators changing those settings should set
\`docker_persist_across_processes: false\` (or \`docker rm -f\` the
labeled container) to force a fresh start. This keeps the probe cheap
and the failure mode obvious.

Coverage: 12 new unit tests in tests/tools/test_docker_environment.py
covering reuse paths (running, stopped, fallback, opt-out, duplicate
preference) and cleanup behavior (persist-mode no-rm, opt-out always-rm,
no-Popen, wait_for_cleanup semantics, partial-init safety). Plus one
config-bridge regression pin.

Refs #20561
This commit is contained in:
Ben 2026-05-28 14:00:26 +10:00 committed by Ben Barclay
parent 8d129d013b
commit ac8e238bc8
8 changed files with 612 additions and 51 deletions

1
cli.py
View file

@ -576,6 +576,7 @@ def load_cli_config() -> Dict[str, Any]:
"docker_env": "TERMINAL_DOCKER_ENV",
"docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
"docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
"docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
"sandbox_dir": "TERMINAL_SANDBOX_DIR",
# Persistent shell (non-local backends)
"persistent_shell": "TERMINAL_PERSISTENT_SHELL",

View file

@ -831,6 +831,7 @@ if _config_path.exists():
"docker_env": "TERMINAL_DOCKER_ENV",
"docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
"docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
"docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
"sandbox_dir": "TERMINAL_SANDBOX_DIR",
"persistent_shell": "TERMINAL_PERSISTENT_SHELL",
}

View file

@ -5551,6 +5551,7 @@ def set_config_value(key: str, value: str):
"terminal.daytona_image": "TERMINAL_DAYTONA_IMAGE",
"terminal.docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
"terminal.docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
"terminal.docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
"terminal.docker_env": "TERMINAL_DOCKER_ENV",
# terminal.cwd intentionally excluded — CLI resolves at runtime,
# gateway bridges it in gateway/run.py. Persisting to .env causes

View file

@ -227,6 +227,7 @@ _HERMES_BEHAVIORAL_VARS = frozenset({
"TERMINAL_CONTAINER_DISK",
"TERMINAL_CONTAINER_MEMORY",
"TERMINAL_CONTAINER_PERSISTENT",
"TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
"TERMINAL_DOCKER_RUN_AS_HOST_USER",
"BROWSER_CDP_URL",
"CAMOFOX_URL",

View file

@ -203,25 +203,43 @@ def test_auto_mount_replaces_persistent_workspace_bind(monkeypatch, tmp_path):
def test_non_persistent_cleanup_removes_container(monkeypatch):
"""When persistent=false, cleanup() must schedule docker stop + rm."""
"""When persist_across_processes=false, cleanup() must docker stop AND
docker rm so containers don't leak across hermes processes.
Updated for issue #20561: the previous implementation used fire-and-forget
``subprocess.Popen("... &", shell=True)`` which raced with parent exit;
the new implementation uses ``subprocess.run`` on a daemon thread with
bounded timeouts. See test_cleanup_with_persist_disabled_stops_and_rms
for the full behavior contract.
"""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
calls = _mock_subprocess_run(monkeypatch)
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
_mock_subprocess_run(monkeypatch)
# Run the worker thread synchronously so assertions can observe its work.
import threading
monkeypatch.setattr(threading, "Thread", _FakeThread)
popen_cmds = []
monkeypatch.setattr(
docker_env.subprocess, "Popen",
lambda cmd, **kw: (popen_cmds.append(cmd), type("P", (), {"poll": lambda s: 0, "wait": lambda s, **k: None, "returncode": 0, "stdout": iter([]), "stdin": None})())[1],
env = docker_env.DockerEnvironment(
image="python:3.11", cwd="/root", timeout=60,
task_id="ephemeral-task", persistent_filesystem=False,
persist_across_processes=False,
)
env = _make_dummy_env(persistent_filesystem=False, task_id="ephemeral-task")
assert env._container_id
container_id = env._container_id
assert container_id
# Capture cleanup-time docker calls (everything before this was init).
cleanup_calls = []
real_run = docker_env.subprocess.run
def _capture(cmd, **kw):
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kw))
return real_run(cmd, **kw)
monkeypatch.setattr(docker_env.subprocess, "run", _capture)
env.cleanup()
# Should have stop and rm calls via Popen
stop_cmds = [c for c in popen_cmds if container_id in str(c) and "stop" in str(c)]
assert len(stop_cmds) >= 1, f"cleanup() should schedule docker stop for {container_id}"
stops = [c for c in cleanup_calls if isinstance(c[0], list) and c[0][1:2] == ["stop"]]
assert stops, f"cleanup() should docker stop {container_id}; got {cleanup_calls}"
class _FakePopen:
@ -622,3 +640,338 @@ def test_labels_attribute_populated_after_init(monkeypatch):
"hermes-task-id": "abc",
"hermes-profile": "default",
}
# ── Cross-process container reuse (issue #20561) ──────────────────
def _mock_subprocess_run_with_reuse(monkeypatch, ps_state: str | None,
start_succeeds: bool = True):
"""Reuse-aware subprocess.run mock.
``ps_state`` controls what ``docker ps -a --filter ...`` returns:
* ``None`` no match (empty stdout). Forces a fresh ``docker run``.
* ``"running"`` / ``"exited"`` / ... emit ``CID\\tSTATE`` so the reuse
path picks it up. ``"running"`` skips ``docker start``; other states
trigger ``docker start`` (which can be forced to fail via
``start_succeeds=False``).
Returns the captured call list so the test can verify which docker
commands actually ran.
"""
calls = []
def _run(cmd, **kwargs):
calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
if isinstance(cmd, list) and len(cmd) >= 2:
sub = cmd[1]
if sub == "version":
return subprocess.CompletedProcess(cmd, 0, stdout="Docker version", stderr="")
if sub == "ps":
if ps_state is None:
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
return subprocess.CompletedProcess(
cmd, 0, stdout=f"reused-cid\t{ps_state}\n", stderr="",
)
if sub == "start":
if not start_succeeds:
# Real subprocess.run with check=True raises on non-zero exit;
# mirror that so the production code's except clause fires.
raise subprocess.CalledProcessError(1, cmd, output="", stderr="no such container")
return subprocess.CompletedProcess(cmd, 0, stdout="reused-cid\n", stderr="")
if sub == "run":
return subprocess.CompletedProcess(cmd, 0, stdout="fresh-cid\n", stderr="")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
monkeypatch.setattr(docker_env.subprocess, "run", _run)
return calls
def test_reuse_attaches_to_running_container_without_docker_run(monkeypatch):
"""When a labeled container is already ``running``, the reuse probe
must pick it up and skip ``docker run`` entirely. Regression for the
issue #20561 root cause: every Hermes process spawning a new container
despite docs claiming "ONE long-lived container shared across sessions"."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
calls = _mock_subprocess_run_with_reuse(monkeypatch, ps_state="running")
env = _make_dummy_env(task_id="reuse-test")
# The reuse path must populate _container_id from the ps probe output.
assert env._container_id == "reused-cid", (
f"expected reused container id, got {env._container_id!r}"
)
# And it must NOT have run `docker run`.
run_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "run"]
assert not run_invocations, (
f"docker run should be skipped on reuse, got: {run_invocations}"
)
# And it must have NOT issued a `docker start` for an already-running container.
start_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "start"]
assert not start_invocations, (
f"docker start should be skipped when container already running, got: {start_invocations}"
)
def test_reuse_starts_stopped_container_before_attaching(monkeypatch):
"""A labeled container in ``exited`` state must be restarted via
``docker start`` before the new Hermes process uses it. Without this
step, ``docker exec`` against a stopped container errors out and the
first agent command fails opaquely."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
calls = _mock_subprocess_run_with_reuse(monkeypatch, ps_state="exited")
env = _make_dummy_env(task_id="reuse-stopped")
assert env._container_id == "reused-cid"
start_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "start"]
assert start_invocations, "expected docker start for exited container"
run_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "run"]
assert not run_invocations, "should not docker run when reusing an exited container"
def test_reuse_falls_back_to_fresh_run_when_start_fails(monkeypatch):
"""If ``docker start`` on the matched container fails (container was
removed between probe and start, daemon paused, etc.), the code must
silently fall through to a fresh ``docker run`` rather than leaving the
user with a broken environment. Defensive recovery the probe is best-
effort, not authoritative."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
calls = _mock_subprocess_run_with_reuse(
monkeypatch, ps_state="exited", start_succeeds=False,
)
env = _make_dummy_env(task_id="reuse-broken-start")
# docker start should be attempted then fail; code falls through to run.
assert env._container_id == "fresh-cid", (
f"expected fresh container id after fallback, got {env._container_id!r}"
)
run_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "run"]
assert run_invocations, "fallback to fresh docker run must happen on start failure"
def test_no_reuse_when_persist_across_processes_disabled(monkeypatch):
"""Opt-out path: ``persist_across_processes=False`` skips the ps probe
entirely and always starts a fresh container, matching the pre-fix
behavior for users who want hard per-process isolation."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
# ps_state=running would trigger reuse if the probe ran — assert it doesn't.
calls = _mock_subprocess_run_with_reuse(monkeypatch, ps_state="running")
env = docker_env.DockerEnvironment(
image="python:3.11", cwd="/root", timeout=60,
task_id="no-reuse", persist_across_processes=False,
)
# Must NOT have issued docker ps (the probe is gated by the flag).
ps_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "ps"]
assert not ps_invocations, (
f"docker ps probe should be skipped when persist_across_processes=False, got: {ps_invocations}"
)
# Should have started a fresh container.
assert env._container_id == "fresh-cid"
def test_find_reusable_container_prefers_running_over_stopped(monkeypatch):
"""When the probe returns multiple matches (shouldn't normally happen,
but can after a crash leaves stale duplicates), a ``running`` container
is preferred over any stopped one. The duplicate gets reaped later by
the orphan reaper; we don't try to be heroic about it here."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
def _run(cmd, **kwargs):
if isinstance(cmd, list) and len(cmd) >= 2:
if cmd[1] == "version":
return subprocess.CompletedProcess(cmd, 0, stdout="ok", stderr="")
if cmd[1] == "ps":
# Two matches: stopped first, running second.
return subprocess.CompletedProcess(
cmd, 0,
stdout="stopped-cid\texited\nrunning-cid\trunning\n",
stderr="",
)
return subprocess.CompletedProcess(cmd, 0, stdout="fresh-cid\n", stderr="")
monkeypatch.setattr(docker_env.subprocess, "run", _run)
env = _make_dummy_env(task_id="dup-match")
assert env._container_id == "running-cid", (
f"running container should win over stopped duplicate, got {env._container_id!r}"
)
# ── Cleanup correctness (issue #20561) ────────────────────────────
class _FakeThread:
"""Stand-in for threading.Thread that captures target/args and calls
target() synchronously when .start() runs, so cleanup behavior is
observable without actually backgrounding subprocess calls."""
def __init__(self, target=None, daemon=None, name=None):
self._target = target
self.daemon = daemon
self.name = name
self._done = False
def start(self):
if self._target is not None:
self._target()
self._done = True
def is_alive(self):
return not self._done
def join(self, timeout=None):
self._done = True
def _install_fake_thread(monkeypatch):
import threading
monkeypatch.setattr(threading, "Thread", _FakeThread)
def test_cleanup_with_persist_only_stops_no_rm(monkeypatch):
"""``persist_across_processes=True`` (default) cleanup must docker stop
the container but NEVER docker rm the container has to survive so the
next Hermes process can reuse it. Issue #20561 — the previous code
matched this on the `_persistent` flag instead of a dedicated
cross-process flag, which made reuse impossible."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
_mock_subprocess_run(monkeypatch)
_install_fake_thread(monkeypatch)
env = _make_dummy_env(task_id="cleanup-persist", persistent_filesystem=False)
# Default persist_across_processes=True.
container_id = env._container_id
assert container_id
cleanup_calls = []
real_run = docker_env.subprocess.run
def _capturing_run(cmd, **kwargs):
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
return real_run(cmd, **kwargs)
monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
env.cleanup()
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
assert stops, f"expected docker stop call, got cleanup_calls: {cleanup_calls}"
assert not rms, (
f"docker rm must NOT be called when persist_across_processes=True; "
f"reuse would be impossible. Got: {rms}"
)
def test_cleanup_with_persist_disabled_stops_and_rms(monkeypatch):
"""``persist_across_processes=False`` cleanup must docker stop AND docker
rm so containers don't leak. Crucially, this runs regardless of the
``persistent_filesystem`` setting the original code only rm'd when
``not self._persistent``, which meant the default-on ``container_persistent:
true`` users (the documented happy path) leaked Exited containers forever.
Issue #20561 root-cause fix."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
_mock_subprocess_run(monkeypatch)
_install_fake_thread(monkeypatch)
# Note: persistent_filesystem=True (the prior-leak scenario) + the new
# cross-process toggle OFF must still result in a clean rm.
env = docker_env.DockerEnvironment(
image="python:3.11", cwd="/root", timeout=60,
task_id="cleanup-no-persist", persistent_filesystem=True,
persist_across_processes=False,
)
cleanup_calls = []
real_run = docker_env.subprocess.run
def _capturing_run(cmd, **kwargs):
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
return real_run(cmd, **kwargs)
monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
env.cleanup()
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
assert stops, "expected docker stop"
assert rms, (
"docker rm MUST run when persist_across_processes=False, even with "
"persistent_filesystem=True — that gating was the leak source in #20561."
)
def test_cleanup_uses_subprocess_run_not_detached_shell(monkeypatch):
"""The pre-fix code used ``subprocess.Popen(\"... &\", shell=True)`` which
raced with parent-process exit and silently dropped cleanup work. The
new code must use ``subprocess.run`` with bounded ``timeout=`` so the
work actually completes within the process lifetime.
Asserts cleanup never reaches into shell-mode Popen.
"""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
_mock_subprocess_run(monkeypatch)
_install_fake_thread(monkeypatch)
def _forbidden_popen(*args, **kwargs):
raise AssertionError(
f"cleanup must not use subprocess.Popen anymore (issue #20561); "
f"got args={args} kwargs={kwargs}"
)
monkeypatch.setattr(docker_env.subprocess, "Popen", _forbidden_popen)
env = _make_dummy_env(task_id="no-popen-cleanup")
env.cleanup() # must not raise
def test_wait_for_cleanup_returns_true_when_no_thread_started():
"""``wait_for_cleanup`` must be a no-op when ``cleanup`` was never called
(or the env has no live cleanup thread) atexit calls it unconditionally
across all active envs, so a False return would falsely flag healthy
shutdowns."""
env = docker_env.DockerEnvironment.__new__(docker_env.DockerEnvironment)
# No _cleanup_thread set — simulates an env that was never cleanup()'d.
assert env.wait_for_cleanup(timeout=1.0) is True
def test_wait_for_cleanup_after_cleanup_returns_true(monkeypatch):
"""End-to-end: cleanup() starts a thread, wait_for_cleanup() joins it
and reports completion. Atexit relies on this contract to ensure docker
stop/rm actually finishes before the Python interpreter exits."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
_mock_subprocess_run(monkeypatch)
_install_fake_thread(monkeypatch)
env = _make_dummy_env(task_id="wait-test")
env.cleanup()
assert env.wait_for_cleanup(timeout=5.0) is True
def test_cleanup_on_env_with_no_container_id_does_not_raise(monkeypatch):
"""A DockerEnvironment whose ``__init__`` failed before the container_id
was set (image-pull error, docker daemon down) should still be safe to
cleanup() the post-creation failure path in callers always tries.
Without this guard the daemon-down case used to NameError on the cleanup
branch."""
env = docker_env.DockerEnvironment.__new__(docker_env.DockerEnvironment)
env._container_id = None
env._persistent = False
env._workspace_dir = None
env._home_dir = None
# No exception expected.
env.cleanup()

View file

@ -224,3 +224,23 @@ def test_docker_env_is_bridged_everywhere():
assert "docker_env" in _gateway_env_map_keys()
assert "docker_env" in _save_config_env_sync_keys()
assert "TERMINAL_DOCKER_ENV" in _terminal_tool_env_var_names()
def test_docker_persist_across_processes_is_bridged_everywhere():
"""Regression pin for the cross-process container reuse toggle.
``terminal.docker_persist_across_processes`` (issue #20561) controls
whether ``DockerEnvironment.__init__`` probes for and reuses an existing
labeled container at startup, and whether ``cleanup()`` removes the
container on Hermes exit or just stops it (keeping it for the next
process). Same four-bridge invariant as docker_run_as_host_user /
docker_env / docker_mount_cwd_to_workspace drift between any of the
four sites means ``terminal.docker_persist_across_processes: false`` in
config.yaml silently does nothing for that entry point, leaving the
user unable to opt out of the documented "ONE long-lived container
shared across sessions" behavior.
"""
assert "docker_persist_across_processes" in _cli_env_map_keys()
assert "docker_persist_across_processes" in _gateway_env_map_keys()
assert "docker_persist_across_processes" in _save_config_env_sync_keys()
assert "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES" in _terminal_tool_env_var_names()

View file

@ -339,11 +339,13 @@ class DockerEnvironment(BaseEnvironment):
auto_mount_cwd: bool = False,
run_as_host_user: bool = False,
extra_args: list = None,
persist_across_processes: bool = True,
):
if cwd == "~":
cwd = "/root"
super().__init__(cwd=cwd, timeout=timeout)
self._persistent = persistent_filesystem
self._persist_across_processes = persist_across_processes
self._task_id = task_id
self._forward_env = _normalize_forward_env_names(forward_env)
self._env = _normalize_env_dict(env)
@ -561,26 +563,69 @@ class DockerEnvironment(BaseEnvironment):
"hermes-task-id": task_label,
"hermes-profile": profile_name,
}
run_cmd = [
self._docker_exe, "run", "-d",
"--init", # tini/catatonit as PID 1 — reaps zombie children
"--name", container_name,
*label_args,
"-w", cwd,
*all_run_args,
image,
"sleep", "infinity", # no fixed lifetime — idle reaper handles cleanup
]
logger.debug(f"Starting container: {' '.join(run_cmd)}")
result = subprocess.run(
run_cmd,
capture_output=True,
text=True,
timeout=120, # image pull may take a while
check=True,
)
self._container_id = result.stdout.strip()
logger.info(f"Started container {container_name} ({self._container_id[:12]})")
# Cross-process reuse (issue #20561 — docs claim "ONE long-lived
# container shared across sessions"). If a prior Hermes process
# already started a container for this (task_id, profile) and it
# still exists, attach to it instead of starting a fresh one. This
# restores the documented contract; opt out via
# ``terminal.docker_persist_across_processes: false``.
#
# Reuse matches on labels only — we deliberately do NOT compare image
# / mounts / resources. Operators who need a fresh container after
# changing those settings should set ``docker_persist_across_processes:
# false`` (or run ``docker rm -f`` against the labeled container) to
# force a clean start.
reused = False
if persist_across_processes:
existing = self._find_reusable_container(task_label, profile_name)
if existing is not None:
container_id, state = existing
self._container_id = container_id
if state != "running":
try:
subprocess.run(
[self._docker_exe, "start", container_id],
capture_output=True,
text=True,
timeout=30,
check=True,
)
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
logger.warning(
"Failed to start existing container %s (state=%s): "
"%s — falling back to a fresh container.",
container_id[:12], state, e,
)
self._container_id = None
if self._container_id:
logger.info(
"Reusing container %s (task=%s, profile=%s, prior state=%s)",
container_id[:12], task_label, profile_name, state,
)
reused = True
if not reused:
run_cmd = [
self._docker_exe, "run", "-d",
"--init", # tini/catatonit as PID 1 — reaps zombie children
"--name", container_name,
*label_args,
"-w", cwd,
*all_run_args,
image,
"sleep", "infinity", # no fixed lifetime — idle reaper handles cleanup
]
logger.debug(f"Starting container: {' '.join(run_cmd)}")
result = subprocess.run(
run_cmd,
capture_output=True,
text=True,
timeout=120, # image pull may take a while
check=True,
)
self._container_id = result.stdout.strip()
logger.info(f"Started container {container_name} ({self._container_id[:12]})")
# Build the init-time env forwarding args (used only by init_session
# to inject host env vars into the snapshot; subsequent commands get
@ -685,31 +730,143 @@ class DockerEnvironment(BaseEnvironment):
logger.debug("Docker --storage-opt support: %s", _storage_opt_ok)
return _storage_opt_ok
def cleanup(self):
"""Stop and remove the container. Bind-mount dirs persist if persistent=True."""
if self._container_id:
try:
# Stop in background so cleanup doesn't block
stop_cmd = (
f"(timeout 60 {self._docker_exe} stop {self._container_id} || "
f"{self._docker_exe} rm -f {self._container_id}) >/dev/null 2>&1 &"
)
subprocess.Popen(stop_cmd, shell=True)
except Exception as e:
logger.warning("Failed to stop container %s: %s", self._container_id, e)
def _find_reusable_container(self, task_label: str, profile_label: str) -> Optional[tuple[str, str]]:
"""Look for an existing container labeled for this (task, profile).
Returns ``(container_id, state)`` on hit, ``None`` on miss / on any
failure (including ``docker ps`` itself failing). State is one of the
values Docker reports via ``{{.State}}`` e.g. ``running``, ``exited``,
``created``, ``paused``, ``restarting``, ``dead``. The caller decides
whether the state warrants ``docker start`` before reuse.
Restricted to the docker-stored label set this class creates; never
matches containers that happened to be named ``hermes-*`` but were
started by some other tool.
"""
try:
result = subprocess.run(
[
self._docker_exe, "ps", "-a",
"--filter", "label=hermes-agent=1",
"--filter", f"label=hermes-task-id={task_label}",
"--filter", f"label=hermes-profile={profile_label}",
"--format", "{{.ID}}\t{{.State}}",
],
capture_output=True,
text=True,
timeout=10,
check=False,
)
except (subprocess.TimeoutExpired, OSError) as e:
logger.debug("docker ps probe failed: %s — will start a fresh container", e)
return None
if result.returncode != 0:
logger.debug(
"docker ps probe returned %d: %s — will start a fresh container",
result.returncode, result.stderr.strip(),
)
return None
lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()]
if not lines:
return None
# Multiple matches are unusual (one (task, profile) should produce one
# container) but can happen if a previous Hermes process crashed
# mid-cleanup. Prefer a running one if present; otherwise pick the
# first listed. Stale duplicates get reaped by the orphan-reaper in a
# follow-up commit; we don't try to be heroic about them here.
running = None
first = None
for ln in lines:
parts = ln.split("\t", 1)
if len(parts) != 2:
continue
cid, state = parts[0], parts[1].lower()
if first is None:
first = (cid, state)
if state == "running" and running is None:
running = (cid, state)
return running or first
def cleanup(self):
"""Stop (and optionally remove) the container.
Behavior depends on ``persist_across_processes`` (init kwarg):
* **True** (default) only ``docker stop`` so the container is
available for reuse by the next Hermes process. The orphan-reaper
eventually removes it if no subsequent process picks it up.
* **False** ``docker stop`` followed by ``docker rm -f``, regardless
of ``persistent_filesystem``. The previous ``rm`` path was gated on
``not self._persistent`` which meant ``container_persistent: true``
users (the default) leaked Exited containers forever (issue #20561).
Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls,
not the previous fire-and-forget ``Popen(... &)`` shell construct.
That pattern raced with parent-process exit and silently dropped
cleanup work when the parent didn't outlive the detached shell — the
primary mechanism behind Exited-container accumulation under SIGTERM
/ ``hermes /quit`` / dead terminals.
"""
container_id = self._container_id
if not container_id:
# Still drop the bind-mount dirs if any were allocated.
if not self._persistent:
# Also schedule removal (stop only leaves it as stopped)
for d in (self._workspace_dir, self._home_dir):
if d:
shutil.rmtree(d, ignore_errors=True)
return
# Capture state needed by the worker before we null out the attrs —
# the worker thread can outlive ``self``.
docker_exe = self._docker_exe
should_remove = not self._persist_across_processes
log_id = container_id[:12]
def _do_cleanup() -> None:
try:
subprocess.run(
[docker_exe, "stop", "-t", "10", container_id],
capture_output=True, timeout=30,
)
except (subprocess.TimeoutExpired, OSError) as e:
logger.warning("docker stop %s timed out / failed: %s", log_id, e)
if should_remove:
try:
subprocess.Popen(
f"sleep 3 && {self._docker_exe} rm -f {self._container_id} >/dev/null 2>&1 &",
shell=True,
subprocess.run(
[docker_exe, "rm", "-f", container_id],
capture_output=True, timeout=30,
)
except Exception:
pass
self._container_id = None
except (subprocess.TimeoutExpired, OSError) as e:
logger.warning("docker rm -f %s failed: %s", log_id, e)
# Daemon thread: doesn't block interpreter exit (atexit returns
# promptly), but unlike the old ``Popen(... &)`` shell trick the
# Python-level join semantics let the thread actually run to
# completion if the interpreter is still alive. atexit registers
# ``_atexit_cleanup`` in terminal_tool.py which waits up to ~60s for
# outstanding cleanups, so most exits complete the work cleanly.
import threading
t = threading.Thread(target=_do_cleanup, daemon=True, name=f"hermes-cleanup-{log_id}")
t.start()
self._cleanup_thread = t
self._container_id = None
if not self._persistent:
for d in (self._workspace_dir, self._home_dir):
if d:
shutil.rmtree(d, ignore_errors=True)
def wait_for_cleanup(self, timeout: float = 30.0) -> bool:
"""Block up to *timeout* seconds for the cleanup worker thread.
Returns ``True`` if the thread finished (or no thread was started),
``False`` on timeout. The atexit hook in terminal_tool.py calls this
on every active environment so docker stop/rm actually completes
before the Python process exits without this, ``hermes /quit``
races the interpreter shutdown and leaves stopped containers behind.
"""
thread = getattr(self, "_cleanup_thread", None)
if thread is None or not thread.is_alive():
return True
thread.join(timeout=timeout)
return not thread.is_alive()

View file

@ -1024,6 +1024,15 @@ def _get_env_config() -> Dict[str, Any]:
"docker_env": _parse_env_var("TERMINAL_DOCKER_ENV", "{}", json.loads, "valid JSON"),
"docker_run_as_host_user": os.getenv("TERMINAL_DOCKER_RUN_AS_HOST_USER", "false").lower() in {"true", "1", "yes"},
"docker_extra_args": _parse_env_var("TERMINAL_DOCKER_EXTRA_ARGS", "[]", json.loads, "valid JSON"),
# Cross-process container reuse (issue #20561). The docs claim
# "ONE long-lived container shared across sessions" — this toggle
# makes that real by probing for a labeled container at startup and
# attaching to it instead of always starting a fresh one. Set to
# ``false`` for hard per-process isolation (no reuse, container is
# removed on exit).
"docker_persist_across_processes": os.getenv(
"TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES", "true"
).lower() in {"true", "1", "yes"},
}
@ -1083,6 +1092,7 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int,
env=docker_env,
run_as_host_user=cc.get("docker_run_as_host_user", False),
extra_args=docker_extra_args,
persist_across_processes=cc.get("docker_persist_across_processes", True),
)
elif env_type == "singularity":
@ -1378,7 +1388,23 @@ def _atexit_cleanup():
if _active_environments:
count = len(_active_environments)
logger.info("Shutting down %d remaining sandbox(es)...", count)
# Snapshot the env objects BEFORE cleanup_all_environments empties
# the dict; we need them to wait on docker cleanup threads after the
# registry has been cleared.
envs_to_wait = list(_active_environments.values())
cleanup_all_environments()
# Block briefly so docker stop/rm actually completes before the
# interpreter exits. Issue #20561 — without this join, the daemon
# cleanup threads were getting torn down mid-`docker stop`, leaving
# Exited containers piled up on the host.
for env in envs_to_wait:
wait_fn = getattr(env, "wait_for_cleanup", None)
if wait_fn is None:
continue
try:
wait_fn(timeout=15.0)
except Exception as e: # never block shutdown on a bad backend
logger.debug("wait_for_cleanup raised on exit: %s", e)
atexit.register(_atexit_cleanup)
@ -1746,6 +1772,7 @@ def terminal_tool(
"docker_env": config.get("docker_env", {}),
"docker_run_as_host_user": config.get("docker_run_as_host_user", False),
"docker_extra_args": config.get("docker_extra_args", []),
"docker_persist_across_processes": config.get("docker_persist_across_processes", True),
}
local_config = None