mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
fix(docker): reuse containers across processes + fix cleanup leaks
The Docker backend docs claim "Single persistent container — ONE long- lived container shared across sessions, /new, /reset, and delegate_task subagents. Stopped/removed on shutdown." In practice the code only honored that contract within a single Python process via the in-memory \`_active_environments[task_id]\` cache. Every \`hermes chat\` invocation spawned a fresh \`hermes-<hex>\` container; older containers piled up in \`Exited\` state and accumulated until manual \`docker rm\` (issue #20561). Three root causes, all addressed by this commit: 1. No cross-process container discovery. 2. \`cleanup()\` used fire-and-forget \`subprocess.Popen("... &", shell=True)\` which raced with parent-process exit — when Python exited promptly the detached shell child got killed mid-\`docker stop\`, leaving stopped containers behind. 3. The \`docker rm\` step in cleanup was gated on \`not self._persistent\` (the bind-mount-persistence flag). Default config sets \`container_persistent: true\`, so the default happy path skipped \`rm\` entirely — even when the user explicitly didn't want cross-process reuse, containers leaked. Fix: * Add \`DockerEnvironment.__init__(persist_across_processes=True)\`. When true, init probes \`docker ps -a --filter label=hermes-agent=1 --filter label=hermes-task-id=<task> --filter label=hermes-profile=<profile>\` and reuses a matching container (running → attach; stopped → \`docker start\` → attach; \`docker start\` failure → fall through to a fresh \`docker run\`). Multiple matches prefer the running one, with the stragglers left for the orphan reaper (next commit) to clean up. * Rewrite \`cleanup()\`. Uses \`subprocess.run(..., timeout=30)\` on a daemon \`threading.Thread\`, not the racy \`Popen(... &)\`. The \`_persistent\` guard is dropped on the \`rm\` step — \`rm\` now runs whenever \`persist_across_processes\` is false, regardless of the bind-mount-persistence setting. The leak class is gone in all combinations. * Add \`wait_for_cleanup(timeout)\`. \`tools/terminal_tool.py\`'s atexit hook calls this on every active env, blocking up to 15s for the cleanup thread before interpreter exit. Without this, \`hermes /quit\` raced the daemon-thread teardown and dropped the stop/rm work. * New config \`terminal.docker_persist_across_processes\` (default \`true\` — restores the documented contract). Set \`false\` for hard per-process isolation. Wired through all four config-bridge sites (cli.py env_mappings, gateway/run.py _terminal_env_map, hermes_cli/config.py _config_to_env_sync, tests/conftest.py env-strip list); regression-pinned by \`test_docker_persist_across_processes_is_bridged_everywhere\` matching the existing pattern for docker_run_as_host_user / docker_env. Reuse intentionally does NOT compare image / mounts / resources — only the labels. Operators changing those settings should set \`docker_persist_across_processes: false\` (or \`docker rm -f\` the labeled container) to force a fresh start. This keeps the probe cheap and the failure mode obvious. Coverage: 12 new unit tests in tests/tools/test_docker_environment.py covering reuse paths (running, stopped, fallback, opt-out, duplicate preference) and cleanup behavior (persist-mode no-rm, opt-out always-rm, no-Popen, wait_for_cleanup semantics, partial-init safety). Plus one config-bridge regression pin. Refs #20561
This commit is contained in:
parent
8d129d013b
commit
ac8e238bc8
8 changed files with 612 additions and 51 deletions
1
cli.py
1
cli.py
|
|
@ -576,6 +576,7 @@ def load_cli_config() -> Dict[str, Any]:
|
|||
"docker_env": "TERMINAL_DOCKER_ENV",
|
||||
"docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
|
||||
"docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
|
||||
"docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
|
||||
"sandbox_dir": "TERMINAL_SANDBOX_DIR",
|
||||
# Persistent shell (non-local backends)
|
||||
"persistent_shell": "TERMINAL_PERSISTENT_SHELL",
|
||||
|
|
|
|||
|
|
@ -831,6 +831,7 @@ if _config_path.exists():
|
|||
"docker_env": "TERMINAL_DOCKER_ENV",
|
||||
"docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
|
||||
"docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
|
||||
"docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
|
||||
"sandbox_dir": "TERMINAL_SANDBOX_DIR",
|
||||
"persistent_shell": "TERMINAL_PERSISTENT_SHELL",
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5551,6 +5551,7 @@ def set_config_value(key: str, value: str):
|
|||
"terminal.daytona_image": "TERMINAL_DAYTONA_IMAGE",
|
||||
"terminal.docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
|
||||
"terminal.docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
|
||||
"terminal.docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
|
||||
"terminal.docker_env": "TERMINAL_DOCKER_ENV",
|
||||
# terminal.cwd intentionally excluded — CLI resolves at runtime,
|
||||
# gateway bridges it in gateway/run.py. Persisting to .env causes
|
||||
|
|
|
|||
|
|
@ -227,6 +227,7 @@ _HERMES_BEHAVIORAL_VARS = frozenset({
|
|||
"TERMINAL_CONTAINER_DISK",
|
||||
"TERMINAL_CONTAINER_MEMORY",
|
||||
"TERMINAL_CONTAINER_PERSISTENT",
|
||||
"TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
|
||||
"TERMINAL_DOCKER_RUN_AS_HOST_USER",
|
||||
"BROWSER_CDP_URL",
|
||||
"CAMOFOX_URL",
|
||||
|
|
|
|||
|
|
@ -203,25 +203,43 @@ def test_auto_mount_replaces_persistent_workspace_bind(monkeypatch, tmp_path):
|
|||
|
||||
|
||||
def test_non_persistent_cleanup_removes_container(monkeypatch):
|
||||
"""When persistent=false, cleanup() must schedule docker stop + rm."""
|
||||
"""When persist_across_processes=false, cleanup() must docker stop AND
|
||||
docker rm so containers don't leak across hermes processes.
|
||||
|
||||
Updated for issue #20561: the previous implementation used fire-and-forget
|
||||
``subprocess.Popen("... &", shell=True)`` which raced with parent exit;
|
||||
the new implementation uses ``subprocess.run`` on a daemon thread with
|
||||
bounded timeouts. See test_cleanup_with_persist_disabled_stops_and_rms
|
||||
for the full behavior contract.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
calls = _mock_subprocess_run(monkeypatch)
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
# Run the worker thread synchronously so assertions can observe its work.
|
||||
import threading
|
||||
monkeypatch.setattr(threading, "Thread", _FakeThread)
|
||||
|
||||
popen_cmds = []
|
||||
monkeypatch.setattr(
|
||||
docker_env.subprocess, "Popen",
|
||||
lambda cmd, **kw: (popen_cmds.append(cmd), type("P", (), {"poll": lambda s: 0, "wait": lambda s, **k: None, "returncode": 0, "stdout": iter([]), "stdin": None})())[1],
|
||||
env = docker_env.DockerEnvironment(
|
||||
image="python:3.11", cwd="/root", timeout=60,
|
||||
task_id="ephemeral-task", persistent_filesystem=False,
|
||||
persist_across_processes=False,
|
||||
)
|
||||
|
||||
env = _make_dummy_env(persistent_filesystem=False, task_id="ephemeral-task")
|
||||
assert env._container_id
|
||||
container_id = env._container_id
|
||||
assert container_id
|
||||
|
||||
# Capture cleanup-time docker calls (everything before this was init).
|
||||
cleanup_calls = []
|
||||
real_run = docker_env.subprocess.run
|
||||
|
||||
def _capture(cmd, **kw):
|
||||
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kw))
|
||||
return real_run(cmd, **kw)
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _capture)
|
||||
env.cleanup()
|
||||
|
||||
# Should have stop and rm calls via Popen
|
||||
stop_cmds = [c for c in popen_cmds if container_id in str(c) and "stop" in str(c)]
|
||||
assert len(stop_cmds) >= 1, f"cleanup() should schedule docker stop for {container_id}"
|
||||
stops = [c for c in cleanup_calls if isinstance(c[0], list) and c[0][1:2] == ["stop"]]
|
||||
assert stops, f"cleanup() should docker stop {container_id}; got {cleanup_calls}"
|
||||
|
||||
|
||||
class _FakePopen:
|
||||
|
|
@ -622,3 +640,338 @@ def test_labels_attribute_populated_after_init(monkeypatch):
|
|||
"hermes-task-id": "abc",
|
||||
"hermes-profile": "default",
|
||||
}
|
||||
|
||||
|
||||
# ── Cross-process container reuse (issue #20561) ──────────────────
|
||||
|
||||
|
||||
def _mock_subprocess_run_with_reuse(monkeypatch, ps_state: str | None,
|
||||
start_succeeds: bool = True):
|
||||
"""Reuse-aware subprocess.run mock.
|
||||
|
||||
``ps_state`` controls what ``docker ps -a --filter ...`` returns:
|
||||
* ``None`` → no match (empty stdout). Forces a fresh ``docker run``.
|
||||
* ``"running"`` / ``"exited"`` / ... → emit ``CID\\tSTATE`` so the reuse
|
||||
path picks it up. ``"running"`` skips ``docker start``; other states
|
||||
trigger ``docker start`` (which can be forced to fail via
|
||||
``start_succeeds=False``).
|
||||
|
||||
Returns the captured call list so the test can verify which docker
|
||||
commands actually ran.
|
||||
"""
|
||||
calls = []
|
||||
|
||||
def _run(cmd, **kwargs):
|
||||
calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
|
||||
if isinstance(cmd, list) and len(cmd) >= 2:
|
||||
sub = cmd[1]
|
||||
if sub == "version":
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="Docker version", stderr="")
|
||||
if sub == "ps":
|
||||
if ps_state is None:
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0, stdout=f"reused-cid\t{ps_state}\n", stderr="",
|
||||
)
|
||||
if sub == "start":
|
||||
if not start_succeeds:
|
||||
# Real subprocess.run with check=True raises on non-zero exit;
|
||||
# mirror that so the production code's except clause fires.
|
||||
raise subprocess.CalledProcessError(1, cmd, output="", stderr="no such container")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="reused-cid\n", stderr="")
|
||||
if sub == "run":
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="fresh-cid\n", stderr="")
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _run)
|
||||
return calls
|
||||
|
||||
|
||||
def test_reuse_attaches_to_running_container_without_docker_run(monkeypatch):
|
||||
"""When a labeled container is already ``running``, the reuse probe
|
||||
must pick it up and skip ``docker run`` entirely. Regression for the
|
||||
issue #20561 root cause: every Hermes process spawning a new container
|
||||
despite docs claiming "ONE long-lived container shared across sessions"."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
calls = _mock_subprocess_run_with_reuse(monkeypatch, ps_state="running")
|
||||
|
||||
env = _make_dummy_env(task_id="reuse-test")
|
||||
|
||||
# The reuse path must populate _container_id from the ps probe output.
|
||||
assert env._container_id == "reused-cid", (
|
||||
f"expected reused container id, got {env._container_id!r}"
|
||||
)
|
||||
# And it must NOT have run `docker run`.
|
||||
run_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "run"]
|
||||
assert not run_invocations, (
|
||||
f"docker run should be skipped on reuse, got: {run_invocations}"
|
||||
)
|
||||
# And it must have NOT issued a `docker start` for an already-running container.
|
||||
start_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "start"]
|
||||
assert not start_invocations, (
|
||||
f"docker start should be skipped when container already running, got: {start_invocations}"
|
||||
)
|
||||
|
||||
|
||||
def test_reuse_starts_stopped_container_before_attaching(monkeypatch):
|
||||
"""A labeled container in ``exited`` state must be restarted via
|
||||
``docker start`` before the new Hermes process uses it. Without this
|
||||
step, ``docker exec`` against a stopped container errors out and the
|
||||
first agent command fails opaquely."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
calls = _mock_subprocess_run_with_reuse(monkeypatch, ps_state="exited")
|
||||
|
||||
env = _make_dummy_env(task_id="reuse-stopped")
|
||||
|
||||
assert env._container_id == "reused-cid"
|
||||
start_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "start"]
|
||||
assert start_invocations, "expected docker start for exited container"
|
||||
run_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "run"]
|
||||
assert not run_invocations, "should not docker run when reusing an exited container"
|
||||
|
||||
|
||||
def test_reuse_falls_back_to_fresh_run_when_start_fails(monkeypatch):
|
||||
"""If ``docker start`` on the matched container fails (container was
|
||||
removed between probe and start, daemon paused, etc.), the code must
|
||||
silently fall through to a fresh ``docker run`` rather than leaving the
|
||||
user with a broken environment. Defensive recovery — the probe is best-
|
||||
effort, not authoritative."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
calls = _mock_subprocess_run_with_reuse(
|
||||
monkeypatch, ps_state="exited", start_succeeds=False,
|
||||
)
|
||||
|
||||
env = _make_dummy_env(task_id="reuse-broken-start")
|
||||
|
||||
# docker start should be attempted then fail; code falls through to run.
|
||||
assert env._container_id == "fresh-cid", (
|
||||
f"expected fresh container id after fallback, got {env._container_id!r}"
|
||||
)
|
||||
run_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "run"]
|
||||
assert run_invocations, "fallback to fresh docker run must happen on start failure"
|
||||
|
||||
|
||||
def test_no_reuse_when_persist_across_processes_disabled(monkeypatch):
|
||||
"""Opt-out path: ``persist_across_processes=False`` skips the ps probe
|
||||
entirely and always starts a fresh container, matching the pre-fix
|
||||
behavior for users who want hard per-process isolation."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
# ps_state=running would trigger reuse if the probe ran — assert it doesn't.
|
||||
calls = _mock_subprocess_run_with_reuse(monkeypatch, ps_state="running")
|
||||
|
||||
env = docker_env.DockerEnvironment(
|
||||
image="python:3.11", cwd="/root", timeout=60,
|
||||
task_id="no-reuse", persist_across_processes=False,
|
||||
)
|
||||
|
||||
# Must NOT have issued docker ps (the probe is gated by the flag).
|
||||
ps_invocations = [c for c in calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "ps"]
|
||||
assert not ps_invocations, (
|
||||
f"docker ps probe should be skipped when persist_across_processes=False, got: {ps_invocations}"
|
||||
)
|
||||
# Should have started a fresh container.
|
||||
assert env._container_id == "fresh-cid"
|
||||
|
||||
|
||||
def test_find_reusable_container_prefers_running_over_stopped(monkeypatch):
|
||||
"""When the probe returns multiple matches (shouldn't normally happen,
|
||||
but can after a crash leaves stale duplicates), a ``running`` container
|
||||
is preferred over any stopped one. The duplicate gets reaped later by
|
||||
the orphan reaper; we don't try to be heroic about it here."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
|
||||
def _run(cmd, **kwargs):
|
||||
if isinstance(cmd, list) and len(cmd) >= 2:
|
||||
if cmd[1] == "version":
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="ok", stderr="")
|
||||
if cmd[1] == "ps":
|
||||
# Two matches: stopped first, running second.
|
||||
return subprocess.CompletedProcess(
|
||||
cmd, 0,
|
||||
stdout="stopped-cid\texited\nrunning-cid\trunning\n",
|
||||
stderr="",
|
||||
)
|
||||
return subprocess.CompletedProcess(cmd, 0, stdout="fresh-cid\n", stderr="")
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _run)
|
||||
|
||||
env = _make_dummy_env(task_id="dup-match")
|
||||
assert env._container_id == "running-cid", (
|
||||
f"running container should win over stopped duplicate, got {env._container_id!r}"
|
||||
)
|
||||
|
||||
|
||||
# ── Cleanup correctness (issue #20561) ────────────────────────────
|
||||
|
||||
|
||||
class _FakeThread:
|
||||
"""Stand-in for threading.Thread that captures target/args and calls
|
||||
target() synchronously when .start() runs, so cleanup behavior is
|
||||
observable without actually backgrounding subprocess calls."""
|
||||
|
||||
def __init__(self, target=None, daemon=None, name=None):
|
||||
self._target = target
|
||||
self.daemon = daemon
|
||||
self.name = name
|
||||
self._done = False
|
||||
|
||||
def start(self):
|
||||
if self._target is not None:
|
||||
self._target()
|
||||
self._done = True
|
||||
|
||||
def is_alive(self):
|
||||
return not self._done
|
||||
|
||||
def join(self, timeout=None):
|
||||
self._done = True
|
||||
|
||||
|
||||
def _install_fake_thread(monkeypatch):
|
||||
import threading
|
||||
monkeypatch.setattr(threading, "Thread", _FakeThread)
|
||||
|
||||
|
||||
def test_cleanup_with_persist_only_stops_no_rm(monkeypatch):
|
||||
"""``persist_across_processes=True`` (default) cleanup must docker stop
|
||||
the container but NEVER docker rm — the container has to survive so the
|
||||
next Hermes process can reuse it. Issue #20561 — the previous code
|
||||
matched this on the `_persistent` flag instead of a dedicated
|
||||
cross-process flag, which made reuse impossible."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
env = _make_dummy_env(task_id="cleanup-persist", persistent_filesystem=False)
|
||||
# Default persist_across_processes=True.
|
||||
container_id = env._container_id
|
||||
assert container_id
|
||||
|
||||
cleanup_calls = []
|
||||
real_run = docker_env.subprocess.run
|
||||
|
||||
def _capturing_run(cmd, **kwargs):
|
||||
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
|
||||
return real_run(cmd, **kwargs)
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
|
||||
|
||||
env.cleanup()
|
||||
|
||||
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
|
||||
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
|
||||
assert stops, f"expected docker stop call, got cleanup_calls: {cleanup_calls}"
|
||||
assert not rms, (
|
||||
f"docker rm must NOT be called when persist_across_processes=True; "
|
||||
f"reuse would be impossible. Got: {rms}"
|
||||
)
|
||||
|
||||
|
||||
def test_cleanup_with_persist_disabled_stops_and_rms(monkeypatch):
|
||||
"""``persist_across_processes=False`` cleanup must docker stop AND docker
|
||||
rm so containers don't leak. Crucially, this runs regardless of the
|
||||
``persistent_filesystem`` setting — the original code only rm'd when
|
||||
``not self._persistent``, which meant the default-on ``container_persistent:
|
||||
true`` users (the documented happy path) leaked Exited containers forever.
|
||||
Issue #20561 root-cause fix."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
# Note: persistent_filesystem=True (the prior-leak scenario) + the new
|
||||
# cross-process toggle OFF must still result in a clean rm.
|
||||
env = docker_env.DockerEnvironment(
|
||||
image="python:3.11", cwd="/root", timeout=60,
|
||||
task_id="cleanup-no-persist", persistent_filesystem=True,
|
||||
persist_across_processes=False,
|
||||
)
|
||||
|
||||
cleanup_calls = []
|
||||
real_run = docker_env.subprocess.run
|
||||
|
||||
def _capturing_run(cmd, **kwargs):
|
||||
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
|
||||
return real_run(cmd, **kwargs)
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
|
||||
|
||||
env.cleanup()
|
||||
|
||||
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
|
||||
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
|
||||
assert stops, "expected docker stop"
|
||||
assert rms, (
|
||||
"docker rm MUST run when persist_across_processes=False, even with "
|
||||
"persistent_filesystem=True — that gating was the leak source in #20561."
|
||||
)
|
||||
|
||||
|
||||
def test_cleanup_uses_subprocess_run_not_detached_shell(monkeypatch):
|
||||
"""The pre-fix code used ``subprocess.Popen(\"... &\", shell=True)`` which
|
||||
raced with parent-process exit and silently dropped cleanup work. The
|
||||
new code must use ``subprocess.run`` with bounded ``timeout=`` so the
|
||||
work actually completes within the process lifetime.
|
||||
|
||||
Asserts cleanup never reaches into shell-mode Popen.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
def _forbidden_popen(*args, **kwargs):
|
||||
raise AssertionError(
|
||||
f"cleanup must not use subprocess.Popen anymore (issue #20561); "
|
||||
f"got args={args} kwargs={kwargs}"
|
||||
)
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "Popen", _forbidden_popen)
|
||||
|
||||
env = _make_dummy_env(task_id="no-popen-cleanup")
|
||||
env.cleanup() # must not raise
|
||||
|
||||
|
||||
def test_wait_for_cleanup_returns_true_when_no_thread_started():
|
||||
"""``wait_for_cleanup`` must be a no-op when ``cleanup`` was never called
|
||||
(or the env has no live cleanup thread) — atexit calls it unconditionally
|
||||
across all active envs, so a False return would falsely flag healthy
|
||||
shutdowns."""
|
||||
env = docker_env.DockerEnvironment.__new__(docker_env.DockerEnvironment)
|
||||
# No _cleanup_thread set — simulates an env that was never cleanup()'d.
|
||||
assert env.wait_for_cleanup(timeout=1.0) is True
|
||||
|
||||
|
||||
def test_wait_for_cleanup_after_cleanup_returns_true(monkeypatch):
|
||||
"""End-to-end: cleanup() starts a thread, wait_for_cleanup() joins it
|
||||
and reports completion. Atexit relies on this contract to ensure docker
|
||||
stop/rm actually finishes before the Python interpreter exits."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
env = _make_dummy_env(task_id="wait-test")
|
||||
env.cleanup()
|
||||
assert env.wait_for_cleanup(timeout=5.0) is True
|
||||
|
||||
|
||||
def test_cleanup_on_env_with_no_container_id_does_not_raise(monkeypatch):
|
||||
"""A DockerEnvironment whose ``__init__`` failed before the container_id
|
||||
was set (image-pull error, docker daemon down) should still be safe to
|
||||
cleanup() — the post-creation failure path in callers always tries.
|
||||
Without this guard the daemon-down case used to NameError on the cleanup
|
||||
branch."""
|
||||
env = docker_env.DockerEnvironment.__new__(docker_env.DockerEnvironment)
|
||||
env._container_id = None
|
||||
env._persistent = False
|
||||
env._workspace_dir = None
|
||||
env._home_dir = None
|
||||
# No exception expected.
|
||||
env.cleanup()
|
||||
|
|
|
|||
|
|
@ -224,3 +224,23 @@ def test_docker_env_is_bridged_everywhere():
|
|||
assert "docker_env" in _gateway_env_map_keys()
|
||||
assert "docker_env" in _save_config_env_sync_keys()
|
||||
assert "TERMINAL_DOCKER_ENV" in _terminal_tool_env_var_names()
|
||||
|
||||
|
||||
def test_docker_persist_across_processes_is_bridged_everywhere():
|
||||
"""Regression pin for the cross-process container reuse toggle.
|
||||
|
||||
``terminal.docker_persist_across_processes`` (issue #20561) controls
|
||||
whether ``DockerEnvironment.__init__`` probes for and reuses an existing
|
||||
labeled container at startup, and whether ``cleanup()`` removes the
|
||||
container on Hermes exit or just stops it (keeping it for the next
|
||||
process). Same four-bridge invariant as docker_run_as_host_user /
|
||||
docker_env / docker_mount_cwd_to_workspace — drift between any of the
|
||||
four sites means ``terminal.docker_persist_across_processes: false`` in
|
||||
config.yaml silently does nothing for that entry point, leaving the
|
||||
user unable to opt out of the documented "ONE long-lived container
|
||||
shared across sessions" behavior.
|
||||
"""
|
||||
assert "docker_persist_across_processes" in _cli_env_map_keys()
|
||||
assert "docker_persist_across_processes" in _gateway_env_map_keys()
|
||||
assert "docker_persist_across_processes" in _save_config_env_sync_keys()
|
||||
assert "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES" in _terminal_tool_env_var_names()
|
||||
|
|
|
|||
|
|
@ -339,11 +339,13 @@ class DockerEnvironment(BaseEnvironment):
|
|||
auto_mount_cwd: bool = False,
|
||||
run_as_host_user: bool = False,
|
||||
extra_args: list = None,
|
||||
persist_across_processes: bool = True,
|
||||
):
|
||||
if cwd == "~":
|
||||
cwd = "/root"
|
||||
super().__init__(cwd=cwd, timeout=timeout)
|
||||
self._persistent = persistent_filesystem
|
||||
self._persist_across_processes = persist_across_processes
|
||||
self._task_id = task_id
|
||||
self._forward_env = _normalize_forward_env_names(forward_env)
|
||||
self._env = _normalize_env_dict(env)
|
||||
|
|
@ -561,26 +563,69 @@ class DockerEnvironment(BaseEnvironment):
|
|||
"hermes-task-id": task_label,
|
||||
"hermes-profile": profile_name,
|
||||
}
|
||||
run_cmd = [
|
||||
self._docker_exe, "run", "-d",
|
||||
"--init", # tini/catatonit as PID 1 — reaps zombie children
|
||||
"--name", container_name,
|
||||
*label_args,
|
||||
"-w", cwd,
|
||||
*all_run_args,
|
||||
image,
|
||||
"sleep", "infinity", # no fixed lifetime — idle reaper handles cleanup
|
||||
]
|
||||
logger.debug(f"Starting container: {' '.join(run_cmd)}")
|
||||
result = subprocess.run(
|
||||
run_cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120, # image pull may take a while
|
||||
check=True,
|
||||
)
|
||||
self._container_id = result.stdout.strip()
|
||||
logger.info(f"Started container {container_name} ({self._container_id[:12]})")
|
||||
|
||||
# Cross-process reuse (issue #20561 — docs claim "ONE long-lived
|
||||
# container shared across sessions"). If a prior Hermes process
|
||||
# already started a container for this (task_id, profile) and it
|
||||
# still exists, attach to it instead of starting a fresh one. This
|
||||
# restores the documented contract; opt out via
|
||||
# ``terminal.docker_persist_across_processes: false``.
|
||||
#
|
||||
# Reuse matches on labels only — we deliberately do NOT compare image
|
||||
# / mounts / resources. Operators who need a fresh container after
|
||||
# changing those settings should set ``docker_persist_across_processes:
|
||||
# false`` (or run ``docker rm -f`` against the labeled container) to
|
||||
# force a clean start.
|
||||
reused = False
|
||||
if persist_across_processes:
|
||||
existing = self._find_reusable_container(task_label, profile_name)
|
||||
if existing is not None:
|
||||
container_id, state = existing
|
||||
self._container_id = container_id
|
||||
if state != "running":
|
||||
try:
|
||||
subprocess.run(
|
||||
[self._docker_exe, "start", container_id],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
check=True,
|
||||
)
|
||||
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
|
||||
logger.warning(
|
||||
"Failed to start existing container %s (state=%s): "
|
||||
"%s — falling back to a fresh container.",
|
||||
container_id[:12], state, e,
|
||||
)
|
||||
self._container_id = None
|
||||
if self._container_id:
|
||||
logger.info(
|
||||
"Reusing container %s (task=%s, profile=%s, prior state=%s)",
|
||||
container_id[:12], task_label, profile_name, state,
|
||||
)
|
||||
reused = True
|
||||
|
||||
if not reused:
|
||||
run_cmd = [
|
||||
self._docker_exe, "run", "-d",
|
||||
"--init", # tini/catatonit as PID 1 — reaps zombie children
|
||||
"--name", container_name,
|
||||
*label_args,
|
||||
"-w", cwd,
|
||||
*all_run_args,
|
||||
image,
|
||||
"sleep", "infinity", # no fixed lifetime — idle reaper handles cleanup
|
||||
]
|
||||
logger.debug(f"Starting container: {' '.join(run_cmd)}")
|
||||
result = subprocess.run(
|
||||
run_cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120, # image pull may take a while
|
||||
check=True,
|
||||
)
|
||||
self._container_id = result.stdout.strip()
|
||||
logger.info(f"Started container {container_name} ({self._container_id[:12]})")
|
||||
|
||||
# Build the init-time env forwarding args (used only by init_session
|
||||
# to inject host env vars into the snapshot; subsequent commands get
|
||||
|
|
@ -685,31 +730,143 @@ class DockerEnvironment(BaseEnvironment):
|
|||
logger.debug("Docker --storage-opt support: %s", _storage_opt_ok)
|
||||
return _storage_opt_ok
|
||||
|
||||
def cleanup(self):
|
||||
"""Stop and remove the container. Bind-mount dirs persist if persistent=True."""
|
||||
if self._container_id:
|
||||
try:
|
||||
# Stop in background so cleanup doesn't block
|
||||
stop_cmd = (
|
||||
f"(timeout 60 {self._docker_exe} stop {self._container_id} || "
|
||||
f"{self._docker_exe} rm -f {self._container_id}) >/dev/null 2>&1 &"
|
||||
)
|
||||
subprocess.Popen(stop_cmd, shell=True)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to stop container %s: %s", self._container_id, e)
|
||||
def _find_reusable_container(self, task_label: str, profile_label: str) -> Optional[tuple[str, str]]:
|
||||
"""Look for an existing container labeled for this (task, profile).
|
||||
|
||||
Returns ``(container_id, state)`` on hit, ``None`` on miss / on any
|
||||
failure (including ``docker ps`` itself failing). State is one of the
|
||||
values Docker reports via ``{{.State}}`` — e.g. ``running``, ``exited``,
|
||||
``created``, ``paused``, ``restarting``, ``dead``. The caller decides
|
||||
whether the state warrants ``docker start`` before reuse.
|
||||
|
||||
Restricted to the docker-stored label set this class creates; never
|
||||
matches containers that happened to be named ``hermes-*`` but were
|
||||
started by some other tool.
|
||||
"""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[
|
||||
self._docker_exe, "ps", "-a",
|
||||
"--filter", "label=hermes-agent=1",
|
||||
"--filter", f"label=hermes-task-id={task_label}",
|
||||
"--filter", f"label=hermes-profile={profile_label}",
|
||||
"--format", "{{.ID}}\t{{.State}}",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
check=False,
|
||||
)
|
||||
except (subprocess.TimeoutExpired, OSError) as e:
|
||||
logger.debug("docker ps probe failed: %s — will start a fresh container", e)
|
||||
return None
|
||||
if result.returncode != 0:
|
||||
logger.debug(
|
||||
"docker ps probe returned %d: %s — will start a fresh container",
|
||||
result.returncode, result.stderr.strip(),
|
||||
)
|
||||
return None
|
||||
lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()]
|
||||
if not lines:
|
||||
return None
|
||||
# Multiple matches are unusual (one (task, profile) should produce one
|
||||
# container) but can happen if a previous Hermes process crashed
|
||||
# mid-cleanup. Prefer a running one if present; otherwise pick the
|
||||
# first listed. Stale duplicates get reaped by the orphan-reaper in a
|
||||
# follow-up commit; we don't try to be heroic about them here.
|
||||
running = None
|
||||
first = None
|
||||
for ln in lines:
|
||||
parts = ln.split("\t", 1)
|
||||
if len(parts) != 2:
|
||||
continue
|
||||
cid, state = parts[0], parts[1].lower()
|
||||
if first is None:
|
||||
first = (cid, state)
|
||||
if state == "running" and running is None:
|
||||
running = (cid, state)
|
||||
return running or first
|
||||
|
||||
def cleanup(self):
|
||||
"""Stop (and optionally remove) the container.
|
||||
|
||||
Behavior depends on ``persist_across_processes`` (init kwarg):
|
||||
|
||||
* **True** (default) — only ``docker stop`` so the container is
|
||||
available for reuse by the next Hermes process. The orphan-reaper
|
||||
eventually removes it if no subsequent process picks it up.
|
||||
* **False** — ``docker stop`` followed by ``docker rm -f``, regardless
|
||||
of ``persistent_filesystem``. The previous ``rm`` path was gated on
|
||||
``not self._persistent`` which meant ``container_persistent: true``
|
||||
users (the default) leaked Exited containers forever (issue #20561).
|
||||
|
||||
Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls,
|
||||
not the previous fire-and-forget ``Popen(... &)`` shell construct.
|
||||
That pattern raced with parent-process exit and silently dropped
|
||||
cleanup work when the parent didn't outlive the detached shell — the
|
||||
primary mechanism behind Exited-container accumulation under SIGTERM
|
||||
/ ``hermes /quit`` / dead terminals.
|
||||
"""
|
||||
container_id = self._container_id
|
||||
if not container_id:
|
||||
# Still drop the bind-mount dirs if any were allocated.
|
||||
if not self._persistent:
|
||||
# Also schedule removal (stop only leaves it as stopped)
|
||||
for d in (self._workspace_dir, self._home_dir):
|
||||
if d:
|
||||
shutil.rmtree(d, ignore_errors=True)
|
||||
return
|
||||
|
||||
# Capture state needed by the worker before we null out the attrs —
|
||||
# the worker thread can outlive ``self``.
|
||||
docker_exe = self._docker_exe
|
||||
should_remove = not self._persist_across_processes
|
||||
log_id = container_id[:12]
|
||||
|
||||
def _do_cleanup() -> None:
|
||||
try:
|
||||
subprocess.run(
|
||||
[docker_exe, "stop", "-t", "10", container_id],
|
||||
capture_output=True, timeout=30,
|
||||
)
|
||||
except (subprocess.TimeoutExpired, OSError) as e:
|
||||
logger.warning("docker stop %s timed out / failed: %s", log_id, e)
|
||||
if should_remove:
|
||||
try:
|
||||
subprocess.Popen(
|
||||
f"sleep 3 && {self._docker_exe} rm -f {self._container_id} >/dev/null 2>&1 &",
|
||||
shell=True,
|
||||
subprocess.run(
|
||||
[docker_exe, "rm", "-f", container_id],
|
||||
capture_output=True, timeout=30,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
self._container_id = None
|
||||
except (subprocess.TimeoutExpired, OSError) as e:
|
||||
logger.warning("docker rm -f %s failed: %s", log_id, e)
|
||||
|
||||
# Daemon thread: doesn't block interpreter exit (atexit returns
|
||||
# promptly), but unlike the old ``Popen(... &)`` shell trick the
|
||||
# Python-level join semantics let the thread actually run to
|
||||
# completion if the interpreter is still alive. atexit registers
|
||||
# ``_atexit_cleanup`` in terminal_tool.py which waits up to ~60s for
|
||||
# outstanding cleanups, so most exits complete the work cleanly.
|
||||
import threading
|
||||
t = threading.Thread(target=_do_cleanup, daemon=True, name=f"hermes-cleanup-{log_id}")
|
||||
t.start()
|
||||
self._cleanup_thread = t
|
||||
self._container_id = None
|
||||
|
||||
if not self._persistent:
|
||||
for d in (self._workspace_dir, self._home_dir):
|
||||
if d:
|
||||
shutil.rmtree(d, ignore_errors=True)
|
||||
|
||||
def wait_for_cleanup(self, timeout: float = 30.0) -> bool:
|
||||
"""Block up to *timeout* seconds for the cleanup worker thread.
|
||||
|
||||
Returns ``True`` if the thread finished (or no thread was started),
|
||||
``False`` on timeout. The atexit hook in terminal_tool.py calls this
|
||||
on every active environment so docker stop/rm actually completes
|
||||
before the Python process exits — without this, ``hermes /quit``
|
||||
races the interpreter shutdown and leaves stopped containers behind.
|
||||
"""
|
||||
thread = getattr(self, "_cleanup_thread", None)
|
||||
if thread is None or not thread.is_alive():
|
||||
return True
|
||||
thread.join(timeout=timeout)
|
||||
return not thread.is_alive()
|
||||
|
|
|
|||
|
|
@ -1024,6 +1024,15 @@ def _get_env_config() -> Dict[str, Any]:
|
|||
"docker_env": _parse_env_var("TERMINAL_DOCKER_ENV", "{}", json.loads, "valid JSON"),
|
||||
"docker_run_as_host_user": os.getenv("TERMINAL_DOCKER_RUN_AS_HOST_USER", "false").lower() in {"true", "1", "yes"},
|
||||
"docker_extra_args": _parse_env_var("TERMINAL_DOCKER_EXTRA_ARGS", "[]", json.loads, "valid JSON"),
|
||||
# Cross-process container reuse (issue #20561). The docs claim
|
||||
# "ONE long-lived container shared across sessions" — this toggle
|
||||
# makes that real by probing for a labeled container at startup and
|
||||
# attaching to it instead of always starting a fresh one. Set to
|
||||
# ``false`` for hard per-process isolation (no reuse, container is
|
||||
# removed on exit).
|
||||
"docker_persist_across_processes": os.getenv(
|
||||
"TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES", "true"
|
||||
).lower() in {"true", "1", "yes"},
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1083,6 +1092,7 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int,
|
|||
env=docker_env,
|
||||
run_as_host_user=cc.get("docker_run_as_host_user", False),
|
||||
extra_args=docker_extra_args,
|
||||
persist_across_processes=cc.get("docker_persist_across_processes", True),
|
||||
)
|
||||
|
||||
elif env_type == "singularity":
|
||||
|
|
@ -1378,7 +1388,23 @@ def _atexit_cleanup():
|
|||
if _active_environments:
|
||||
count = len(_active_environments)
|
||||
logger.info("Shutting down %d remaining sandbox(es)...", count)
|
||||
# Snapshot the env objects BEFORE cleanup_all_environments empties
|
||||
# the dict; we need them to wait on docker cleanup threads after the
|
||||
# registry has been cleared.
|
||||
envs_to_wait = list(_active_environments.values())
|
||||
cleanup_all_environments()
|
||||
# Block briefly so docker stop/rm actually completes before the
|
||||
# interpreter exits. Issue #20561 — without this join, the daemon
|
||||
# cleanup threads were getting torn down mid-`docker stop`, leaving
|
||||
# Exited containers piled up on the host.
|
||||
for env in envs_to_wait:
|
||||
wait_fn = getattr(env, "wait_for_cleanup", None)
|
||||
if wait_fn is None:
|
||||
continue
|
||||
try:
|
||||
wait_fn(timeout=15.0)
|
||||
except Exception as e: # never block shutdown on a bad backend
|
||||
logger.debug("wait_for_cleanup raised on exit: %s", e)
|
||||
|
||||
atexit.register(_atexit_cleanup)
|
||||
|
||||
|
|
@ -1746,6 +1772,7 @@ def terminal_tool(
|
|||
"docker_env": config.get("docker_env", {}),
|
||||
"docker_run_as_host_user": config.get("docker_run_as_host_user", False),
|
||||
"docker_extra_args": config.get("docker_extra_args", []),
|
||||
"docker_persist_across_processes": config.get("docker_persist_across_processes", True),
|
||||
}
|
||||
|
||||
local_config = None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue