fix(docker): reuse containers across processes + fix cleanup leaks

The Docker backend docs claim "Single persistent container — ONE long-lived container shared across sessions, /new, /reset, and delegate_task subagents. Stopped/removed on shutdown." In practice the code only honored that contract within a single Python process via the in-memory `_active_environments[task_id]` cache. Every `hermes chat` invocation spawned a fresh `hermes-<hex>` container; older containers piled up in `Exited` state and accumulated until manual `docker rm` (issue #20561). Three root causes, all addressed by this commit: 1. No cross-process container discovery. 2. `cleanup()` used fire-and-forget `subprocess.Popen("... &", shell=True)` which raced with parent-process exit — when Python exited promptly the detached shell child got killed mid-`docker stop`, leaving stopped containers behind. 3. The `docker rm` step in cleanup was gated on `not self._persistent` (the bind-mount-persistence flag). Default config sets `container_persistent: true`, so the default happy path skipped `rm` entirely — even when the user explicitly didn't want cross-process reuse, containers leaked. Fix: * Add `DockerEnvironment.__init__(persist_across_processes=True)`. When true, init probes `docker ps -a --filter label=hermes-agent=1 --filter label=hermes-task-id=<task> --filter label=hermes-profile=<profile>` and reuses a matching container (running → attach; stopped → `docker start` → attach; `docker start` failure → fall through to a fresh `docker run`). Multiple matches prefer the running one, with the stragglers left for the orphan reaper (next commit) to clean up. * Rewrite `cleanup()`. Uses `subprocess.run(..., timeout=30)` on a daemon `threading.Thread`, not the racy `Popen(... &)`. The `_persistent` guard is dropped on the `rm` step — `rm` now runs whenever `persist_across_processes` is false, regardless of the bind-mount-persistence setting. The leak class is gone in all combinations.
* Add `wait_for_cleanup(timeout)`. `tools/terminal_tool.py`'s atexit hook calls this on every active env, blocking up to 15s per environment for its cleanup thread before interpreter exit. Without this, `hermes /quit` raced the daemon-thread teardown and dropped the stop/rm work. * New config `terminal.docker_persist_across_processes` (default `true` — restores the documented contract). Set `false` for hard per-process isolation. Wired through all four config-bridge sites (cli.py env_mappings, gateway/run.py _terminal_env_map, hermes_cli/config.py _config_to_env_sync, tests/conftest.py env-strip list); regression-pinned by `test_docker_persist_across_processes_is_bridged_everywhere` matching the existing pattern for docker_run_as_host_user / docker_env. Reuse intentionally does NOT compare image / mounts / resources — only the labels. Operators changing those settings should set `docker_persist_across_processes: false` (or `docker rm -f` the labeled container) to force a fresh start. This keeps the probe cheap and the failure mode obvious. Coverage: 12 new unit tests in tests/tools/test_docker_environment.py covering reuse paths (running, stopped, fallback, opt-out, duplicate preference) and cleanup behavior (persist-mode no-rm, opt-out always-rm, no-Popen, wait_for_cleanup semantics, partial-init safety). Plus one config-bridge regression pin. Refs #20561
2026-06-05 07:41:39 +00:00 · 2026-05-28 14:00:26 +10:00 · 2026-05-28 14:00:26 +10:00 · ac8e238bc8
commit ac8e238bc8
parent 8d129d013b
8 changed files with 612 additions and 51 deletions
--- a/tools/environments/docker.py
+++ b/tools/environments/docker.py
@ -339,11 +339,13 @@ class DockerEnvironment(BaseEnvironment):
        auto_mount_cwd: bool = False,
        run_as_host_user: bool = False,
        extra_args: list = None,
+        persist_across_processes: bool = True,
    ):
        if cwd == "~":
            cwd = "/root"
        super().__init__(cwd=cwd, timeout=timeout)
        self._persistent = persistent_filesystem
+        self._persist_across_processes = persist_across_processes
        self._task_id = task_id
        self._forward_env = _normalize_forward_env_names(forward_env)
        self._env = _normalize_env_dict(env)
@ -561,26 +563,69 @@ class DockerEnvironment(BaseEnvironment):
            "hermes-task-id": task_label,
            "hermes-profile": profile_name,
        }
-        run_cmd = [
-            self._docker_exe, "run", "-d",
-            "--init",           # tini/catatonit as PID 1 — reaps zombie children
-            "--name", container_name,
-            *label_args,
-            "-w", cwd,
-            *all_run_args,
-            image,
-            "sleep", "infinity",  # no fixed lifetime — idle reaper handles cleanup
-        ]
-        logger.debug(f"Starting container: {' '.join(run_cmd)}")
-        result = subprocess.run(
-            run_cmd,
-            capture_output=True,
-            text=True,
-            timeout=120,  # image pull may take a while
-            check=True,
-        )
-        self._container_id = result.stdout.strip()
-        logger.info(f"Started container {container_name} ({self._container_id[:12]})")
+
+        # Cross-process reuse (issue #20561 — docs claim "ONE long-lived
+        # container shared across sessions").  If a prior Hermes process
+        # already started a container for this (task_id, profile) and it
+        # still exists, attach to it instead of starting a fresh one.  This
+        # restores the documented contract; opt out via
+        # ``terminal.docker_persist_across_processes: false``.
+        #
+        # Reuse matches on labels only — we deliberately do NOT compare image
+        # / mounts / resources.  Operators who need a fresh container after
+        # changing those settings should set ``docker_persist_across_processes:
+        # false`` (or run ``docker rm -f`` against the labeled container) to
+        # force a clean start.
+        reused = False
+        if persist_across_processes:
+            existing = self._find_reusable_container(task_label, profile_name)
+            if existing is not None:
+                container_id, state = existing
+                self._container_id = container_id
+                if state != "running":
+                    try:
+                        subprocess.run(
+                            [self._docker_exe, "start", container_id],
+                            capture_output=True,
+                            text=True,
+                            timeout=30,
+                            check=True,
+                        )
+                    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
+                        logger.warning(
+                            "Failed to start existing container %s (state=%s): "
+                            "%s — falling back to a fresh container.",
+                            container_id[:12], state, e,
+                        )
+                        self._container_id = None
+                if self._container_id:
+                    logger.info(
+                        "Reusing container %s (task=%s, profile=%s, prior state=%s)",
+                        container_id[:12], task_label, profile_name, state,
+                    )
+                    reused = True
+
+        if not reused:
+            run_cmd = [
+                self._docker_exe, "run", "-d",
+                "--init",           # tini/catatonit as PID 1 — reaps zombie children
+                "--name", container_name,
+                *label_args,
+                "-w", cwd,
+                *all_run_args,
+                image,
+                "sleep", "infinity",  # no fixed lifetime — idle reaper handles cleanup
+            ]
+            logger.debug(f"Starting container: {' '.join(run_cmd)}")
+            result = subprocess.run(
+                run_cmd,
+                capture_output=True,
+                text=True,
+                timeout=120,  # image pull may take a while
+                check=True,
+            )
+            self._container_id = result.stdout.strip()
+            logger.info(f"Started container {container_name} ({self._container_id[:12]})")

        # Build the init-time env forwarding args (used only by init_session
        # to inject host env vars into the snapshot; subsequent commands get
@ -685,31 +730,143 @@ class DockerEnvironment(BaseEnvironment):
        logger.debug("Docker --storage-opt support: %s", _storage_opt_ok)
        return _storage_opt_ok

-    def cleanup(self):
-        """Stop and remove the container. Bind-mount dirs persist if persistent=True."""
-        if self._container_id:
-            try:
-                # Stop in background so cleanup doesn't block
-                stop_cmd = (
-                    f"(timeout 60 {self._docker_exe} stop {self._container_id} || "
-                    f"{self._docker_exe} rm -f {self._container_id}) >/dev/null 2>&1 &"
-                )
-                subprocess.Popen(stop_cmd, shell=True)
-            except Exception as e:
-                logger.warning("Failed to stop container %s: %s", self._container_id, e)
+    def _find_reusable_container(self, task_label: str, profile_label: str) -> Optional[tuple[str, str]]:
+        """Look for an existing container labeled for this (task, profile).

+        Returns ``(container_id, state)`` on hit, ``None`` on miss / on any
+        failure (including ``docker ps`` itself failing). State is one of the
+        values Docker reports via ``{{.State}}`` — e.g. ``running``, ``exited``,
+        ``created``, ``paused``, ``restarting``, ``dead``. The caller decides
+        whether the state warrants ``docker start`` before reuse.
+
+        Restricted to the docker-stored label set this class creates; never
+        matches containers that happened to be named ``hermes-*`` but were
+        started by some other tool.
+        """
+        try:
+            result = subprocess.run(
+                [
+                    self._docker_exe, "ps", "-a",
+                    "--filter", "label=hermes-agent=1",
+                    "--filter", f"label=hermes-task-id={task_label}",
+                    "--filter", f"label=hermes-profile={profile_label}",
+                    "--format", "{{.ID}}\t{{.State}}",
+                ],
+                capture_output=True,
+                text=True,
+                timeout=10,
+                check=False,
+            )
+        except (subprocess.TimeoutExpired, OSError) as e:
+            logger.debug("docker ps probe failed: %s — will start a fresh container", e)
+            return None
+        if result.returncode != 0:
+            logger.debug(
+                "docker ps probe returned %d: %s — will start a fresh container",
+                result.returncode, result.stderr.strip(),
+            )
+            return None
+        lines = [ln.strip() for ln in result.stdout.splitlines() if ln.strip()]
+        if not lines:
+            return None
+        # Multiple matches are unusual (one (task, profile) should produce one
+        # container) but can happen if a previous Hermes process crashed
+        # mid-cleanup. Prefer a running one if present; otherwise pick the
+        # first listed. Stale duplicates get reaped by the orphan-reaper in a
+        # follow-up commit; we don't try to be heroic about them here.
+        running = None
+        first = None
+        for ln in lines:
+            parts = ln.split("\t", 1)
+            if len(parts) != 2:
+                continue
+            cid, state = parts[0], parts[1].lower()
+            if first is None:
+                first = (cid, state)
+            if state == "running" and running is None:
+                running = (cid, state)
+        return running or first
+
+    def cleanup(self):
+        """Stop (and optionally remove) the container.
+
+        Behavior depends on ``persist_across_processes`` (init kwarg):
+
+        * **True** (default) — only ``docker stop`` so the container is
+          available for reuse by the next Hermes process. The orphan-reaper
+          eventually removes it if no subsequent process picks it up.
+        * **False** — ``docker stop`` followed by ``docker rm -f``, regardless
+          of ``persistent_filesystem``. The previous ``rm`` path was gated on
+          ``not self._persistent`` which meant ``container_persistent: true``
+          users (the default) leaked Exited containers forever (issue #20561).
+
+        Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls,
+        not the previous fire-and-forget ``Popen(... &)`` shell construct.
+        That pattern raced with parent-process exit and silently dropped
+        cleanup work when the parent didn't outlive the detached shell — the
+        primary mechanism behind Exited-container accumulation under SIGTERM
+        / ``hermes /quit`` / dead terminals.
+        """
+        container_id = self._container_id
+        if not container_id:
+            # Still drop the bind-mount dirs if any were allocated.
            if not self._persistent:
-                # Also schedule removal (stop only leaves it as stopped)
+                for d in (self._workspace_dir, self._home_dir):
+                    if d:
+                        shutil.rmtree(d, ignore_errors=True)
+            return
+
+        # Capture state needed by the worker before we null out the attrs —
+        # the worker thread can outlive ``self``.
+        docker_exe = self._docker_exe
+        should_remove = not self._persist_across_processes
+        log_id = container_id[:12]
+
+        def _do_cleanup() -> None:
+            try:
+                subprocess.run(
+                    [docker_exe, "stop", "-t", "10", container_id],
+                    capture_output=True, timeout=30,
+                )
+            except (subprocess.TimeoutExpired, OSError) as e:
+                logger.warning("docker stop %s timed out / failed: %s", log_id, e)
+            if should_remove:
                try:
-                    subprocess.Popen(
-                        f"sleep 3 && {self._docker_exe} rm -f {self._container_id} >/dev/null 2>&1 &",
-                        shell=True,
+                    subprocess.run(
+                        [docker_exe, "rm", "-f", container_id],
+                        capture_output=True, timeout=30,
                    )
-                except Exception:
-                    pass
-            self._container_id = None
+                except (subprocess.TimeoutExpired, OSError) as e:
+                    logger.warning("docker rm -f %s failed: %s", log_id, e)
+
+        # Daemon thread: never blocks interpreter exit on its own, but unlike
+        # the old ``Popen(... &)`` shell trick it can be joined, so callers
+        # can wait for the stop/rm work via ``wait_for_cleanup()``.  The
+        # ``_atexit_cleanup`` hook in terminal_tool.py does exactly that,
+        # waiting up to 15s per environment, so most exits complete the
+        # work cleanly.
+        import threading
+        t = threading.Thread(target=_do_cleanup, daemon=True, name=f"hermes-cleanup-{log_id}")
+        t.start()
+        self._cleanup_thread = t
+        self._container_id = None

        if not self._persistent:
            for d in (self._workspace_dir, self._home_dir):
                if d:
                    shutil.rmtree(d, ignore_errors=True)
+
+    def wait_for_cleanup(self, timeout: float = 30.0) -> bool:
+        """Block up to *timeout* seconds for the cleanup worker thread.
+
+        Returns ``True`` if the thread finished (or no thread was started),
+        ``False`` on timeout. The atexit hook in terminal_tool.py calls this
+        on every active environment so docker stop/rm actually completes
+        before the Python process exits — without this, ``hermes /quit``
+        races the interpreter shutdown and leaves stopped containers behind.
+        """
+        thread = getattr(self, "_cleanup_thread", None)
+        if thread is None or not thread.is_alive():
+            return True
+        thread.join(timeout=timeout)
+        return not thread.is_alive()
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@ -1024,6 +1024,15 @@ def _get_env_config() -> Dict[str, Any]:
        "docker_env": _parse_env_var("TERMINAL_DOCKER_ENV", "{}", json.loads, "valid JSON"),
        "docker_run_as_host_user": os.getenv("TERMINAL_DOCKER_RUN_AS_HOST_USER", "false").lower() in {"true", "1", "yes"},
        "docker_extra_args": _parse_env_var("TERMINAL_DOCKER_EXTRA_ARGS", "[]", json.loads, "valid JSON"),
+        # Cross-process container reuse (issue #20561).  The docs claim
+        # "ONE long-lived container shared across sessions" — this toggle
+        # makes that real by probing for a labeled container at startup and
+        # attaching to it instead of always starting a fresh one.  Set to
+        # ``false`` for hard per-process isolation (no reuse, container is
+        # removed on exit).
+        "docker_persist_across_processes": os.getenv(
+            "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES", "true"
+        ).lower() in {"true", "1", "yes"},
    }


@ -1083,6 +1092,7 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int,
            env=docker_env,
            run_as_host_user=cc.get("docker_run_as_host_user", False),
            extra_args=docker_extra_args,
+            persist_across_processes=cc.get("docker_persist_across_processes", True),
        )
    
    elif env_type == "singularity":
@ -1378,7 +1388,23 @@ def _atexit_cleanup():
    if _active_environments:
        count = len(_active_environments)
        logger.info("Shutting down %d remaining sandbox(es)...", count)
+        # Snapshot the env objects BEFORE cleanup_all_environments empties
+        # the dict; we need them to wait on docker cleanup threads after the
+        # registry has been cleared.
+        envs_to_wait = list(_active_environments.values())
        cleanup_all_environments()
+        # Wait (up to 15s per environment) so docker stop/rm actually
+        # completes before the interpreter exits. Issue #20561 — without
+        # this join, daemon cleanup threads get torn down mid-`docker stop`,
+        # leaving Exited containers piled up on the host.
+        for env in envs_to_wait:
+            wait_fn = getattr(env, "wait_for_cleanup", None)
+            if wait_fn is None:
+                continue
+            try:
+                wait_fn(timeout=15.0)
+            except Exception as e:  # never block shutdown on a bad backend
+                logger.debug("wait_for_cleanup raised on exit: %s", e)

 atexit.register(_atexit_cleanup)

@ -1746,6 +1772,7 @@ def terminal_tool(
                                "docker_env": config.get("docker_env", {}),
                                "docker_run_as_host_user": config.get("docker_run_as_host_user", False),
                                "docker_extra_args": config.get("docker_extra_args", []),
+                                "docker_persist_across_processes": config.get("docker_persist_across_processes", True),
                            }

                        local_config = None