fix(docker): persist-mode cleanup is no-op; add force_remove kwarg (#20561)

The first iteration of this PR did docker stop on every cleanup in persist mode (only skipping docker rm). Ben caught this as contradicting the documented "ONE long-lived container shared across sessions" semantics: stopping the container on every Hermes /quit kills any background processes inside (npm watchers, pytest watchers, long-running scripts) — exactly the case persist mode is supposed to protect. This commit splits the cleanup paths cleanly: * **Persist mode (default)** — cleanup() is a NO-OP for the container. Container stays running, processes survive, next Hermes process attaches via the existing label probe in ~ms instead of waiting for docker start. Resource reclamation happens via the orphan reaper at next startup (2 × lifetime_seconds threshold), which covers the SIGKILL / OOM / abandoned-laptop cases. * **Opt-out mode (persist_across_processes=False)** — unchanged: docker stop + docker rm -f on cleanup as before. * **Explicit teardown** — new cleanup(force_remove=True) kwarg overrides persist mode and tears the container down unconditionally. cleanup_vm(task_id) now defaults to force_remove=True since it's the user-driven reset path (called from AIAgent.close(), /reset-style flows, and the idle reaper's per-turn cleanup). The idle reaper in _cleanup_inactive_envs calls env.cleanup() directly with no kwargs, so idle persist-mode envs are no-op'd — the container survives the in-process pop and the next tool call re-probes via labels. No state leak: _container_id is still cleared on the in-process handle. E2E verified against real Docker: ✓ Container is still running after cleanup() ✓ Background process (sleep loop) survived cleanup() ✓ Filesystem state preserved across cleanup() ✓ In-process container_id cleared (next __init__ will re-probe) ✓ Background process visible from reused env (no docker start happened) ✓ force_remove=True removed the container even in persist mode ✓ cleanup_vm() removed the container (defaults to force_remove=True) Test changes: * Replaces `test_cleanup_with_persist_only_stops_no_rm` with `test_cleanup_with_persist_is_noop_for_container` — asserts neither stop nor rm runs in persist mode, and the in-process handle is cleared so re-probe works. * Adds `test_cleanup_force_remove_stops_and_rms_even_in_persist_mode` — covers the new kwarg. * Updates `test_cleanup_uses_subprocess_run_not_detached_shell` and `test_wait_for_cleanup_after_cleanup_returns_true` to pass `force_remove=True` so they actually exercise the docker code path (default no-op would trivially pass). cleanup_vm() forwards `force_remove` only to backends whose cleanup() accepts the kwarg (currently just DockerEnvironment) via runtime signature inspection — Modal/Daytona/SSH `cleanup()` signatures are unchanged. Refs #20561
2026-07-18 14:52:04 +00:00 · 2026-05-28 17:39:35 +10:00 · 2026-05-28 17:39:35 +10:00 · 5c2170a7c6
commit 5c2170a7c6
parent d77d877665
3 changed files with 166 additions and 41 deletions
--- a/tests/tools/test_docker_environment.py
+++ b/tests/tools/test_docker_environment.py
@ -837,12 +837,16 @@ def _install_fake_thread(monkeypatch):
    monkeypatch.setattr(threading, "Thread", _FakeThread)


-def test_cleanup_with_persist_only_stops_no_rm(monkeypatch):
-    """``persist_across_processes=True`` (default) cleanup must docker stop
-    the container but NEVER docker rm — the container has to survive so the
-    next Hermes process can reuse it. Issue #20561 — the previous code
-    matched this on the `_persistent` flag instead of a dedicated
-    cross-process flag, which made reuse impossible."""
+def test_cleanup_with_persist_is_noop_for_container(monkeypatch):
+    """``persist_across_processes=True`` (default) cleanup must NEITHER stop
+    NOR remove the container — the docs promise "ONE long-lived container
+    shared across sessions", and any docker stop would kill background
+    processes inside the container (npm watchers, pytest watchers, etc.).
+
+    Resource reclamation in this mode happens via the orphan reaper on next
+    Hermes startup, not on graceful exit. Issue #20561 — the first iteration
+    of this PR did docker stop here, which Ben caught as contradicting the
+    "ONE long-lived container" semantics."""
    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
    monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
    _mock_subprocess_run(monkeypatch)
@ -866,11 +870,55 @@ def test_cleanup_with_persist_only_stops_no_rm(monkeypatch):

    stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
    rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
-    assert stops, f"expected docker stop call, got cleanup_calls: {cleanup_calls}"
+    assert not stops, (
+        f"docker stop must NOT be called when persist_across_processes=True; "
+        f"container has to stay running so background processes survive. "
+        f"Got: {stops}"
+    )
    assert not rms, (
        f"docker rm must NOT be called when persist_across_processes=True; "
        f"reuse would be impossible. Got: {rms}"
    )
+    # The in-process handle must still be cleared so the next __init__
+    # re-probes via labels (and reuses the still-running container).
+    assert env._container_id is None, (
+        "in-process container_id should be cleared even in no-op cleanup"
+    )
+
+
+def test_cleanup_force_remove_stops_and_rms_even_in_persist_mode(monkeypatch):
+    """``cleanup(force_remove=True)`` must stop AND rm the container even
+    when ``persist_across_processes=True``. This is the explicit-teardown
+    path for ``/reset``, ``cleanup_vm(task_id, force_remove=True)``, and any
+    future caller that wants a guaranteed fresh container.
+
+    Without this kwarg, callers in persist mode would have no way to force a
+    fresh container without also flipping the global config — too coarse for
+    a per-task reset.
+    """
+    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
+    monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
+    _mock_subprocess_run(monkeypatch)
+    _install_fake_thread(monkeypatch)
+
+    env = _make_dummy_env(task_id="cleanup-force", persistent_filesystem=False)
+    assert env._container_id
+
+    cleanup_calls = []
+    real_run = docker_env.subprocess.run
+
+    def _capturing_run(cmd, **kwargs):
+        cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
+        return real_run(cmd, **kwargs)
+
+    monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
+
+    env.cleanup(force_remove=True)
+
+    stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
+    rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
+    assert stops, f"force_remove must docker stop; got: {cleanup_calls}"
+    assert rms, f"force_remove must docker rm; got: {cleanup_calls}"


 def test_cleanup_with_persist_disabled_stops_and_rms(monkeypatch):
@ -914,12 +962,15 @@ def test_cleanup_with_persist_disabled_stops_and_rms(monkeypatch):


 def test_cleanup_uses_subprocess_run_not_detached_shell(monkeypatch):
-    """The pre-fix code used ``subprocess.Popen(\"... &\", shell=True)`` which
+    """The pre-fix code used ``subprocess.Popen("... &", shell=True)`` which
    raced with parent-process exit and silently dropped cleanup work. The
    new code must use ``subprocess.run`` with bounded ``timeout=`` so the
    work actually completes within the process lifetime.

-    Asserts cleanup never reaches into shell-mode Popen.
+    Asserts cleanup never reaches into shell-mode Popen. Uses
+    ``force_remove=True`` so cleanup actually issues docker calls — the
+    default persist-mode path is now a no-op (commit 4) and would trivially
+    pass this assertion without exercising the docker code at all.
    """
    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
    monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
@ -935,7 +986,7 @@ def test_cleanup_uses_subprocess_run_not_detached_shell(monkeypatch):
    monkeypatch.setattr(docker_env.subprocess, "Popen", _forbidden_popen)

    env = _make_dummy_env(task_id="no-popen-cleanup")
-    env.cleanup()  # must not raise
+    env.cleanup(force_remove=True)  # must not raise


 def test_wait_for_cleanup_returns_true_when_no_thread_started():
@ -951,14 +1002,20 @@ def test_wait_for_cleanup_returns_true_when_no_thread_started():
 def test_wait_for_cleanup_after_cleanup_returns_true(monkeypatch):
    """End-to-end: cleanup() starts a thread, wait_for_cleanup() joins it
    and reports completion. Atexit relies on this contract to ensure docker
-    stop/rm actually finishes before the Python interpreter exits."""
+    stop/rm actually finishes before the Python interpreter exits.
+
+    Uses ``force_remove=True`` so cleanup actually starts a worker thread —
+    the default persist-mode cleanup is a no-op (commit 4) and never spawns
+    a thread, so the trivial "no thread" branch of wait_for_cleanup is
+    already covered by the previous test.
+    """
    monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
    monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
    _mock_subprocess_run(monkeypatch)
    _install_fake_thread(monkeypatch)

    env = _make_dummy_env(task_id="wait-test")
-    env.cleanup()
+    env.cleanup(force_remove=True)
    assert env.wait_for_cleanup(timeout=5.0) is True


--- a/tools/environments/docker.py
+++ b/tools/environments/docker.py
@ -913,49 +913,94 @@ class DockerEnvironment(BaseEnvironment):
                running = (cid, state)
        return running or first

-    def cleanup(self):
-        """Stop (and optionally remove) the container.
+    def cleanup(self, *, force_remove: bool = False):
+        """Tear down the container according to persist mode and *force_remove*.

-        Behavior depends on ``persist_across_processes`` (init kwarg):
+        Persist-mode (``persist_across_processes=True``, the default) leaves the
+        container **running** untouched. The docs promise "ONE long-lived
+        container shared across sessions" and stopping it on every Hermes exit
+        breaks that promise:

-        * **True** (default) — only ``docker stop`` so the container is
-          available for reuse by the next Hermes process. The orphan-reaper
-          eventually removes it if no subsequent process picks it up.
-        * **False** — ``docker stop`` followed by ``docker rm -f``, regardless
-          of ``persistent_filesystem``. The previous ``rm`` path was gated on
-          ``not self._persistent`` which meant ``container_persistent: true``
-          users (the default) leaked Exited containers forever (issue #20561).
+        * Background processes inside the container (``npm run dev``, watchers,
+          long-running pytest) get killed every time the user runs ``/quit``.
+        * Every reuse requires ``docker start`` + waiting for the container to
+          come back up, adding 1–2s to the first tool call of the new session.
+        * The user-visible difference between "ONE long-lived container" and
+          "a new container that happens to share state" is exactly this:
+          processes survive in the former, die in the latter.

-        Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls,
-        not the previous fire-and-forget ``Popen(... &)`` shell construct.
-        That pattern raced with parent-process exit and silently dropped
-        cleanup work when the parent didn't outlive the detached shell — the
-        primary mechanism behind Exited-container accumulation under SIGTERM
-        / ``hermes /quit`` / dead terminals.
+        Resource reclamation for the persist-mode case lives in the
+        ``reap_orphan_containers()`` path (see issue #20561 commit 3): if no
+        Hermes process touches a labeled container for ``2 × lifetime_seconds``
+        it gets ``docker rm -f``'d at the next Hermes startup. That covers the
+        SIGKILL / OOM / abandoned-laptop cases without us needing to stop the
+        container on every graceful exit.
+
+        Opt-out mode (``persist_across_processes=False``) still does
+        ``docker stop`` + ``docker rm -f`` on every cleanup, matching the
+        pre-PR behavior for users who explicitly want per-process isolation.
+
+        ``force_remove=True`` overrides persist mode and always tears the
+        container down (``docker stop`` + ``docker rm -f``). This is the
+        explicit-teardown path for ``/reset``, ``cleanup_vm(task_id)``-driven
+        resets, or any caller that wants a guaranteed fresh container on next
+        ``DockerEnvironment(task_id=...)``. No current caller passes
+        ``force_remove=True``; the parameter is here so the explicit-teardown
+        semantics can be wired up later without changing this method's
+        signature.
+
+        Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls
+        (not the racy ``Popen(... &)`` pattern from before PR #33645). The
+        atexit hook in ``tools/terminal_tool.py`` waits up to 15s for the
+        thread to finish before the interpreter exits, so ``docker stop`` /
+        ``docker rm`` actually completes when we do trigger it.
        """
        container_id = self._container_id
        if not container_id:
-            # Still drop the bind-mount dirs if any were allocated.
+            # Still drop the bind-mount dirs if any were allocated and we're
+            # NOT in persist mode (persist mode preserves them).
            if not self._persistent:
                for d in (self._workspace_dir, self._home_dir):
                    if d:
                        shutil.rmtree(d, ignore_errors=True)
            return

+        # Decide what to actually do. Three cases:
+        #
+        #   force_remove=True             → stop + rm (explicit teardown)
+        #   persist_across_processes=True → no-op (leave container running)
+        #   persist_across_processes=False → stop + rm (per-process isolation)
+        #
+        # The persist-mode no-op is the issue-#20561 contract: the container
+        # outlives Hermes processes, processes inside it stay alive, and
+        # reuse on next startup is instant.
+        if force_remove:
+            should_stop = True
+            should_remove = True
+        elif self._persist_across_processes:
+            # No-op for the container. Drop the in-process handle so a fresh
+            # __init__ will re-probe via labels (and find the running
+            # container) instead of trying to reuse a stale Python reference.
+            self._container_id = None
+            return
+        else:
+            should_stop = True
+            should_remove = True
+
        # Capture state needed by the worker before we null out the attrs —
        # the worker thread can outlive ``self``.
        docker_exe = self._docker_exe
-        should_remove = not self._persist_across_processes
        log_id = container_id[:12]

        def _do_cleanup() -> None:
-            try:
-                subprocess.run(
-                    [docker_exe, "stop", "-t", "10", container_id],
-                    capture_output=True, timeout=30,
-                )
-            except (subprocess.TimeoutExpired, OSError) as e:
-                logger.warning("docker stop %s timed out / failed: %s", log_id, e)
+            if should_stop:
+                try:
+                    subprocess.run(
+                        [docker_exe, "stop", "-t", "10", container_id],
+                        capture_output=True, timeout=30,
+                    )
+                except (subprocess.TimeoutExpired, OSError) as e:
+                    logger.warning("docker stop %s timed out / failed: %s", log_id, e)
            if should_remove:
                try:
                    subprocess.run(
@ -977,7 +1022,10 @@ class DockerEnvironment(BaseEnvironment):
        self._cleanup_thread = t
        self._container_id = None

-        if not self._persistent:
+        # Bind-mount dir teardown only runs when we actually removed the
+        # container (the dirs are the container's filesystem state; keeping
+        # them around with no container would orphan the data on disk).
+        if should_remove and not self._persistent:
            for d in (self._workspace_dir, self._home_dir):
                if d:
                    shutil.rmtree(d, ignore_errors=True)
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@ -1426,8 +1426,21 @@ def cleanup_all_environments():
    return cleaned


-def cleanup_vm(task_id: str):
-    """Manually clean up a specific environment by task_id."""
+def cleanup_vm(task_id: str, *, force_remove: bool = True):
+    """Manually clean up a specific environment by task_id.
+
+    *force_remove* (default True) is forwarded to backends that accept it
+    — currently only ``DockerEnvironment``. The default of True matches the
+    semantics callers expect from this function: ``cleanup_vm(task_id)`` is
+    the explicit-teardown path (called from ``/reset``-style flows,
+    ``AIAgent.close()``, and the idle reaper after a long inactivity window),
+    so the container should be removed regardless of persist mode.
+
+    The idle reaper passes the env through ``env.cleanup()`` directly (not
+    via this function), so persist-mode idle envs are NOT removed by idle
+    reaping — only by explicit ``cleanup_vm()`` or the orphan reaper at next
+    startup. See ``_cleanup_inactive_envs()`` for the idle path.
+    """
    # Remove from tracking dicts while holding the lock, but defer the
    # actual (potentially slow) env.cleanup() call to outside the lock
    # so other tool calls aren't blocked.
@ -1452,7 +1465,14 @@ def cleanup_vm(task_id: str):

    try:
        if hasattr(env, 'cleanup'):
-            env.cleanup()
+            # Pass force_remove only if the env's cleanup() accepts it
+            # (DockerEnvironment after issue #20561; other backends don't).
+            import inspect
+            sig = inspect.signature(env.cleanup)
+            if "force_remove" in sig.parameters:
+                env.cleanup(force_remove=force_remove)
+            else:
+                env.cleanup()
        elif hasattr(env, 'stop'):
            env.stop()
        elif hasattr(env, 'terminate'):