mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(docker): persist-mode cleanup is no-op; add force_remove kwarg (#20561)
The first iteration of this PR did docker stop on every cleanup in persist mode (only skipping docker rm). Ben caught this as contradicting the documented "ONE long-lived container shared across sessions" semantics: stopping the container on every Hermes /quit kills any background processes inside (npm watchers, pytest watchers, long-running scripts) — exactly the case persist mode is supposed to protect. This commit splits the cleanup paths cleanly: * **Persist mode (default)** — cleanup() is a NO-OP for the container. Container stays running, processes survive, next Hermes process attaches via the existing label probe in ~ms instead of waiting for docker start. Resource reclamation happens via the orphan reaper at next startup (2 × lifetime_seconds threshold), which covers the SIGKILL / OOM / abandoned-laptop cases. * **Opt-out mode (persist_across_processes=False)** — unchanged: docker stop + docker rm -f on cleanup as before. * **Explicit teardown** — new cleanup(force_remove=True) kwarg overrides persist mode and tears the container down unconditionally. cleanup_vm(task_id) now defaults to force_remove=True since it's the user-driven reset path (called from AIAgent.close(), /reset-style flows, and the idle reaper's per-turn cleanup). The idle reaper in _cleanup_inactive_envs calls env.cleanup() directly with no kwargs, so idle persist-mode envs are no-op'd — the container survives the in-process pop and the next tool call re-probes via labels. No state leak: _container_id is still cleared on the in-process handle. E2E verified against real Docker: ✓ Container is still running after cleanup() ✓ Background process (sleep loop) survived cleanup() ✓ Filesystem state preserved across cleanup() ✓ In-process container_id cleared (next __init__ will re-probe) ✓ Background process visible from reused env (no docker start happened) ✓ force_remove=True removed the container even in persist mode ✓ cleanup_vm() removed the container (defaults to force_remove=True) Test changes: * Replaces `test_cleanup_with_persist_only_stops_no_rm` with `test_cleanup_with_persist_is_noop_for_container` — asserts neither stop nor rm runs in persist mode, and the in-process handle is cleared so re-probe works. * Adds `test_cleanup_force_remove_stops_and_rms_even_in_persist_mode` — covers the new kwarg. * Updates `test_cleanup_uses_subprocess_run_not_detached_shell` and `test_wait_for_cleanup_after_cleanup_returns_true` to pass `force_remove=True` so they actually exercise the docker code path (default no-op would trivially pass). cleanup_vm() forwards `force_remove` only to backends whose cleanup() accepts the kwarg (currently just DockerEnvironment) via runtime signature inspection — Modal/Daytona/SSH `cleanup()` signatures are unchanged. Refs #20561
This commit is contained in:
parent
d77d877665
commit
5c2170a7c6
3 changed files with 166 additions and 41 deletions
|
|
@ -837,12 +837,16 @@ def _install_fake_thread(monkeypatch):
|
|||
monkeypatch.setattr(threading, "Thread", _FakeThread)
|
||||
|
||||
|
||||
def test_cleanup_with_persist_only_stops_no_rm(monkeypatch):
|
||||
"""``persist_across_processes=True`` (default) cleanup must docker stop
|
||||
the container but NEVER docker rm — the container has to survive so the
|
||||
next Hermes process can reuse it. Issue #20561 — the previous code
|
||||
matched this on the `_persistent` flag instead of a dedicated
|
||||
cross-process flag, which made reuse impossible."""
|
||||
def test_cleanup_with_persist_is_noop_for_container(monkeypatch):
|
||||
"""``persist_across_processes=True`` (default) cleanup must NEITHER stop
|
||||
NOR remove the container — the docs promise "ONE long-lived container
|
||||
shared across sessions", and any docker stop would kill background
|
||||
processes inside the container (npm watchers, pytest watchers, etc.).
|
||||
|
||||
Resource reclamation in this mode happens via the orphan reaper on next
|
||||
Hermes startup, not on graceful exit. Issue #20561 — the first iteration
|
||||
of this PR did docker stop here, which Ben caught as contradicting the
|
||||
"ONE long-lived container" semantics."""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
|
|
@ -866,11 +870,55 @@ def test_cleanup_with_persist_only_stops_no_rm(monkeypatch):
|
|||
|
||||
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
|
||||
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
|
||||
assert stops, f"expected docker stop call, got cleanup_calls: {cleanup_calls}"
|
||||
assert not stops, (
|
||||
f"docker stop must NOT be called when persist_across_processes=True; "
|
||||
f"container has to stay running so background processes survive. "
|
||||
f"Got: {stops}"
|
||||
)
|
||||
assert not rms, (
|
||||
f"docker rm must NOT be called when persist_across_processes=True; "
|
||||
f"reuse would be impossible. Got: {rms}"
|
||||
)
|
||||
# The in-process handle must still be cleared so the next __init__
|
||||
# re-probes via labels (and reuses the still-running container).
|
||||
assert env._container_id is None, (
|
||||
"in-process container_id should be cleared even in no-op cleanup"
|
||||
)
|
||||
|
||||
|
||||
def test_cleanup_force_remove_stops_and_rms_even_in_persist_mode(monkeypatch):
|
||||
"""``cleanup(force_remove=True)`` must stop AND rm the container even
|
||||
when ``persist_across_processes=True``. This is the explicit-teardown
|
||||
path for ``/reset``, ``cleanup_vm(task_id, force_remove=True)``, and any
|
||||
future caller that wants a guaranteed fresh container.
|
||||
|
||||
Without this kwarg, callers in persist mode would have no way to force a
|
||||
fresh container without also flipping the global config — too coarse for
|
||||
a per-task reset.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
env = _make_dummy_env(task_id="cleanup-force", persistent_filesystem=False)
|
||||
assert env._container_id
|
||||
|
||||
cleanup_calls = []
|
||||
real_run = docker_env.subprocess.run
|
||||
|
||||
def _capturing_run(cmd, **kwargs):
|
||||
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
|
||||
return real_run(cmd, **kwargs)
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
|
||||
|
||||
env.cleanup(force_remove=True)
|
||||
|
||||
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
|
||||
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
|
||||
assert stops, f"force_remove must docker stop; got: {cleanup_calls}"
|
||||
assert rms, f"force_remove must docker rm; got: {cleanup_calls}"
|
||||
|
||||
|
||||
def test_cleanup_with_persist_disabled_stops_and_rms(monkeypatch):
|
||||
|
|
@ -914,12 +962,15 @@ def test_cleanup_with_persist_disabled_stops_and_rms(monkeypatch):
|
|||
|
||||
|
||||
def test_cleanup_uses_subprocess_run_not_detached_shell(monkeypatch):
|
||||
"""The pre-fix code used ``subprocess.Popen(\"... &\", shell=True)`` which
|
||||
"""The pre-fix code used ``subprocess.Popen("... &", shell=True)`` which
|
||||
raced with parent-process exit and silently dropped cleanup work. The
|
||||
new code must use ``subprocess.run`` with bounded ``timeout=`` so the
|
||||
work actually completes within the process lifetime.
|
||||
|
||||
Asserts cleanup never reaches into shell-mode Popen.
|
||||
Asserts cleanup never reaches into shell-mode Popen. Uses
|
||||
``force_remove=True`` so cleanup actually issues docker calls — the
|
||||
default persist-mode path is now a no-op (commit 4) and would trivially
|
||||
pass this assertion without exercising the docker code at all.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
|
|
@ -935,7 +986,7 @@ def test_cleanup_uses_subprocess_run_not_detached_shell(monkeypatch):
|
|||
monkeypatch.setattr(docker_env.subprocess, "Popen", _forbidden_popen)
|
||||
|
||||
env = _make_dummy_env(task_id="no-popen-cleanup")
|
||||
env.cleanup() # must not raise
|
||||
env.cleanup(force_remove=True) # must not raise
|
||||
|
||||
|
||||
def test_wait_for_cleanup_returns_true_when_no_thread_started():
|
||||
|
|
@ -951,14 +1002,20 @@ def test_wait_for_cleanup_returns_true_when_no_thread_started():
|
|||
def test_wait_for_cleanup_after_cleanup_returns_true(monkeypatch):
|
||||
"""End-to-end: cleanup() starts a thread, wait_for_cleanup() joins it
|
||||
and reports completion. Atexit relies on this contract to ensure docker
|
||||
stop/rm actually finishes before the Python interpreter exits."""
|
||||
stop/rm actually finishes before the Python interpreter exits.
|
||||
|
||||
Uses ``force_remove=True`` so cleanup actually starts a worker thread —
|
||||
the default persist-mode cleanup is a no-op (commit 4) and never spawns
|
||||
a thread, so the trivial "no thread" branch of wait_for_cleanup is
|
||||
already covered by the previous test.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
env = _make_dummy_env(task_id="wait-test")
|
||||
env.cleanup()
|
||||
env.cleanup(force_remove=True)
|
||||
assert env.wait_for_cleanup(timeout=5.0) is True
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -913,49 +913,94 @@ class DockerEnvironment(BaseEnvironment):
|
|||
running = (cid, state)
|
||||
return running or first
|
||||
|
||||
def cleanup(self):
|
||||
"""Stop (and optionally remove) the container.
|
||||
def cleanup(self, *, force_remove: bool = False):
|
||||
"""Tear down the container according to persist mode and *force_remove*.
|
||||
|
||||
Behavior depends on ``persist_across_processes`` (init kwarg):
|
||||
Persist-mode (``persist_across_processes=True``, the default) leaves the
|
||||
container **running** untouched. The docs promise "ONE long-lived
|
||||
container shared across sessions" and stopping it on every Hermes exit
|
||||
breaks that promise:
|
||||
|
||||
* **True** (default) — only ``docker stop`` so the container is
|
||||
available for reuse by the next Hermes process. The orphan-reaper
|
||||
eventually removes it if no subsequent process picks it up.
|
||||
* **False** — ``docker stop`` followed by ``docker rm -f``, regardless
|
||||
of ``persistent_filesystem``. The previous ``rm`` path was gated on
|
||||
``not self._persistent`` which meant ``container_persistent: true``
|
||||
users (the default) leaked Exited containers forever (issue #20561).
|
||||
* Background processes inside the container (``npm run dev``, watchers,
|
||||
long-running pytest) get killed every time the user runs ``/quit``.
|
||||
* Every reuse requires ``docker start`` + waiting for the container to
|
||||
come back up, adding 1–2s to the first tool call of the new session.
|
||||
* The user-visible difference between "ONE long-lived container" and
|
||||
"a new container that happens to share state" is exactly this:
|
||||
processes survive in the former, die in the latter.
|
||||
|
||||
Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls,
|
||||
not the previous fire-and-forget ``Popen(... &)`` shell construct.
|
||||
That pattern raced with parent-process exit and silently dropped
|
||||
cleanup work when the parent didn't outlive the detached shell — the
|
||||
primary mechanism behind Exited-container accumulation under SIGTERM
|
||||
/ ``hermes /quit`` / dead terminals.
|
||||
Resource reclamation for the persist-mode case lives in the
|
||||
``reap_orphan_containers()`` path (see issue #20561 commit 3): if no
|
||||
Hermes process touches a labeled container for ``2 × lifetime_seconds``
|
||||
it gets ``docker rm -f``'d at the next Hermes startup. That covers the
|
||||
SIGKILL / OOM / abandoned-laptop cases without us needing to stop the
|
||||
container on every graceful exit.
|
||||
|
||||
Opt-out mode (``persist_across_processes=False``) still does
|
||||
``docker stop`` + ``docker rm -f`` on every cleanup, matching the
|
||||
pre-PR behavior for users who explicitly want per-process isolation.
|
||||
|
||||
``force_remove=True`` overrides persist mode and always tears the
|
||||
container down (``docker stop`` + ``docker rm -f``). This is the
|
||||
explicit-teardown path for ``/reset``, ``cleanup_vm(task_id)``-driven
|
||||
resets, or any caller that wants a guaranteed fresh container on next
|
||||
``DockerEnvironment(task_id=...)``. No current caller passes
|
||||
``force_remove=True``; the parameter is here so the explicit-teardown
|
||||
semantics can be wired up later without changing this method's
|
||||
signature.
|
||||
|
||||
Cleanup runs on a daemon thread with bounded ``subprocess.run`` calls
|
||||
(not the racy ``Popen(... &)`` pattern from before PR #33645). The
|
||||
atexit hook in ``tools/terminal_tool.py`` waits up to 15s for the
|
||||
thread to finish before the interpreter exits, so ``docker stop`` /
|
||||
``docker rm`` actually completes when we do trigger it.
|
||||
"""
|
||||
container_id = self._container_id
|
||||
if not container_id:
|
||||
# Still drop the bind-mount dirs if any were allocated.
|
||||
# Still drop the bind-mount dirs if any were allocated and we're
|
||||
# NOT in persist mode (persist mode preserves them).
|
||||
if not self._persistent:
|
||||
for d in (self._workspace_dir, self._home_dir):
|
||||
if d:
|
||||
shutil.rmtree(d, ignore_errors=True)
|
||||
return
|
||||
|
||||
# Decide what to actually do. Three cases:
|
||||
#
|
||||
# force_remove=True → stop + rm (explicit teardown)
|
||||
# persist_across_processes=True → no-op (leave container running)
|
||||
# persist_across_processes=False → stop + rm (per-process isolation)
|
||||
#
|
||||
# The persist-mode no-op is the issue-#20561 contract: the container
|
||||
# outlives Hermes processes, processes inside it stay alive, and
|
||||
# reuse on next startup is instant.
|
||||
if force_remove:
|
||||
should_stop = True
|
||||
should_remove = True
|
||||
elif self._persist_across_processes:
|
||||
# No-op for the container. Drop the in-process handle so a fresh
|
||||
# __init__ will re-probe via labels (and find the running
|
||||
# container) instead of trying to reuse a stale Python reference.
|
||||
self._container_id = None
|
||||
return
|
||||
else:
|
||||
should_stop = True
|
||||
should_remove = True
|
||||
|
||||
# Capture state needed by the worker before we null out the attrs —
|
||||
# the worker thread can outlive ``self``.
|
||||
docker_exe = self._docker_exe
|
||||
should_remove = not self._persist_across_processes
|
||||
log_id = container_id[:12]
|
||||
|
||||
def _do_cleanup() -> None:
|
||||
try:
|
||||
subprocess.run(
|
||||
[docker_exe, "stop", "-t", "10", container_id],
|
||||
capture_output=True, timeout=30,
|
||||
)
|
||||
except (subprocess.TimeoutExpired, OSError) as e:
|
||||
logger.warning("docker stop %s timed out / failed: %s", log_id, e)
|
||||
if should_stop:
|
||||
try:
|
||||
subprocess.run(
|
||||
[docker_exe, "stop", "-t", "10", container_id],
|
||||
capture_output=True, timeout=30,
|
||||
)
|
||||
except (subprocess.TimeoutExpired, OSError) as e:
|
||||
logger.warning("docker stop %s timed out / failed: %s", log_id, e)
|
||||
if should_remove:
|
||||
try:
|
||||
subprocess.run(
|
||||
|
|
@ -977,7 +1022,10 @@ class DockerEnvironment(BaseEnvironment):
|
|||
self._cleanup_thread = t
|
||||
self._container_id = None
|
||||
|
||||
if not self._persistent:
|
||||
# Bind-mount dir teardown only runs when we actually removed the
|
||||
# container (the dirs are the container's filesystem state; keeping
|
||||
# them around with no container would orphan the data on disk).
|
||||
if should_remove and not self._persistent:
|
||||
for d in (self._workspace_dir, self._home_dir):
|
||||
if d:
|
||||
shutil.rmtree(d, ignore_errors=True)
|
||||
|
|
|
|||
|
|
@ -1426,8 +1426,21 @@ def cleanup_all_environments():
|
|||
return cleaned
|
||||
|
||||
|
||||
def cleanup_vm(task_id: str):
|
||||
"""Manually clean up a specific environment by task_id."""
|
||||
def cleanup_vm(task_id: str, *, force_remove: bool = True):
|
||||
"""Manually clean up a specific environment by task_id.
|
||||
|
||||
*force_remove* (default True) is forwarded to backends that accept it
|
||||
— currently only ``DockerEnvironment``. The default of True matches the
|
||||
semantics callers expect from this function: ``cleanup_vm(task_id)`` is
|
||||
the explicit-teardown path (called from ``/reset``-style flows,
|
||||
``AIAgent.close()``, and the idle reaper after a long inactivity window),
|
||||
so the container should be removed regardless of persist mode.
|
||||
|
||||
The idle reaper passes the env through ``env.cleanup()`` directly (not
|
||||
via this function), so persist-mode idle envs are NOT removed by idle
|
||||
reaping — only by explicit ``cleanup_vm()`` or the orphan reaper at next
|
||||
startup. See ``_cleanup_inactive_envs()`` for the idle path.
|
||||
"""
|
||||
# Remove from tracking dicts while holding the lock, but defer the
|
||||
# actual (potentially slow) env.cleanup() call to outside the lock
|
||||
# so other tool calls aren't blocked.
|
||||
|
|
@ -1452,7 +1465,14 @@ def cleanup_vm(task_id: str):
|
|||
|
||||
try:
|
||||
if hasattr(env, 'cleanup'):
|
||||
env.cleanup()
|
||||
# Pass force_remove only if the env's cleanup() accepts it
|
||||
# (DockerEnvironment after issue #20561; other backends don't).
|
||||
import inspect
|
||||
sig = inspect.signature(env.cleanup)
|
||||
if "force_remove" in sig.parameters:
|
||||
env.cleanup(force_remove=force_remove)
|
||||
else:
|
||||
env.cleanup()
|
||||
elif hasattr(env, 'stop'):
|
||||
env.stop()
|
||||
elif hasattr(env, 'terminate'):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue