mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-02 07:11:49 +00:00
fix(docker): cleanup_vm() default honors persist mode (don't kill container on session close)
Commit 4 made cleanup_vm() default to force_remove=True, which was wrong: cleanup_vm() is called from AIAgent.close() (TUI session close at tui_gateway/server.py:2991, gateway session teardown at gateway/run.py:3569) and from per-turn cleanup (agent/chat_completion_helpers.py:1517). All three are session-lifecycle events that should honor persist mode, not explicit user-initiated teardown. Ben reported the symptom: container shared between multiple TUI sessions (good) but killed as soon as any session closed (bad). With force_remove=True as the default, every `session.close` JSON-RPC tore down the container. The fix is to flip cleanup_vm()'s force_remove default back to False. The kwarg still exists for future explicit-teardown paths (`/reset`-style flows, "destroy my sandbox" commands) that haven't been wired up yet. Two new unit tests pin the behavior: * `test_cleanup_vm_default_honors_persist_mode` — asserts `cleanup_vm(task_id)` does neither docker stop nor docker rm on a persist-mode container (the regression Ben caught). * `test_cleanup_vm_force_remove_tears_down_persist_container` — asserts the kwarg still flows through the runtime-signature-inspection plumbing to the backend's cleanup(). E2E verified against real Docker (in addition to all 17 existing checks): ✓ Default cleanup_vm() leaves persist-mode container running ✓ cleanup_vm(force_remove=True) removed the container Refs #20561
This commit is contained in:
parent
5c2170a7c6
commit
2f0f03c40d
2 changed files with 101 additions and 10 deletions
|
|
@ -921,6 +921,91 @@ def test_cleanup_force_remove_stops_and_rms_even_in_persist_mode(monkeypatch):
|
|||
assert rms, f"force_remove must docker rm; got: {cleanup_calls}"
|
||||
|
||||
|
||||
def test_cleanup_vm_default_honors_persist_mode(monkeypatch):
|
||||
"""``cleanup_vm(task_id)`` without ``force_remove=True`` must be a no-op
|
||||
for a persist-mode container.
|
||||
|
||||
Regression for the bug Ben caught after commit 4: ``AIAgent.close()``
|
||||
(which is called from ``tui_gateway/server.py`` on session.close, from
|
||||
``gateway/run.py`` on per-session teardown, and from per-turn cleanup)
|
||||
calls ``cleanup_vm(task_id)``. If that defaulted to ``force_remove=True``
|
||||
we'd tear down the container on every TUI session close, defeating the
|
||||
"ONE long-lived container shared across sessions" contract.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
from tools import terminal_tool
|
||||
|
||||
env = _make_dummy_env(task_id="session-close-test")
|
||||
container_id = env._container_id
|
||||
terminal_tool._active_environments["session-close-test"] = env
|
||||
|
||||
cleanup_calls = []
|
||||
real_run = docker_env.subprocess.run
|
||||
|
||||
def _capturing_run(cmd, **kwargs):
|
||||
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
|
||||
return real_run(cmd, **kwargs)
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
|
||||
|
||||
try:
|
||||
terminal_tool.cleanup_vm("session-close-test")
|
||||
finally:
|
||||
terminal_tool._active_environments.pop("session-close-test", None)
|
||||
|
||||
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
|
||||
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
|
||||
assert not stops, (
|
||||
f"cleanup_vm() default must not docker stop a persist-mode container; "
|
||||
f"got: {stops}"
|
||||
)
|
||||
assert not rms, (
|
||||
f"cleanup_vm() default must not docker rm a persist-mode container; "
|
||||
f"got: {rms}"
|
||||
)
|
||||
|
||||
|
||||
def test_cleanup_vm_force_remove_tears_down_persist_container(monkeypatch):
|
||||
"""``cleanup_vm(task_id, force_remove=True)`` tears down a persist-mode
|
||||
container — the explicit-teardown path for ``/reset``-style flows.
|
||||
|
||||
Also pins the runtime-signature-inspection plumbing: the kwarg must
|
||||
actually flow through ``cleanup_vm`` into the backend's ``cleanup()``.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
monkeypatch.setattr(docker_env, "_get_active_profile_name", lambda: "default")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
_install_fake_thread(monkeypatch)
|
||||
|
||||
from tools import terminal_tool
|
||||
|
||||
env = _make_dummy_env(task_id="explicit-teardown-test")
|
||||
terminal_tool._active_environments["explicit-teardown-test"] = env
|
||||
|
||||
cleanup_calls = []
|
||||
real_run = docker_env.subprocess.run
|
||||
|
||||
def _capturing_run(cmd, **kwargs):
|
||||
cleanup_calls.append((list(cmd) if isinstance(cmd, list) else cmd, kwargs))
|
||||
return real_run(cmd, **kwargs)
|
||||
|
||||
monkeypatch.setattr(docker_env.subprocess, "run", _capturing_run)
|
||||
|
||||
try:
|
||||
terminal_tool.cleanup_vm("explicit-teardown-test", force_remove=True)
|
||||
finally:
|
||||
terminal_tool._active_environments.pop("explicit-teardown-test", None)
|
||||
|
||||
stops = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "stop"]
|
||||
rms = [c for c in cleanup_calls if isinstance(c[0], list) and len(c[0]) >= 2 and c[0][1] == "rm"]
|
||||
assert stops, f"force_remove must reach docker stop; got: {cleanup_calls}"
|
||||
assert rms, f"force_remove must reach docker rm; got: {cleanup_calls}"
|
||||
|
||||
|
||||
def test_cleanup_with_persist_disabled_stops_and_rms(monkeypatch):
|
||||
"""``persist_across_processes=False`` cleanup must docker stop AND docker
|
||||
rm so containers don't leak. Crucially, this runs regardless of the
|
||||
|
|
|
|||
|
|
@ -1426,20 +1426,26 @@ def cleanup_all_environments():
|
|||
return cleaned
|
||||
|
||||
|
||||
def cleanup_vm(task_id: str, *, force_remove: bool = True):
|
||||
def cleanup_vm(task_id: str, *, force_remove: bool = False):
|
||||
"""Manually clean up a specific environment by task_id.
|
||||
|
||||
*force_remove* (default True) is forwarded to backends that accept it
|
||||
— currently only ``DockerEnvironment``. The default of True matches the
|
||||
semantics callers expect from this function: ``cleanup_vm(task_id)`` is
|
||||
the explicit-teardown path (called from ``/reset``-style flows,
|
||||
``AIAgent.close()``, and the idle reaper after a long inactivity window),
|
||||
so the container should be removed regardless of persist mode.
|
||||
*force_remove* (default False) is forwarded to backends that accept it
|
||||
— currently only ``DockerEnvironment``. The default of False matches
|
||||
session-lifecycle semantics: this function is called from
|
||||
``AIAgent.close()`` (TUI session close, gateway session teardown) and the
|
||||
per-turn cleanup branch for non-persistent envs, both of which should
|
||||
honor the user's persist-mode preference. Stopping the container here
|
||||
would defeat the "ONE long-lived container shared across sessions"
|
||||
contract — exactly the bug Ben reported when the container was killed
|
||||
on every TUI session close.
|
||||
|
||||
Pass ``force_remove=True`` for actual user-initiated teardown
|
||||
(e.g. ``/reset``-style flows that haven't been wired yet, or future
|
||||
"destroy my sandbox" commands).
|
||||
|
||||
The idle reaper passes the env through ``env.cleanup()`` directly (not
|
||||
via this function), so persist-mode idle envs are NOT removed by idle
|
||||
reaping — only by explicit ``cleanup_vm()`` or the orphan reaper at next
|
||||
startup. See ``_cleanup_inactive_envs()`` for the idle path.
|
||||
via this function), so persist-mode idle envs are similarly no-op'd —
|
||||
only the orphan reaper at next startup reclaims them.
|
||||
"""
|
||||
# Remove from tracking dicts while holding the lock, but defer the
|
||||
# actual (potentially slow) env.cleanup() call to outside the lock
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue