mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-10 08:32:09 +00:00
fix(docker): recover from out-of-band container removal in persistent mode (salvage #36631) (#39415)
Salvage of #36631 (@annguyenNous), rebased onto current main with regression tests added. Fixes #36266. When a persistent Docker sandbox container is removed out-of-band (idle reaper, `docker prune`, OOM kill, daemon restart), the gateway kept issuing `docker exec` against the dead container ID, returning "No such container" on every subsequent tool call — the agent was permanently blocked until the gateway process restarted. DockerEnvironment.execute() now detects the "No such container" / "is not running" error after a non-zero exit (gated on persist_across_processes) and calls _recreate_container(): it tries label-based reuse first, falls back to a fresh container replaying the same image + full all_run_args set, re-runs init_session(), and retries the command once. A genuine non-zero exit is NOT misclassified as container-gone. Differs from #36631 as submitted: adds the tests the original lacked. tests/tools/test_docker_environment.py covers _is_container_gone pattern matching (incl. the negative/control case), the recover-and-retry path, the persist_across_processes=False opt-out (no recovery), and the ordinary-failure passthrough (no spurious recreation). _make_dummy_env now forwards persist_across_processes. Verified: - Unit: 67/67 in test_docker_environment.py (4 new + existing). - Live E2E against the real docker daemon: started a persistent container, `docker rm -f`'d it out-of-band, and the next execute() transparently recreated a fresh container and succeeded; a follow-up command worked in the recovered container; a real `exit N` passed through without triggering recovery. Co-authored-by: annguyenNous <annguyenNous@users.noreply.github.com>
This commit is contained in:
parent
c54b935873
commit
8a888441d7
2 changed files with 247 additions and 0 deletions
|
|
@ -44,6 +44,7 @@ def _make_dummy_env(**kwargs):
|
|||
auto_mount_cwd=kwargs.get("auto_mount_cwd", False),
|
||||
env=kwargs.get("env"),
|
||||
run_as_host_user=kwargs.get("run_as_host_user", False),
|
||||
persist_across_processes=kwargs.get("persist_across_processes", True),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -1707,3 +1708,128 @@ def test_plain_image_keeps_docker_init_and_run_noexec(monkeypatch):
|
|||
assert "noexec" in run_mounts[0], (
|
||||
f"/run must stay noexec for non-s6 images, got: {run_mounts[0]}"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Out-of-band container removal recovery (issue #36266, PR #36631)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_is_container_gone_matches_removal_errors(monkeypatch):
|
||||
"""``_is_container_gone`` recognizes the docker errors that mean the
|
||||
container no longer exists, and does NOT match ordinary command failures.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
env = _make_dummy_env()
|
||||
|
||||
# Positive: the daemon's "container gone" phrasings.
|
||||
assert env._is_container_gone(
|
||||
"Error response from daemon: No such container: hermes-abc123"
|
||||
)
|
||||
assert env._is_container_gone("Error: No such container: deadbeef")
|
||||
assert env._is_container_gone(
|
||||
"Error response from daemon: Container abc is not running"
|
||||
)
|
||||
|
||||
# Control / negative: a real command failure must NOT be misclassified as
|
||||
# the container being gone — otherwise every non-zero exit would trigger a
|
||||
# spurious container recreation.
|
||||
assert not env._is_container_gone("bash: nonsuch: command not found")
|
||||
assert not env._is_container_gone("Traceback (most recent call last): ...")
|
||||
assert not env._is_container_gone("")
|
||||
assert not env._is_container_gone("permission denied")
|
||||
|
||||
|
||||
def test_execute_recovers_from_out_of_band_removal(monkeypatch):
|
||||
"""When a persistent container is removed out-of-band, ``execute`` detects
|
||||
the "No such container" error, recreates the container, and retries once —
|
||||
returning success transparently.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
env = _make_dummy_env(
|
||||
persistent_filesystem=True,
|
||||
persist_across_processes=True,
|
||||
)
|
||||
|
||||
# First execute() sees a dead container; second (post-recovery) succeeds.
|
||||
outputs = iter([
|
||||
{"output": "Error response from daemon: No such container: hermes-x", "returncode": 1},
|
||||
{"output": "ok", "returncode": 0},
|
||||
])
|
||||
|
||||
def _fake_super_execute(self, command, cwd="", **kwargs):
|
||||
return next(outputs)
|
||||
|
||||
recreate_calls = []
|
||||
|
||||
def _fake_recreate(self):
|
||||
recreate_calls.append(True)
|
||||
self._container_id = "recovered-container-id"
|
||||
return True
|
||||
|
||||
monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute)
|
||||
monkeypatch.setattr(
|
||||
docker_env.DockerEnvironment, "_recreate_container", _fake_recreate
|
||||
)
|
||||
|
||||
result = env.execute("echo hi")
|
||||
|
||||
assert recreate_calls == [True], "recovery should have been attempted exactly once"
|
||||
assert result.get("returncode") == 0, f"expected success after recovery, got {result!r}"
|
||||
assert result.get("output") == "ok"
|
||||
|
||||
|
||||
def test_execute_does_not_recover_when_not_persistent(monkeypatch):
|
||||
"""A non-persistent session must NOT trigger container recreation on a
|
||||
"No such container" error — recovery is only meaningful for the persistent,
|
||||
cross-process container that can be removed out-of-band.
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
env = _make_dummy_env(
|
||||
persistent_filesystem=True,
|
||||
persist_across_processes=False,
|
||||
)
|
||||
|
||||
def _fake_super_execute(self, command, cwd="", **kwargs):
|
||||
return {"output": "No such container: x", "returncode": 1}
|
||||
|
||||
def _fail_recreate(self):
|
||||
pytest.fail("recreation must not run when persist_across_processes is False")
|
||||
|
||||
monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute)
|
||||
monkeypatch.setattr(
|
||||
docker_env.DockerEnvironment, "_recreate_container", _fail_recreate
|
||||
)
|
||||
|
||||
result = env.execute("echo hi")
|
||||
assert result.get("returncode") == 1, "the original error must pass through unchanged"
|
||||
|
||||
|
||||
def test_execute_does_not_recover_on_ordinary_failure(monkeypatch):
|
||||
"""A genuine non-zero exit that is NOT a container-gone error must pass
|
||||
through without triggering recovery (guards against over-eager recreation).
|
||||
"""
|
||||
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
|
||||
_mock_subprocess_run(monkeypatch)
|
||||
env = _make_dummy_env(
|
||||
persistent_filesystem=True,
|
||||
persist_across_processes=True,
|
||||
)
|
||||
|
||||
def _fake_super_execute(self, command, cwd="", **kwargs):
|
||||
return {"output": "bash: badcmd: command not found", "returncode": 127}
|
||||
|
||||
def _fail_recreate(self):
|
||||
pytest.fail("recreation must not run for an ordinary command failure")
|
||||
|
||||
monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute)
|
||||
monkeypatch.setattr(
|
||||
docker_env.DockerEnvironment, "_recreate_container", _fail_recreate
|
||||
)
|
||||
|
||||
result = env.execute("badcmd")
|
||||
assert result.get("returncode") == 127
|
||||
assert "command not found" in result.get("output", "")
|
||||
|
|
|
|||
|
|
@ -537,6 +537,10 @@ class DockerEnvironment(BaseEnvironment):
|
|||
self._env = _normalize_env_dict(env)
|
||||
self._container_id: Optional[str] = None
|
||||
self._labels: dict[str, str] = {}
|
||||
self._image: str = ""
|
||||
self._container_name: str = ""
|
||||
self._image_uses_s6_init: bool = False
|
||||
self._all_run_args: list[str] = []
|
||||
logger.info(f"DockerEnvironment volumes: {volumes}")
|
||||
# Ensure volumes is a list (config.yaml could be malformed)
|
||||
if volumes is not None and not isinstance(volumes, list):
|
||||
|
|
@ -791,6 +795,12 @@ class DockerEnvironment(BaseEnvironment):
|
|||
"--label", f"hermes-task-id={task_label}",
|
||||
"--label", f"hermes-profile={profile_name}",
|
||||
]
|
||||
# Save args for container recreation on "No such container" recovery.
|
||||
self._image = image
|
||||
self._container_name = container_name
|
||||
self._image_uses_s6_init = image_uses_s6_init
|
||||
self._all_run_args = all_run_args
|
||||
|
||||
self._labels = {
|
||||
"hermes-agent": "1",
|
||||
"hermes-task-id": task_label,
|
||||
|
|
@ -945,6 +955,117 @@ class DockerEnvironment(BaseEnvironment):
|
|||
|
||||
return _popen_bash(cmd, stdin_data)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# "No such container" recovery (issue #36266)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
_NO_CONTAINER_PATTERNS = (
|
||||
"No such container",
|
||||
"is not running",
|
||||
"no such container",
|
||||
)
|
||||
|
||||
def _is_container_gone(self, output: str) -> bool:
|
||||
"""Return True if the output indicates the container no longer exists."""
|
||||
return any(p in output for p in self._NO_CONTAINER_PATTERNS)
|
||||
|
||||
def _recreate_container(self) -> bool:
|
||||
"""Recreate the container after it was removed out-of-band.
|
||||
|
||||
Tries label-based reuse first; if no existing container is found,
|
||||
starts a fresh one with the same image and run-args. Returns True
|
||||
on success, False if recreation fails (caller should surface the
|
||||
original error).
|
||||
"""
|
||||
old_id = (self._container_id or "")[:12]
|
||||
logger.warning(
|
||||
"Container %s appears to be gone — attempting recovery", old_id,
|
||||
)
|
||||
self._container_id = None
|
||||
|
||||
# 1. Try label-based reuse (another process may have recreated it).
|
||||
task_label = self._labels.get("hermes-task-id", "")
|
||||
profile_label = self._labels.get("hermes-profile", "")
|
||||
existing = self._find_reusable_container(task_label, profile_label)
|
||||
if existing is not None:
|
||||
cid, state = existing
|
||||
if state == "running":
|
||||
self._container_id = cid
|
||||
logger.info("Recovery: reusing running container %s", cid[:12])
|
||||
else:
|
||||
try:
|
||||
subprocess.run(
|
||||
[self._docker_exe, "start", cid],
|
||||
capture_output=True, text=True, timeout=30, check=True,
|
||||
)
|
||||
self._container_id = cid
|
||||
logger.info("Recovery: restarted container %s", cid[:12])
|
||||
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
|
||||
logger.warning("Recovery: failed to start container %s: %s", cid[:12], e)
|
||||
|
||||
# 2. No reusable container — create a fresh one.
|
||||
if not self._container_id:
|
||||
if not self._image:
|
||||
logger.error("Recovery: no saved image name, cannot recreate container")
|
||||
return False
|
||||
try:
|
||||
import uuid as _uuid
|
||||
new_name = f"hermes-{_uuid.uuid4().hex[:8]}"
|
||||
init_args = [] if self._image_uses_s6_init else ["--init"]
|
||||
label_args = []
|
||||
for k, v in self._labels.items():
|
||||
label_args.extend(["--label", f"{k}={v}"])
|
||||
run_cmd = [
|
||||
self._docker_exe, "run", "-d",
|
||||
*init_args,
|
||||
"--name", new_name,
|
||||
*label_args,
|
||||
"-w", self.cwd,
|
||||
*self._all_run_args,
|
||||
self._image,
|
||||
"sleep", "infinity",
|
||||
]
|
||||
result = subprocess.run(
|
||||
run_cmd, capture_output=True, text=True, timeout=120, check=True,
|
||||
)
|
||||
self._container_id = result.stdout.strip()
|
||||
self._container_name = new_name
|
||||
logger.info(
|
||||
"Recovery: created fresh container %s (%s)",
|
||||
new_name, self._container_id[:12],
|
||||
)
|
||||
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
|
||||
logger.error("Recovery: failed to create new container: %s", e)
|
||||
return False
|
||||
|
||||
# 3. Re-initialize session snapshot in the (re)created container.
|
||||
try:
|
||||
self._snapshot_ready = False
|
||||
self.init_session()
|
||||
except Exception as e:
|
||||
logger.error("Recovery: init_session failed in new container: %s", e)
|
||||
return False
|
||||
|
||||
logger.info("Recovery successful — new container %s", (self._container_id or "")[:12])
|
||||
return True
|
||||
|
||||
def execute(self, command: str, cwd: str = "", **kwargs) -> dict:
|
||||
"""Execute a command, auto-recovering from dead containers.
|
||||
|
||||
If the container was removed out-of-band (idle reaper, docker prune,
|
||||
OOM kill, daemon restart), detect the error and recreate the container
|
||||
transparently before retrying once.
|
||||
"""
|
||||
result = super().execute(command, cwd, **kwargs)
|
||||
if (
|
||||
result.get("returncode", 0) != 0
|
||||
and self._is_container_gone(result.get("output", ""))
|
||||
and self._persist_across_processes
|
||||
):
|
||||
if self._recreate_container():
|
||||
result = super().execute(command, cwd, **kwargs)
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _storage_opt_supported() -> bool:
|
||||
"""Check if Docker's storage driver supports --storage-opt size=.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue