fix(docker): recover from out-of-band container removal in persistent mode (salvage #36631) (#39415)

Salvage of #36631 (@annguyenNous), rebased onto current main with
regression tests added. Fixes #36266.

When a persistent Docker sandbox container is removed out-of-band (idle
reaper, `docker prune`, OOM kill, daemon restart), the gateway kept
issuing `docker exec` against the dead container ID, returning
"No such container" on every subsequent tool call — the agent was
permanently blocked until the gateway process restarted.

DockerEnvironment.execute() now detects the "No such container" /
"is not running" error after a non-zero exit (gated on
persist_across_processes) and calls _recreate_container(): it tries
label-based reuse first, falls back to a fresh container replaying the
same image + full all_run_args set, re-runs init_session(), and retries
the command once. A genuine non-zero exit is NOT misclassified as
container-gone.

Differs from #36631 as submitted: adds the tests the original lacked.
tests/tools/test_docker_environment.py covers _is_container_gone pattern
matching (incl. the negative/control case), the recover-and-retry path,
the persist_across_processes=False opt-out (no recovery), and the
ordinary-failure passthrough (no spurious recreation). _make_dummy_env
now forwards persist_across_processes.

Verified:
- Unit: 67/67 in test_docker_environment.py (4 new + existing).
- Live E2E against the real docker daemon: started a persistent
  container, `docker rm -f`'d it out-of-band, and the next execute()
  transparently recreated a fresh container and succeeded; a follow-up
  command worked in the recovered container; a real `exit N` passed
  through without triggering recovery.

Co-authored-by: annguyenNous <annguyenNous@users.noreply.github.com>
This commit is contained in:
Ben Barclay 2026-06-05 10:33:44 +10:00 committed by GitHub
parent c54b935873
commit 8a888441d7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 247 additions and 0 deletions

View file

@ -44,6 +44,7 @@ def _make_dummy_env(**kwargs):
auto_mount_cwd=kwargs.get("auto_mount_cwd", False),
env=kwargs.get("env"),
run_as_host_user=kwargs.get("run_as_host_user", False),
persist_across_processes=kwargs.get("persist_across_processes", True),
)
@ -1707,3 +1708,128 @@ def test_plain_image_keeps_docker_init_and_run_noexec(monkeypatch):
assert "noexec" in run_mounts[0], (
f"/run must stay noexec for non-s6 images, got: {run_mounts[0]}"
)
# ---------------------------------------------------------------------------
# Out-of-band container removal recovery (issue #36266, PR #36631)
# ---------------------------------------------------------------------------
def test_is_container_gone_matches_removal_errors(monkeypatch):
"""``_is_container_gone`` recognizes the docker errors that mean the
container no longer exists, and does NOT match ordinary command failures.
"""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
_mock_subprocess_run(monkeypatch)
env = _make_dummy_env()
# Positive: the daemon's "container gone" phrasings.
assert env._is_container_gone(
"Error response from daemon: No such container: hermes-abc123"
)
assert env._is_container_gone("Error: No such container: deadbeef")
assert env._is_container_gone(
"Error response from daemon: Container abc is not running"
)
# Control / negative: a real command failure must NOT be misclassified as
# the container being gone — otherwise every non-zero exit would trigger a
# spurious container recreation.
assert not env._is_container_gone("bash: nonsuch: command not found")
assert not env._is_container_gone("Traceback (most recent call last): ...")
assert not env._is_container_gone("")
assert not env._is_container_gone("permission denied")
def test_execute_recovers_from_out_of_band_removal(monkeypatch):
"""When a persistent container is removed out-of-band, ``execute`` detects
the "No such container" error, recreates the container, and retries once
returning success transparently.
"""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
_mock_subprocess_run(monkeypatch)
env = _make_dummy_env(
persistent_filesystem=True,
persist_across_processes=True,
)
# First execute() sees a dead container; second (post-recovery) succeeds.
outputs = iter([
{"output": "Error response from daemon: No such container: hermes-x", "returncode": 1},
{"output": "ok", "returncode": 0},
])
def _fake_super_execute(self, command, cwd="", **kwargs):
return next(outputs)
recreate_calls = []
def _fake_recreate(self):
recreate_calls.append(True)
self._container_id = "recovered-container-id"
return True
monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute)
monkeypatch.setattr(
docker_env.DockerEnvironment, "_recreate_container", _fake_recreate
)
result = env.execute("echo hi")
assert recreate_calls == [True], "recovery should have been attempted exactly once"
assert result.get("returncode") == 0, f"expected success after recovery, got {result!r}"
assert result.get("output") == "ok"
def test_execute_does_not_recover_when_not_persistent(monkeypatch):
"""A non-persistent session must NOT trigger container recreation on a
"No such container" error recovery is only meaningful for the persistent,
cross-process container that can be removed out-of-band.
"""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
_mock_subprocess_run(monkeypatch)
env = _make_dummy_env(
persistent_filesystem=True,
persist_across_processes=False,
)
def _fake_super_execute(self, command, cwd="", **kwargs):
return {"output": "No such container: x", "returncode": 1}
def _fail_recreate(self):
pytest.fail("recreation must not run when persist_across_processes is False")
monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute)
monkeypatch.setattr(
docker_env.DockerEnvironment, "_recreate_container", _fail_recreate
)
result = env.execute("echo hi")
assert result.get("returncode") == 1, "the original error must pass through unchanged"
def test_execute_does_not_recover_on_ordinary_failure(monkeypatch):
"""A genuine non-zero exit that is NOT a container-gone error must pass
through without triggering recovery (guards against over-eager recreation).
"""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
_mock_subprocess_run(monkeypatch)
env = _make_dummy_env(
persistent_filesystem=True,
persist_across_processes=True,
)
def _fake_super_execute(self, command, cwd="", **kwargs):
return {"output": "bash: badcmd: command not found", "returncode": 127}
def _fail_recreate(self):
pytest.fail("recreation must not run for an ordinary command failure")
monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute)
monkeypatch.setattr(
docker_env.DockerEnvironment, "_recreate_container", _fail_recreate
)
result = env.execute("badcmd")
assert result.get("returncode") == 127
assert "command not found" in result.get("output", "")

View file

@ -537,6 +537,10 @@ class DockerEnvironment(BaseEnvironment):
self._env = _normalize_env_dict(env)
self._container_id: Optional[str] = None
self._labels: dict[str, str] = {}
self._image: str = ""
self._container_name: str = ""
self._image_uses_s6_init: bool = False
self._all_run_args: list[str] = []
logger.info(f"DockerEnvironment volumes: {volumes}")
# Ensure volumes is a list (config.yaml could be malformed)
if volumes is not None and not isinstance(volumes, list):
@ -791,6 +795,12 @@ class DockerEnvironment(BaseEnvironment):
"--label", f"hermes-task-id={task_label}",
"--label", f"hermes-profile={profile_name}",
]
# Save args for container recreation on "No such container" recovery.
self._image = image
self._container_name = container_name
self._image_uses_s6_init = image_uses_s6_init
self._all_run_args = all_run_args
self._labels = {
"hermes-agent": "1",
"hermes-task-id": task_label,
@ -945,6 +955,117 @@ class DockerEnvironment(BaseEnvironment):
return _popen_bash(cmd, stdin_data)
# ------------------------------------------------------------------
# "No such container" recovery (issue #36266)
# ------------------------------------------------------------------
_NO_CONTAINER_PATTERNS = (
"No such container",
"is not running",
"no such container",
)
def _is_container_gone(self, output: str) -> bool:
"""Return True if the output indicates the container no longer exists."""
return any(p in output for p in self._NO_CONTAINER_PATTERNS)
def _recreate_container(self) -> bool:
"""Recreate the container after it was removed out-of-band.
Tries label-based reuse first; if no existing container is found,
starts a fresh one with the same image and run-args. Returns True
on success, False if recreation fails (caller should surface the
original error).
"""
old_id = (self._container_id or "")[:12]
logger.warning(
"Container %s appears to be gone — attempting recovery", old_id,
)
self._container_id = None
# 1. Try label-based reuse (another process may have recreated it).
task_label = self._labels.get("hermes-task-id", "")
profile_label = self._labels.get("hermes-profile", "")
existing = self._find_reusable_container(task_label, profile_label)
if existing is not None:
cid, state = existing
if state == "running":
self._container_id = cid
logger.info("Recovery: reusing running container %s", cid[:12])
else:
try:
subprocess.run(
[self._docker_exe, "start", cid],
capture_output=True, text=True, timeout=30, check=True,
)
self._container_id = cid
logger.info("Recovery: restarted container %s", cid[:12])
except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
logger.warning("Recovery: failed to start container %s: %s", cid[:12], e)
# 2. No reusable container — create a fresh one.
if not self._container_id:
if not self._image:
logger.error("Recovery: no saved image name, cannot recreate container")
return False
try:
import uuid as _uuid
new_name = f"hermes-{_uuid.uuid4().hex[:8]}"
init_args = [] if self._image_uses_s6_init else ["--init"]
label_args = []
for k, v in self._labels.items():
label_args.extend(["--label", f"{k}={v}"])
run_cmd = [
self._docker_exe, "run", "-d",
*init_args,
"--name", new_name,
*label_args,
"-w", self.cwd,
*self._all_run_args,
self._image,
"sleep", "infinity",
]
result = subprocess.run(
run_cmd, capture_output=True, text=True, timeout=120, check=True,
)
self._container_id = result.stdout.strip()
self._container_name = new_name
logger.info(
"Recovery: created fresh container %s (%s)",
new_name, self._container_id[:12],
)
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e:
logger.error("Recovery: failed to create new container: %s", e)
return False
# 3. Re-initialize session snapshot in the (re)created container.
try:
self._snapshot_ready = False
self.init_session()
except Exception as e:
logger.error("Recovery: init_session failed in new container: %s", e)
return False
logger.info("Recovery successful — new container %s", (self._container_id or "")[:12])
return True
def execute(self, command: str, cwd: str = "", **kwargs) -> dict:
"""Execute a command, auto-recovering from dead containers.
If the container was removed out-of-band (idle reaper, docker prune,
OOM kill, daemon restart), detect the error and recreate the container
transparently before retrying once.
"""
result = super().execute(command, cwd, **kwargs)
if (
result.get("returncode", 0) != 0
and self._is_container_gone(result.get("output", ""))
and self._persist_across_processes
):
if self._recreate_container():
result = super().execute(command, cwd, **kwargs)
return result
@staticmethod
def _storage_opt_supported() -> bool:
"""Check if Docker's storage driver supports --storage-opt size=.