diff --git a/tests/tools/test_docker_environment.py b/tests/tools/test_docker_environment.py index 90f4d5dcd82..04935d81dfc 100644 --- a/tests/tools/test_docker_environment.py +++ b/tests/tools/test_docker_environment.py @@ -44,6 +44,7 @@ def _make_dummy_env(**kwargs): auto_mount_cwd=kwargs.get("auto_mount_cwd", False), env=kwargs.get("env"), run_as_host_user=kwargs.get("run_as_host_user", False), + persist_across_processes=kwargs.get("persist_across_processes", True), ) @@ -1707,3 +1708,128 @@ def test_plain_image_keeps_docker_init_and_run_noexec(monkeypatch): assert "noexec" in run_mounts[0], ( f"/run must stay noexec for non-s6 images, got: {run_mounts[0]}" ) + + +# --------------------------------------------------------------------------- +# Out-of-band container removal recovery (issue #36266, PR #36631) +# --------------------------------------------------------------------------- + + +def test_is_container_gone_matches_removal_errors(monkeypatch): + """``_is_container_gone`` recognizes the docker errors that mean the + container no longer exists, and does NOT match ordinary command failures. + """ + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + _mock_subprocess_run(monkeypatch) + env = _make_dummy_env() + + # Positive: the daemon's "container gone" phrasings. + assert env._is_container_gone( + "Error response from daemon: No such container: hermes-abc123" + ) + assert env._is_container_gone("Error: No such container: deadbeef") + assert env._is_container_gone( + "Error response from daemon: Container abc is not running" + ) + + # Control / negative: a real command failure must NOT be misclassified as + # the container being gone — otherwise every non-zero exit would trigger a + # spurious container recreation. + assert not env._is_container_gone("bash: nonsuch: command not found") + assert not env._is_container_gone("Traceback (most recent call last): ...") + assert not env._is_container_gone("") + assert not env._is_container_gone("permission denied") + + +def test_execute_recovers_from_out_of_band_removal(monkeypatch): + """When a persistent container is removed out-of-band, ``execute`` detects + the "No such container" error, recreates the container, and retries once — + returning success transparently. + """ + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + _mock_subprocess_run(monkeypatch) + env = _make_dummy_env( + persistent_filesystem=True, + persist_across_processes=True, + ) + + # First execute() sees a dead container; second (post-recovery) succeeds. + outputs = iter([ + {"output": "Error response from daemon: No such container: hermes-x", "returncode": 1}, + {"output": "ok", "returncode": 0}, + ]) + + def _fake_super_execute(self, command, cwd="", **kwargs): + return next(outputs) + + recreate_calls = [] + + def _fake_recreate(self): + recreate_calls.append(True) + self._container_id = "recovered-container-id" + return True + + monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute) + monkeypatch.setattr( + docker_env.DockerEnvironment, "_recreate_container", _fake_recreate + ) + + result = env.execute("echo hi") + + assert recreate_calls == [True], "recovery should have been attempted exactly once" + assert result.get("returncode") == 0, f"expected success after recovery, got {result!r}" + assert result.get("output") == "ok" + + +def test_execute_does_not_recover_when_not_persistent(monkeypatch): + """A non-persistent session must NOT trigger container recreation on a + "No such container" error — recovery is only meaningful for the persistent, + cross-process container that can be removed out-of-band. + """ + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + _mock_subprocess_run(monkeypatch) + env = _make_dummy_env( + persistent_filesystem=True, + persist_across_processes=False, + ) + + def _fake_super_execute(self, command, cwd="", **kwargs): + return {"output": "No such container: x", "returncode": 1} + + def _fail_recreate(self): + pytest.fail("recreation must not run when persist_across_processes is False") + + monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute) + monkeypatch.setattr( + docker_env.DockerEnvironment, "_recreate_container", _fail_recreate + ) + + result = env.execute("echo hi") + assert result.get("returncode") == 1, "the original error must pass through unchanged" + + +def test_execute_does_not_recover_on_ordinary_failure(monkeypatch): + """A genuine non-zero exit that is NOT a container-gone error must pass + through without triggering recovery (guards against over-eager recreation). + """ + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + _mock_subprocess_run(monkeypatch) + env = _make_dummy_env( + persistent_filesystem=True, + persist_across_processes=True, + ) + + def _fake_super_execute(self, command, cwd="", **kwargs): + return {"output": "bash: badcmd: command not found", "returncode": 127} + + def _fail_recreate(self): + pytest.fail("recreation must not run for an ordinary command failure") + + monkeypatch.setattr(docker_env.BaseEnvironment, "execute", _fake_super_execute) + monkeypatch.setattr( + docker_env.DockerEnvironment, "_recreate_container", _fail_recreate + ) + + result = env.execute("badcmd") + assert result.get("returncode") == 127 + assert "command not found" in result.get("output", "") diff --git a/tools/environments/docker.py b/tools/environments/docker.py index b87bdb125d4..421eb71be80 100644 --- a/tools/environments/docker.py +++ b/tools/environments/docker.py @@ -537,6 +537,10 @@ class DockerEnvironment(BaseEnvironment): self._env = _normalize_env_dict(env) self._container_id: Optional[str] = None self._labels: dict[str, str] = {} + self._image: str = "" + self._container_name: str = "" + self._image_uses_s6_init: bool = False + self._all_run_args: list[str] = [] logger.info(f"DockerEnvironment volumes: {volumes}") # Ensure volumes is a list (config.yaml could be malformed) if volumes is not None and not isinstance(volumes, list): @@ -791,6 +795,12 @@ class DockerEnvironment(BaseEnvironment): "--label", f"hermes-task-id={task_label}", "--label", f"hermes-profile={profile_name}", ] + # Save args for container recreation on "No such container" recovery. + self._image = image + self._container_name = container_name + self._image_uses_s6_init = image_uses_s6_init + self._all_run_args = all_run_args + self._labels = { "hermes-agent": "1", "hermes-task-id": task_label, @@ -945,6 +955,117 @@ class DockerEnvironment(BaseEnvironment): return _popen_bash(cmd, stdin_data) + # ------------------------------------------------------------------ + # "No such container" recovery (issue #36266) + # ------------------------------------------------------------------ + + _NO_CONTAINER_PATTERNS = ( + "No such container", + "is not running", + "no such container", + ) + + def _is_container_gone(self, output: str) -> bool: + """Return True if the output indicates the container no longer exists.""" + return any(p in output for p in self._NO_CONTAINER_PATTERNS) + + def _recreate_container(self) -> bool: + """Recreate the container after it was removed out-of-band. + + Tries label-based reuse first; if no existing container is found, + starts a fresh one with the same image and run-args. Returns True + on success, False if recreation fails (caller should surface the + original error). + """ + old_id = (self._container_id or "")[:12] + logger.warning( + "Container %s appears to be gone — attempting recovery", old_id, + ) + self._container_id = None + + # 1. Try label-based reuse (another process may have recreated it). + task_label = self._labels.get("hermes-task-id", "") + profile_label = self._labels.get("hermes-profile", "") + existing = self._find_reusable_container(task_label, profile_label) + if existing is not None: + cid, state = existing + if state == "running": + self._container_id = cid + logger.info("Recovery: reusing running container %s", cid[:12]) + else: + try: + subprocess.run( + [self._docker_exe, "start", cid], + capture_output=True, text=True, timeout=30, check=True, + ) + self._container_id = cid + logger.info("Recovery: restarted container %s", cid[:12]) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + logger.warning("Recovery: failed to start container %s: %s", cid[:12], e) + + # 2. No reusable container — create a fresh one. + if not self._container_id: + if not self._image: + logger.error("Recovery: no saved image name, cannot recreate container") + return False + try: + import uuid as _uuid + new_name = f"hermes-{_uuid.uuid4().hex[:8]}" + init_args = [] if self._image_uses_s6_init else ["--init"] + label_args = [] + for k, v in self._labels.items(): + label_args.extend(["--label", f"{k}={v}"]) + run_cmd = [ + self._docker_exe, "run", "-d", + *init_args, + "--name", new_name, + *label_args, + "-w", self.cwd, + *self._all_run_args, + self._image, + "sleep", "infinity", + ] + result = subprocess.run( + run_cmd, capture_output=True, text=True, timeout=120, check=True, + ) + self._container_id = result.stdout.strip() + self._container_name = new_name + logger.info( + "Recovery: created fresh container %s (%s)", + new_name, self._container_id[:12], + ) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError) as e: + logger.error("Recovery: failed to create new container: %s", e) + return False + + # 3. Re-initialize session snapshot in the (re)created container. + try: + self._snapshot_ready = False + self.init_session() + except Exception as e: + logger.error("Recovery: init_session failed in new container: %s", e) + return False + + logger.info("Recovery successful — new container %s", (self._container_id or "")[:12]) + return True + + def execute(self, command: str, cwd: str = "", **kwargs) -> dict: + """Execute a command, auto-recovering from dead containers. + + If the container was removed out-of-band (idle reaper, docker prune, + OOM kill, daemon restart), detect the error and recreate the container + transparently before retrying once. + """ + result = super().execute(command, cwd, **kwargs) + if ( + result.get("returncode", 0) != 0 + and self._is_container_gone(result.get("output", "")) + and self._persist_across_processes + ): + if self._recreate_container(): + result = super().execute(command, cwd, **kwargs) + return result + @staticmethod def _storage_opt_supported() -> bool: """Check if Docker's storage driver supports --storage-opt size=.