From c933880b7ee2ce4d1167e0f89caa2d233db5639f Mon Sep 17 00:00:00 2001 From: angelos Date: Sun, 28 Jun 2026 15:24:16 -0700 Subject: [PATCH] fix(docker): gate resource limit flags on cgroup controller availability On hosts where the cgroup v2 cpu/memory/pids controllers are not delegated to the docker/podman process (unprivileged Proxmox LXCs, some rootless and nested setups), --pids-limit/--cpus/--memory cause every container start to fail with OCI runtime error / exit 126, breaking terminal + execute_code. - Add _cgroup_limits_available(image): one-shot, host-wide cached probe that spawns a throwaway container from the sandbox image itself (sleep 0) with all three flags together, mirroring the existing _storage_opt_supported probe-and-degrade pattern. - Remove --pids-limit from static _BASE_SECURITY_ARGS; apply it (default 256 via _DEFAULT_PIDS_LIMIT) in resource_args gated on the probe. - Gate --cpus and --memory on the same probe. Behavior unchanged on cgroup-capable hosts; graceful degradation with a one-time warning where controllers aren't delegated. Fixes #6568. --- tests/tools/test_docker_cgroup_limits.py | 91 ++++++++++++++++++++++++ tests/tools/test_docker_environment.py | 6 ++ tools/environments/docker.py | 77 ++++++++++++++++++-- 3 files changed, 169 insertions(+), 5 deletions(-) create mode 100644 tests/tools/test_docker_cgroup_limits.py diff --git a/tests/tools/test_docker_cgroup_limits.py b/tests/tools/test_docker_cgroup_limits.py new file mode 100644 index 00000000000..cc73d4c4b6b --- /dev/null +++ b/tests/tools/test_docker_cgroup_limits.py @@ -0,0 +1,91 @@ +"""Tests for cgroup resource-limit gating in the docker backend. + +On hosts where the cgroup v2 cpu/memory/pids controllers are not delegated +(e.g. unprivileged Proxmox LXCs), passing ``--cpus``/``--memory``/``--pids-limit`` +to ``docker run`` fails every container start with OCI runtime error / exit 126. +``_cgroup_limits_available`` probes once and the resource flags are gated on it, +so the sandbox degrades gracefully instead of failing. +""" +import subprocess + +import pytest + +import tools.environments.docker as docker_env + + +@pytest.fixture(autouse=True) +def _reset_cgroup_cache(): + """The probe result is cached in a module-level global; reset per test.""" + docker_env._cgroup_limits_ok = None + yield + docker_env._cgroup_limits_ok = None + + +def test_pids_limit_not_in_base_security_args(): + """``--pids-limit`` must NOT be hardcoded in the static security args. + + It requires the pids cgroup controller and is gated on the probe instead. + """ + assert "--pids-limit" not in docker_env._BASE_SECURITY_ARGS + + +def test_probe_returns_true_when_container_starts(monkeypatch): + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + captured = {} + + def _run(cmd, *a, **k): + captured["cmd"] = cmd + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(docker_env.subprocess, "run", _run) + assert docker_env._cgroup_limits_available("hermes-agent:latest") is True + # Probes all three controllers together against the real sandbox image. + assert "--cpus" in captured["cmd"] + assert "--memory" in captured["cmd"] + assert "--pids-limit" in captured["cmd"] + assert "hermes-agent:latest" in captured["cmd"] + + +def test_probe_returns_false_and_warns_on_oci_error(monkeypatch, caplog): + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + + def _run(cmd, *a, **k): + return subprocess.CompletedProcess( + cmd, 126, stdout="", + stderr="crun: controller `pids` is not available", + ) + + monkeypatch.setattr(docker_env.subprocess, "run", _run) + with caplog.at_level("WARNING"): + assert docker_env._cgroup_limits_available("img") is False + assert "Cgroup resource limits" in caplog.text + + +def test_probe_returns_false_when_no_docker(monkeypatch): + monkeypatch.setattr(docker_env, "find_docker", lambda: None) + assert docker_env._cgroup_limits_available("img") is False + + +def test_probe_returns_false_on_empty_image(monkeypatch): + """An empty image string must not be probed (would be a malformed run).""" + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + monkeypatch.setattr( + docker_env.subprocess, "run", + lambda *a, **k: pytest.fail("should not probe with empty image"), + ) + assert docker_env._cgroup_limits_available("") is False + + +def test_probe_result_is_cached(monkeypatch): + monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") + calls = [] + + def _run(cmd, *a, **k): + calls.append(cmd) + return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="") + + monkeypatch.setattr(docker_env.subprocess, "run", _run) + docker_env._cgroup_limits_available("img") + docker_env._cgroup_limits_available("img") + docker_env._cgroup_limits_available("img") + assert len(calls) == 1 # probe runs once, then cached diff --git a/tests/tools/test_docker_environment.py b/tests/tools/test_docker_environment.py index cea9ae4e4ff..c85402a3c2d 100644 --- a/tests/tools/test_docker_environment.py +++ b/tests/tools/test_docker_environment.py @@ -11,7 +11,13 @@ def _mock_subprocess_run(monkeypatch): """Mock subprocess.run to intercept docker run -d and docker version calls. Returns a list of captured (cmd, kwargs) tuples for inspection. + + Pre-seeds the cgroup-limit probe cache to ``True`` so the throwaway probe + container (a ``docker run ... sleep 0``) does not run and pollute the + captured call list — these tests inspect the real sandbox-start ``run``. + Tests that exercise the probe itself live in test_docker_cgroup_limits.py. """ + docker_env._cgroup_limits_ok = True calls = [] def _run(cmd, **kwargs): diff --git a/tools/environments/docker.py b/tools/environments/docker.py index 8245d6879bf..cd4a3fcd86a 100644 --- a/tools/environments/docker.py +++ b/tools/environments/docker.py @@ -322,19 +322,28 @@ def find_docker() -> Optional[str]: # preserved. Omitted entirely when the container starts as a # non-root user via --user, since no privilege drop is needed # in that mode. -# Block privilege escalation and limit PIDs. +# Block privilege escalation. # /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds). +# +# Note: ``--pids-limit`` is *not* in this list — it lives in ``resource_args`` +# and is gated on ``_cgroup_limits_available(image)`` because it requires the +# ``pids`` cgroup controller to be delegated, which is not the case on hosts +# such as unprivileged LXCs. ``--cpus``/``--memory`` are gated for the same +# reason. _BASE_SECURITY_ARGS = [ "--cap-drop", "ALL", "--cap-add", "DAC_OVERRIDE", "--cap-add", "CHOWN", "--cap-add", "FOWNER", "--security-opt", "no-new-privileges", - "--pids-limit", "256", "--tmpfs", "/tmp:rw,nosuid,size=512m", "--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m", ] +# Default per-container PID limit. Applied as ``--pids-limit`` only when the +# cgroup ``pids`` controller is available (see ``_cgroup_limits_available``). +_DEFAULT_PIDS_LIMIT = "256" + # /run is split out from _BASE_SECURITY_ARGS because s6-overlay images need it # mounted ``exec``: s6 stage0 later runs ``exec /run/s6/basedir/bin/init``, which # fails with "Permission denied" (exit 126) on a ``noexec`` mount. For all other @@ -431,6 +440,59 @@ def _resolve_host_user_spec() -> Optional[str]: _storage_opt_ok: Optional[bool] = None # cached result across instances +_cgroup_limits_ok: Optional[bool] = None # cached result across instances + + +def _cgroup_limits_available(image: str) -> bool: + """Probe whether cgroup resource limits work in this environment. + + Tests ``--cpus``, ``--memory`` and ``--pids-limit`` together by spawning + a throwaway container from *image* (the same sandbox image we are about + to use for real, so no extra pull and no dependency on a public + registry). The container runs ``sleep 0`` — sleep is guaranteed to be + present because the sandbox itself uses ``sleep 2h`` as its long-lived + entrypoint. + + On hosts where the corresponding cgroup controllers are not delegated + to this process (typical inside unprivileged LXCs and some rootless + setups) these flags cause every container start to fail with ``OCI + runtime error`` / exit 126. The probe runs once per process and the + result — which is host-wide, not image-specific — is cached. + """ + global _cgroup_limits_ok + if _cgroup_limits_ok is not None: + return _cgroup_limits_ok + + docker_exe = find_docker() + if not docker_exe or not image: + _cgroup_limits_ok = False + return False + + try: + result = subprocess.run( + [docker_exe, "run", "--rm", + "--cpus", "0.5", "--memory", "64m", "--pids-limit", "32", + image, "sleep", "0"], + capture_output=True, + text=True, + timeout=60, + stdin=subprocess.DEVNULL, + ) + _cgroup_limits_ok = result.returncode == 0 + if not _cgroup_limits_ok: + logger.warning( + "Cgroup resource limits (--cpus/--memory/--pids-limit) not " + "available in this environment. Containers will run without " + "CPU, memory or PID limits. To enable, delegate the cpu, " + "memory and pids cgroup controllers to this container. " + "Probe stderr: %s", + (result.stderr or "").strip()[:500], + ) + except Exception as e: + _cgroup_limits_ok = False + logger.warning("Cgroup limit probe failed; disabling resource limits: %s", e) + + return _cgroup_limits_ok def _ensure_docker_available() -> None: @@ -555,12 +617,17 @@ class DockerEnvironment(BaseEnvironment): # Fail fast if Docker is not available. _ensure_docker_available() - # Build resource limit args + # Build resource limit args (gated by cgroup availability probe so + # they degrade gracefully on hosts without controller delegation, + # e.g. unprivileged LXCs). The probe runs once per process and is + # cached host-wide. resource_args = [] - if cpu > 0: + if cpu > 0 and _cgroup_limits_available(image): resource_args.extend(["--cpus", str(cpu)]) - if memory > 0: + if memory > 0 and _cgroup_limits_available(image): resource_args.extend(["--memory", f"{memory}m"]) + if _cgroup_limits_available(image): + resource_args.extend(["--pids-limit", _DEFAULT_PIDS_LIMIT]) if disk > 0 and sys.platform != "darwin": if self._storage_opt_supported(): resource_args.extend(["--storage-opt", f"size={disk}m"])