fix(docker): gate resource limit flags on cgroup controller availability

On hosts where the cgroup v2 cpu/memory/pids controllers are not delegated
to the docker/podman process (unprivileged Proxmox LXCs, some rootless and
nested setups), --pids-limit/--cpus/--memory cause every container start to
fail with OCI runtime error / exit 126, breaking terminal + execute_code.

- Add _cgroup_limits_available(image): one-shot, host-wide cached probe that
  spawns a throwaway container from the sandbox image itself (sleep 0) with
  all three flags together, mirroring the existing _storage_opt_supported
  probe-and-degrade pattern.
- Remove --pids-limit from static _BASE_SECURITY_ARGS; apply it (default 256
  via _DEFAULT_PIDS_LIMIT) in resource_args gated on the probe.
- Gate --cpus and --memory on the same probe.

Behavior unchanged on cgroup-capable hosts; graceful degradation with a
one-time warning where controllers aren't delegated.

Fixes #6568.
This commit is contained in:
angelos 2026-06-28 15:24:16 -07:00
parent 16ff1a3b93
commit c933880b7e
No known key found for this signature in database
3 changed files with 169 additions and 5 deletions

View file

@ -0,0 +1,91 @@
"""Tests for cgroup resource-limit gating in the docker backend.
On hosts where the cgroup v2 cpu/memory/pids controllers are not delegated
(e.g. unprivileged Proxmox LXCs), passing ``--cpus``/``--memory``/``--pids-limit``
to ``docker run`` fails every container start with OCI runtime error / exit 126.
``_cgroup_limits_available`` probes once and the resource flags are gated on it,
so the sandbox degrades gracefully instead of failing.
"""
import subprocess
import pytest
import tools.environments.docker as docker_env
@pytest.fixture(autouse=True)
def _reset_cgroup_cache():
"""The probe result is cached in a module-level global; reset per test."""
docker_env._cgroup_limits_ok = None
yield
docker_env._cgroup_limits_ok = None
def test_pids_limit_not_in_base_security_args():
"""``--pids-limit`` must NOT be hardcoded in the static security args.
It requires the pids cgroup controller and is gated on the probe instead.
"""
assert "--pids-limit" not in docker_env._BASE_SECURITY_ARGS
def test_probe_returns_true_when_container_starts(monkeypatch):
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
captured = {}
def _run(cmd, *a, **k):
captured["cmd"] = cmd
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
monkeypatch.setattr(docker_env.subprocess, "run", _run)
assert docker_env._cgroup_limits_available("hermes-agent:latest") is True
# Probes all three controllers together against the real sandbox image.
assert "--cpus" in captured["cmd"]
assert "--memory" in captured["cmd"]
assert "--pids-limit" in captured["cmd"]
assert "hermes-agent:latest" in captured["cmd"]
def test_probe_returns_false_and_warns_on_oci_error(monkeypatch, caplog):
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
def _run(cmd, *a, **k):
return subprocess.CompletedProcess(
cmd, 126, stdout="",
stderr="crun: controller `pids` is not available",
)
monkeypatch.setattr(docker_env.subprocess, "run", _run)
with caplog.at_level("WARNING"):
assert docker_env._cgroup_limits_available("img") is False
assert "Cgroup resource limits" in caplog.text
def test_probe_returns_false_when_no_docker(monkeypatch):
monkeypatch.setattr(docker_env, "find_docker", lambda: None)
assert docker_env._cgroup_limits_available("img") is False
def test_probe_returns_false_on_empty_image(monkeypatch):
"""An empty image string must not be probed (would be a malformed run)."""
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
monkeypatch.setattr(
docker_env.subprocess, "run",
lambda *a, **k: pytest.fail("should not probe with empty image"),
)
assert docker_env._cgroup_limits_available("") is False
def test_probe_result_is_cached(monkeypatch):
monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker")
calls = []
def _run(cmd, *a, **k):
calls.append(cmd)
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
monkeypatch.setattr(docker_env.subprocess, "run", _run)
docker_env._cgroup_limits_available("img")
docker_env._cgroup_limits_available("img")
docker_env._cgroup_limits_available("img")
assert len(calls) == 1 # probe runs once, then cached

View file

@ -11,7 +11,13 @@ def _mock_subprocess_run(monkeypatch):
"""Mock subprocess.run to intercept docker run -d and docker version calls.
Returns a list of captured (cmd, kwargs) tuples for inspection.
Pre-seeds the cgroup-limit probe cache to ``True`` so the throwaway probe
container (a ``docker run ... sleep 0``) does not run and pollute the
captured call list these tests inspect the real sandbox-start ``run``.
Tests that exercise the probe itself live in test_docker_cgroup_limits.py.
"""
docker_env._cgroup_limits_ok = True
calls = []
def _run(cmd, **kwargs):

View file

@ -322,19 +322,28 @@ def find_docker() -> Optional[str]:
# preserved. Omitted entirely when the container starts as a
# non-root user via --user, since no privilege drop is needed
# in that mode.
# Block privilege escalation and limit PIDs.
# Block privilege escalation.
# /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds).
#
# Note: ``--pids-limit`` is *not* in this list — it lives in ``resource_args``
# and is gated on ``_cgroup_limits_available(image)`` because it requires the
# ``pids`` cgroup controller to be delegated, which is not the case on hosts
# such as unprivileged LXCs. ``--cpus``/``--memory`` are gated for the same
# reason.
_BASE_SECURITY_ARGS = [
"--cap-drop", "ALL",
"--cap-add", "DAC_OVERRIDE",
"--cap-add", "CHOWN",
"--cap-add", "FOWNER",
"--security-opt", "no-new-privileges",
"--pids-limit", "256",
"--tmpfs", "/tmp:rw,nosuid,size=512m",
"--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m",
]
# Default per-container PID limit. Applied as ``--pids-limit`` only when the
# cgroup ``pids`` controller is available (see ``_cgroup_limits_available``).
_DEFAULT_PIDS_LIMIT = "256"
# /run is split out from _BASE_SECURITY_ARGS because s6-overlay images need it
# mounted ``exec``: s6 stage0 later runs ``exec /run/s6/basedir/bin/init``, which
# fails with "Permission denied" (exit 126) on a ``noexec`` mount. For all other
@ -431,6 +440,59 @@ def _resolve_host_user_spec() -> Optional[str]:
_storage_opt_ok: Optional[bool] = None # cached result across instances
_cgroup_limits_ok: Optional[bool] = None # cached result across instances
def _cgroup_limits_available(image: str) -> bool:
"""Probe whether cgroup resource limits work in this environment.
Tests ``--cpus``, ``--memory`` and ``--pids-limit`` together by spawning
a throwaway container from *image* (the same sandbox image we are about
to use for real, so no extra pull and no dependency on a public
registry). The container runs ``sleep 0`` sleep is guaranteed to be
present because the sandbox itself uses ``sleep 2h`` as its long-lived
entrypoint.
On hosts where the corresponding cgroup controllers are not delegated
to this process (typical inside unprivileged LXCs and some rootless
setups) these flags cause every container start to fail with ``OCI
runtime error`` / exit 126. The probe runs once per process and the
result which is host-wide, not image-specific is cached.
"""
global _cgroup_limits_ok
if _cgroup_limits_ok is not None:
return _cgroup_limits_ok
docker_exe = find_docker()
if not docker_exe or not image:
_cgroup_limits_ok = False
return False
try:
result = subprocess.run(
[docker_exe, "run", "--rm",
"--cpus", "0.5", "--memory", "64m", "--pids-limit", "32",
image, "sleep", "0"],
capture_output=True,
text=True,
timeout=60,
stdin=subprocess.DEVNULL,
)
_cgroup_limits_ok = result.returncode == 0
if not _cgroup_limits_ok:
logger.warning(
"Cgroup resource limits (--cpus/--memory/--pids-limit) not "
"available in this environment. Containers will run without "
"CPU, memory or PID limits. To enable, delegate the cpu, "
"memory and pids cgroup controllers to this container. "
"Probe stderr: %s",
(result.stderr or "").strip()[:500],
)
except Exception as e:
_cgroup_limits_ok = False
logger.warning("Cgroup limit probe failed; disabling resource limits: %s", e)
return _cgroup_limits_ok
def _ensure_docker_available() -> None:
@ -555,12 +617,17 @@ class DockerEnvironment(BaseEnvironment):
# Fail fast if Docker is not available.
_ensure_docker_available()
# Build resource limit args
# Build resource limit args (gated by cgroup availability probe so
# they degrade gracefully on hosts without controller delegation,
# e.g. unprivileged LXCs). The probe runs once per process and is
# cached host-wide.
resource_args = []
if cpu > 0:
if cpu > 0 and _cgroup_limits_available(image):
resource_args.extend(["--cpus", str(cpu)])
if memory > 0:
if memory > 0 and _cgroup_limits_available(image):
resource_args.extend(["--memory", f"{memory}m"])
if _cgroup_limits_available(image):
resource_args.extend(["--pids-limit", _DEFAULT_PIDS_LIMIT])
if disk > 0 and sys.platform != "darwin":
if self._storage_opt_supported():
resource_args.extend(["--storage-opt", f"size={disk}m"])