fix(docker): gate resource limit flags on cgroup controller availability (#54516)

On hosts where the cgroup v2 cpu/memory/pids controllers are not delegated
to the docker/podman process (unprivileged Proxmox LXCs, some rootless and
nested setups), --pids-limit/--cpus/--memory cause every container start to
fail with OCI runtime error / exit 126, breaking terminal + execute_code.

- Add _cgroup_limits_available(image): one-shot, host-wide cached probe that
  spawns a throwaway container from the sandbox image itself (sleep 0) with
  all three flags together, mirroring the existing _storage_opt_supported
  probe-and-degrade pattern.
- Remove --pids-limit from static _BASE_SECURITY_ARGS; apply it (default 256
  via _DEFAULT_PIDS_LIMIT) in resource_args gated on the probe.
- Gate --cpus and --memory on the same probe.

Behavior unchanged on cgroup-capable hosts; graceful degradation with a
one-time warning where controllers aren't delegated.

Fixes #6568.

(cherry picked from commit c933880b7e)

Co-authored-by: angelos <angelos@oikos.lan.home.malaiwah.com>
This commit is contained in:
Ben Barclay 2026-06-29 11:01:08 +10:00 committed by GitHub
parent 10043c6d0c
commit 7cfa2fa13f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 169 additions and 5 deletions

View file

@ -322,19 +322,28 @@ def find_docker() -> Optional[str]:
# preserved. Omitted entirely when the container starts as a
# non-root user via --user, since no privilege drop is needed
# in that mode.
# Block privilege escalation and limit PIDs.
# Block privilege escalation.
# /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds).
#
# Note: ``--pids-limit`` is *not* in this list — it lives in ``resource_args``
# and is gated on ``_cgroup_limits_available(image)`` because it requires the
# ``pids`` cgroup controller to be delegated, which is not the case on hosts
# such as unprivileged LXCs. ``--cpus``/``--memory`` are gated for the same
# reason.
_BASE_SECURITY_ARGS = [
# Drop every capability, then add back only the three listed below.
"--cap-drop", "ALL",
"--cap-add", "DAC_OVERRIDE",
"--cap-add", "CHOWN",
"--cap-add", "FOWNER",
"--security-opt", "no-new-privileges",
# NOTE(review): this entry looks like the pre-change (removed) side of the
# diff -- the commit message says ``--pids-limit`` was taken out of this
# static list and is applied from ``resource_args`` instead. Confirm
# against the real file before relying on it being present here.
"--pids-limit", "256",
# /tmp allows exec; /var/tmp is mounted noexec. Both are nosuid and
# size-capped.
"--tmpfs", "/tmp:rw,nosuid,size=512m",
"--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m",
]
# Default per-container PID limit. Applied as ``--pids-limit`` only when the
# cgroup ``pids`` controller is available (see ``_cgroup_limits_available``).
_DEFAULT_PIDS_LIMIT = "256"
# /run is split out from _BASE_SECURITY_ARGS because s6-overlay images need it
# mounted ``exec``: s6 stage0 later runs ``exec /run/s6/basedir/bin/init``, which
# fails with "Permission denied" (exit 126) on a ``noexec`` mount. For all other
@ -431,6 +440,59 @@ def _resolve_host_user_spec() -> Optional[str]:
_storage_opt_ok: Optional[bool] = None # cached result across instances
_cgroup_limits_ok: Optional[bool] = None  # cached result across instances


def _cgroup_limits_available(image: str) -> bool:
    """Probe whether cgroup resource limits work in this environment.

    Tests ``--cpus``, ``--memory`` and ``--pids-limit`` together by spawning
    a throwaway container from *image* (the same sandbox image we are about
    to use for real, so no extra pull and no dependency on a public
    registry). The container runs ``sleep 0`` -- ``sleep`` is guaranteed to
    be present because the sandbox itself uses ``sleep 2h`` as its
    long-lived entrypoint.

    On hosts where the corresponding cgroup controllers are not delegated
    to this process (typical inside unprivileged LXCs and some rootless
    setups) these flags cause every container start to fail with ``OCI
    runtime error`` / exit 126. The probe runs once per process and the
    result -- which is host-wide, not image-specific -- is cached in the
    module-level ``_cgroup_limits_ok``.

    Args:
        image: Name of the sandbox image used for the throwaway probe
            container.

    Returns:
        ``True`` if the probe container exited with status 0, ``False``
        otherwise (including when the docker executable cannot be found,
        *image* is empty, or running the probe raised).
    """
    global _cgroup_limits_ok

    # Fast path: the probe (or a cached precondition failure) already ran.
    if _cgroup_limits_ok is not None:
        return _cgroup_limits_ok

    if not image:
        # An empty image name is a problem with this particular call, not a
        # property of the host. Report "unavailable" but do not cache it, so
        # a later call with a real image can still run the probe.
        return False

    docker_exe = find_docker()
    if not docker_exe:
        _cgroup_limits_ok = False
        return False

    probe_cmd = [
        docker_exe, "run", "--rm",
        "--cpus", "0.5", "--memory", "64m", "--pids-limit", "32",
        image, "sleep", "0",
    ]
    try:
        result = subprocess.run(
            probe_cmd,
            capture_output=True,
            text=True,
            timeout=60,
            stdin=subprocess.DEVNULL,
        )
    except Exception as e:
        # Deliberately broad: the probe is best-effort, and any failure to
        # run it (timeout, OS error, ...) degrades to "no limits" rather
        # than breaking container start-up.
        _cgroup_limits_ok = False
        logger.warning("Cgroup limit probe failed; disabling resource limits: %s", e)
        return False

    _cgroup_limits_ok = result.returncode == 0
    if not _cgroup_limits_ok:
        logger.warning(
            "Cgroup resource limits (--cpus/--memory/--pids-limit) not "
            "available in this environment. Containers will run without "
            "CPU, memory or PID limits. To enable, delegate the cpu, "
            "memory and pids cgroup controllers to this container. "
            "Probe stderr: %s",
            # Cap the captured stderr so a noisy runtime cannot flood the log.
            (result.stderr or "").strip()[:500],
        )
    return _cgroup_limits_ok
def _ensure_docker_available() -> None:
@ -555,12 +617,17 @@ class DockerEnvironment(BaseEnvironment):
# Fail fast if Docker is not available.
_ensure_docker_available()
# Build resource limit args
# Build resource limit args (gated by cgroup availability probe so
# they degrade gracefully on hosts without controller delegation,
# e.g. unprivileged LXCs). The probe runs once per process and is
# cached host-wide.
resource_args = []
if cpu > 0:
if cpu > 0 and _cgroup_limits_available(image):
resource_args.extend(["--cpus", str(cpu)])
if memory > 0:
if memory > 0 and _cgroup_limits_available(image):
resource_args.extend(["--memory", f"{memory}m"])
if _cgroup_limits_available(image):
resource_args.extend(["--pids-limit", _DEFAULT_PIDS_LIMIT])
if disk > 0 and sys.platform != "darwin":
if self._storage_opt_supported():
resource_args.extend(["--storage-opt", f"size={disk}m"])