From 1466eab4eeeeb0689a0183c77113e5fa4c5c0a1b Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Sun, 28 Jun 2026 17:06:26 +1000 Subject: [PATCH] test(docker): wait for cont-init to finish before privilege-drop shim tests (#54026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docker-exec privilege-drop shim tests started a sleep container and released the fixture as soon as `docker exec true` returned 0. On s6-overlay that succeeds almost immediately — ~0.05s in measurement — long before the `01-hermes-setup` cont-init hook (docker/stage2-hook.sh) has finished seeding + `chown hermes:hermes` config.yaml and running the Python config migration (cont-init only fully settles at ~9.8s under arm64 QEMU emulation). `test_shim_opt_out_keeps_root` wipes config.yaml, writes it as root with HERMES_DOCKER_EXEC_AS_ROOT=1, and asserts root:root ownership. When the fixture released the test inside that ~10s window, stage2-hook's boot-time `chown hermes:hermes config.yaml` raced the root-written file and reset it to hermes:hermes — failing the assertion. The window is invisible on native amd64 (stage2-hook completes in a blink) but wide open under the arm64 build's QEMU emulation, which is why only build-arm64 flaked while build-amd64 stayed green. Replace the responsiveness poll with a wait on the canonical 'cont-init finished' signal: $HERMES_HOME/logs/container-boot.log gaining a `profile=default` line, written by 02-reconcile-profiles which s6 runs strictly after 01-hermes-setup. Mirrors the readiness pattern already used in test_container_restart.py. Also bumps the readiness timeout 20s->60s to cover slow emulation. No production code change — test-only hardening of a timing race. --- .../docker/test_docker_exec_privilege_drop.py | 47 +++++++++++++++---- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/tests/docker/test_docker_exec_privilege_drop.py b/tests/docker/test_docker_exec_privilege_drop.py index cd06eaf7624..76c2da20961 100644 --- a/tests/docker/test_docker_exec_privilege_drop.py +++ b/tests/docker/test_docker_exec_privilege_drop.py @@ -32,21 +32,52 @@ import pytest # How long to give a `docker run -d` container before declaring it not ready. -_RUN_READY_TIMEOUT_S = 20 +# Generous because under arm64 QEMU emulation cont-init (a Python config +# migration + chowns) runs several times slower than on native amd64. +_RUN_READY_TIMEOUT_S = 60 -def _wait_for_init(container: str) -> None: - """Block until /init is up enough that `docker exec` is responsive.""" +def _wait_for_cont_init(container: str) -> None: + """Block until s6 cont-init has fully finished, not merely until + ``docker exec`` is responsive. + + The earlier ``_wait_for_init`` only polled ``docker exec true``, + which succeeds almost immediately on s6-overlay — long before the + ``01-hermes-setup`` cont-init hook (docker/stage2-hook.sh) has + finished seeding + ``chown hermes:hermes`` config.yaml and running the + Python config migration. A test that wipes config.yaml and then writes + it as root would then race that boot-time chown: on native amd64 + stage2-hook wins in a blink and the test always passed, but under arm64 + QEMU emulation the slow Python migration was still in flight and + clobbered the root-written file's ownership back to hermes:hermes, + failing ``test_shim_opt_out_keeps_root`` non-deterministically. + + The reliable "cont-init is done" signal is + ``$HERMES_HOME/logs/container-boot.log``: it is written by + ``02-reconcile-profiles`` (hermes_cli.container_boot), which s6 runs + *strictly after* ``01-hermes-setup`` in lexicographic order. The + reconciler always logs at least one ``profile=default`` line even for a + bare ``sleep infinity`` container, so once that marker appears every + stage2-hook side effect (seed, chown, migrate) is guaranteed complete. + Mirrors the readiness pattern in test_container_restart.py. + """ deadline = time.monotonic() + _RUN_READY_TIMEOUT_S + last = "" while time.monotonic() < deadline: r = subprocess.run( - ["docker", "exec", container, "true"], - capture_output=True, timeout=5, + ["docker", "exec", container, + "cat", "/opt/data/logs/container-boot.log"], + capture_output=True, text=True, timeout=5, ) if r.returncode == 0: - return + last = r.stdout + if "profile=default" in last: + return time.sleep(0.2) - pytest.fail(f"container {container} not responsive to docker exec within {_RUN_READY_TIMEOUT_S}s") + pytest.fail( + f"container {container} did not finish cont-init within " + f"{_RUN_READY_TIMEOUT_S}s (container-boot.log so far: {last!r})" + ) @pytest.fixture @@ -63,7 +94,7 @@ def sleep_container(built_image: str, container_name: str) -> Iterator[str]: ) assert r.returncode == 0, f"docker run failed: {r.stderr}" try: - _wait_for_init(container_name) + _wait_for_cont_init(container_name) yield container_name finally: subprocess.run(