mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
test(docker): wait for cont-init to finish before privilege-drop shim tests (#54026)
The docker-exec privilege-drop shim tests started a sleep container and released the fixture as soon as `docker exec <c> true` returned 0. On s6-overlay that succeeds almost immediately — ~0.05s in measurement — long before the `01-hermes-setup` cont-init hook (docker/stage2-hook.sh) has finished seeding + `chown hermes:hermes` config.yaml and running the Python config migration (cont-init only fully settles at ~9.8s under arm64 QEMU emulation). `test_shim_opt_out_keeps_root` wipes config.yaml, writes it as root with HERMES_DOCKER_EXEC_AS_ROOT=1, and asserts root:root ownership. When the fixture released the test inside that ~10s window, stage2-hook's boot-time `chown hermes:hermes config.yaml` raced the root-written file and reset it to hermes:hermes — failing the assertion. The window is invisible on native amd64 (stage2-hook completes in a blink) but wide open under the arm64 build's QEMU emulation, which is why only build-arm64 flaked while build-amd64 stayed green. Replace the responsiveness poll with a wait on the canonical 'cont-init finished' signal: $HERMES_HOME/logs/container-boot.log gaining a `profile=default` line, written by 02-reconcile-profiles which s6 runs strictly after 01-hermes-setup. Mirrors the readiness pattern already used in test_container_restart.py. Also bumps the readiness timeout 20s->60s to cover slow emulation. No production code change — test-only hardening of a timing race.
This commit is contained in:
parent
2c9b017696
commit
1466eab4ee
1 changed files with 39 additions and 8 deletions
|
|
@ -32,21 +32,52 @@ import pytest
|
|||
|
||||
|
||||
# How long to give a `docker run -d` container before declaring it not ready.
|
||||
_RUN_READY_TIMEOUT_S = 20
|
||||
# Generous because under arm64 QEMU emulation cont-init (a Python config
|
||||
# migration + chowns) runs several times slower than on native amd64.
|
||||
_RUN_READY_TIMEOUT_S = 60
|
||||
|
||||
|
||||
def _wait_for_init(container: str) -> None:
|
||||
"""Block until /init is up enough that `docker exec` is responsive."""
|
||||
def _wait_for_cont_init(container: str) -> None:
|
||||
"""Block until s6 cont-init has fully finished, not merely until
|
||||
``docker exec`` is responsive.
|
||||
|
||||
The earlier ``_wait_for_init`` only polled ``docker exec <c> true``,
|
||||
which succeeds almost immediately on s6-overlay — long before the
|
||||
``01-hermes-setup`` cont-init hook (docker/stage2-hook.sh) has
|
||||
finished seeding + ``chown hermes:hermes`` config.yaml and running the
|
||||
Python config migration. A test that wipes config.yaml and then writes
|
||||
it as root would then race that boot-time chown: on native amd64
|
||||
stage2-hook wins in a blink and the test always passed, but under arm64
|
||||
QEMU emulation the slow Python migration was still in flight and
|
||||
clobbered the root-written file's ownership back to hermes:hermes,
|
||||
failing ``test_shim_opt_out_keeps_root`` non-deterministically.
|
||||
|
||||
The reliable "cont-init is done" signal is
|
||||
``$HERMES_HOME/logs/container-boot.log``: it is written by
|
||||
``02-reconcile-profiles`` (hermes_cli.container_boot), which s6 runs
|
||||
*strictly after* ``01-hermes-setup`` in lexicographic order. The
|
||||
reconciler always logs at least one ``profile=default`` line even for a
|
||||
bare ``sleep infinity`` container, so once that marker appears every
|
||||
stage2-hook side effect (seed, chown, migrate) is guaranteed complete.
|
||||
Mirrors the readiness pattern in test_container_restart.py.
|
||||
"""
|
||||
deadline = time.monotonic() + _RUN_READY_TIMEOUT_S
|
||||
last = ""
|
||||
while time.monotonic() < deadline:
|
||||
r = subprocess.run(
|
||||
["docker", "exec", container, "true"],
|
||||
capture_output=True, timeout=5,
|
||||
["docker", "exec", container,
|
||||
"cat", "/opt/data/logs/container-boot.log"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
if r.returncode == 0:
|
||||
return
|
||||
last = r.stdout
|
||||
if "profile=default" in last:
|
||||
return
|
||||
time.sleep(0.2)
|
||||
pytest.fail(f"container {container} not responsive to docker exec within {_RUN_READY_TIMEOUT_S}s")
|
||||
pytest.fail(
|
||||
f"container {container} did not finish cont-init within "
|
||||
f"{_RUN_READY_TIMEOUT_S}s (container-boot.log so far: {last!r})"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
@ -63,7 +94,7 @@ def sleep_container(built_image: str, container_name: str) -> Iterator[str]:
|
|||
)
|
||||
assert r.returncode == 0, f"docker run failed: {r.stderr}"
|
||||
try:
|
||||
_wait_for_init(container_name)
|
||||
_wait_for_cont_init(container_name)
|
||||
yield container_name
|
||||
finally:
|
||||
subprocess.run(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue