hermes-agent/tests/docker/test_profile_gateway.py
Ben c524b8a4dc test(docker): fix svstat 'want up' assertion in profile-gateway lifecycle test
After the supervise-perms fix lands, the s6 lifecycle actually works
for the hermes user — hermes -p <profile> gateway start now genuinely
brings the supervised gateway up rather than silently no-op'ing on
EACCES. That exposes a latent bug in this test's assertion: it
expected 'want up' to appear literally in s6-svstat output, but
s6-svstat elides redundancies — when the slot is currently up AND
s6 wants it up, the output is just 'up (pid N pgid N) X seconds';
the explicit 'want up' token only appears when current ≠ wanted
(e.g. 'down (exitcode 1) … , want up' on a crash-loop).

Add a small helper _svstat_wants_up() that reads the want-state
correctly across both spellings:
  * 'up …'                       → wanted up (unless explicit 'want down')
  * 'down …, want up'            → wanted up explicitly
  * 'down …'                     → wanted down

Both stop and start assertions now use the helper. Also rewords
the module docstring to acknowledge that the supervised process
may succeed OR crash-loop depending on environment, but the want-
state contract holds either way.

(cherry picked from commit 02c933aedc)
2026-05-25 12:25:06 +10:00

138 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Harness: per-profile gateway start/stop inside the container.
Phase 4 wires `hermes -p <profile> gateway start/stop` through the s6
ServiceManager dispatch path inside the container — so the lifecycle
commands now bring up an s6-supervised gateway rather than refusing
with the pre-Phase-4 informational message.
These tests were marked ``xfail(strict=True)`` through Phase 03 and
flip to plain ``test_…`` once Phase 4 lands (now).
NB: The harness profile has no model/auth configured. Depending on
how the gateway run script handles missing config, the supervised
process may either spin up successfully (and svstat reports ``up``)
or exit fast and get throttled by s6 (and svstat reports ``down …,
want up``). Both states are valid "user asked for gateway up" results
— what we assert is the *want* intent the lifecycle command set, NOT
the supervised process's health. ``s6-svc -u`` records ``want up`` in
the supervise/status file regardless of the run-script outcome.
Every ``docker exec`` here runs as the unprivileged ``hermes`` user
(via :func:`docker_exec_sh` in conftest); see the conftest module
docstring.
"""
from __future__ import annotations
import subprocess
import time
from tests.docker.conftest import docker_exec_sh
PROFILE = "test-harness-profile"
def _sh(
container: str, command: str, timeout: int = 30,
) -> subprocess.CompletedProcess[str]:
return docker_exec_sh(container, command, timeout=timeout)
def _svstat(container: str) -> str:
"""Returns the raw s6-svstat output for the test profile's slot.
/command/s6-svstat is called by absolute path because /command/
isn't on PATH for docker-exec sessions."""
r = _sh(container, f"/command/s6-svstat /run/service/gateway-{PROFILE}")
return r.stdout if r.returncode == 0 else ""
def _svstat_wants_up(container: str) -> bool:
"""Read the slot's want-state from s6-svstat output.
s6-svstat formats the output to elide redundancies — when the
service is currently up AND s6 wants it up, the literal token
``want up`` doesn't appear (it's implicit from the leading ``up``).
When the service is down but s6 wants it back up, ``, want up``
appears explicitly. So a comprehensive "is the want-intent set to
up" check has to accept both spellings.
"""
state = _svstat(container)
if not state:
return False
head = state.split()[0] if state.split() else ""
if head == "up":
# Currently up implies wanted-up unless ``want down`` is set.
return "want down" not in state
# Currently down — ``want up`` only shows up when explicitly set.
return "want up" in state
def test_profile_create_then_gateway_start(
built_image: str, container_name: str,
) -> None:
subprocess.run(
["docker", "run", "-d", "--name", container_name, built_image,
"sleep", "120"],
check=True, capture_output=True, timeout=30,
)
time.sleep(3)
r = _sh(container_name, f"hermes profile create {PROFILE}")
assert r.returncode == 0, f"profile create failed: {r.stderr}"
# Profile create's s6-register hook should have produced a service slot.
r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}")
assert r.returncode == 0, "s6 service slot not created on profile create"
r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
assert r.returncode == 0, (
f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}"
)
# After start, s6's intent is "up" — even if the supervised gateway
# process spin-fails (no model/auth in the test profile), the
# supervision-state contract holds. See ``_svstat_wants_up`` for
# why we accept both ``up …`` (currently up) and ``down …, want
# up`` (down but s6 wants up).
time.sleep(2)
assert _svstat_wants_up(container_name), (
f"slot want-state is not up after gateway start: "
f"{_svstat(container_name)!r}"
)
r = _sh(container_name, f"hermes -p {PROFILE} gateway stop", timeout=30)
assert r.returncode == 0
time.sleep(2)
assert not _svstat_wants_up(container_name), (
f"slot want-state still up after gateway stop: "
f"{_svstat(container_name)!r}"
)
def test_profile_delete_stops_gateway(
built_image: str, container_name: str,
) -> None:
"""Deleting a profile should stop its gateway and remove the s6
service slot."""
subprocess.run(
["docker", "run", "-d", "--name", container_name, built_image,
"sleep", "120"],
check=True, capture_output=True, timeout=30,
)
time.sleep(3)
_sh(container_name, f"hermes profile create {PROFILE}")
_sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
time.sleep(3)
r = _sh(
container_name,
f"hermes profile delete {PROFILE} --yes",
timeout=30,
)
assert r.returncode == 0, f"profile delete failed: {r.stderr}"
time.sleep(2)
# Service slot should be gone.
r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}")
assert r.returncode != 0, "s6 service slot still present after profile delete"