test(docker): lock baseline behavior for Phase 0 harness

Tasks 0.2-0.6 of the s6-overlay supervision plan. Locks the
user-visible behavior we must preserve through the Phase 2 init-
system swap:

- test_main_invocation.py (Task 0.2): docker run <image> with no
  args, chat subcommand passthrough, bare executable passthrough,
  bash pattern, exit-code propagation
- test_tui_passthrough.py (Task 0.3): TTY allocation via docker -t
  using the host's script(1) for a PTY
- test_dashboard.py (Task 0.4): HERMES_DASHBOARD=1 opt-in,
  HERMES_DASHBOARD_PORT override
- test_profile_gateway.py (Task 0.5): per-profile gateway
  start/stop and profile-delete-stops-gateway. Both marked
  xfail(strict=True) because the current tini image refuses
  gateway lifecycle commands inside the container; Phase 4
  Task 4.3 flips them to passing.
- test_zombie_reaping.py (Task 0.6): PID 1 reaps orphaned
  zombies. tini does this today; s6-overlay's /init must
  continue to.

Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md
This commit is contained in:
Ben 2026-05-21 12:53:05 +10:00 committed by teknium1
parent 08302135b6
commit 6e6acdea2a
No known key found for this signature in database
5 changed files with 346 additions and 0 deletions

View file

@ -0,0 +1,75 @@
"""Harness: dashboard opt-in via HERMES_DASHBOARD.
Today (tini): dashboard starts once when HERMES_DASHBOARD=1; if it crashes
it stays dead. After Phase 2 (s6): dashboard starts once; if it crashes
it is restarted under supervision. The restart-after-crash test lives in
Phase 2 Task 2.5; this file only locks the opt-in surface (which must
not change between tini and s6).
"""
from __future__ import annotations
import subprocess
import time
def test_dashboard_not_running_by_default(
    built_image: str, container_name: str,
) -> None:
    """Without HERMES_DASHBOARD, no dashboard process should be running.

    Starts a detached container running ``sleep 30``, waits a fixed
    settle delay, then looks for a ``hermes dashboard`` process with
    ``pgrep -f``.

    A bare "non-zero exit" check would also be satisfied by a failed
    ``docker exec`` or a missing ``pgrep`` binary, so the test first
    proves the probe can run at all and then requires pgrep's explicit
    "no process matched" status (1).
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "30"],
        check=True, capture_output=True, timeout=30,
    )
    time.sleep(3)

    # Precondition: the container accepts exec and ships pgrep. Without
    # this, the negative assertion below could pass for the wrong reason.
    tooling = subprocess.run(
        ["docker", "exec", container_name, "sh", "-c", "command -v pgrep"],
        capture_output=True, text=True, timeout=10,
    )
    assert tooling.returncode == 0, (
        "Cannot probe container (exec failed or pgrep missing): "
        f"exit={tooling.returncode} stderr={tooling.stderr!r}"
    )

    r = subprocess.run(
        ["docker", "exec", container_name,
         "pgrep", "-f", "hermes dashboard"],
        capture_output=True, text=True, timeout=10,
    )
    # pgrep exits 1 when no process matched; 2/3 are pgrep errors and
    # 126/127 mean the command could not be invoked inside the container.
    assert r.returncode == 1, (
        "Dashboard should not be running without HERMES_DASHBOARD "
        f"(pgrep exit={r.returncode} stdout={r.stdout!r} "
        f"stderr={r.stderr!r})"
    )
def test_dashboard_opt_in_starts(
    built_image: str, container_name: str,
) -> None:
    """HERMES_DASHBOARD=1 must leave a dashboard process running.

    The container is started detached with the opt-in variable set and,
    after a fixed delay, ``pgrep -f`` must find ``hermes dashboard``.
    """
    launch_argv = [
        "docker", "run", "-d",
        "--name", container_name,
        "-e", "HERMES_DASHBOARD=1",
        built_image, "sleep", "30",
    ]
    subprocess.run(launch_argv, check=True, capture_output=True, timeout=30)

    # Fixed settle delay before looking for the process.
    time.sleep(5)

    probe_argv = [
        "docker", "exec", container_name,
        "pgrep", "-f", "hermes dashboard",
    ]
    probe = subprocess.run(
        probe_argv, capture_output=True, text=True, timeout=10,
    )
    dashboard_found = probe.returncode == 0
    assert dashboard_found, (
        "Dashboard should be running with HERMES_DASHBOARD=1"
    )
def test_dashboard_port_override(
    built_image: str, container_name: str,
) -> None:
    """HERMES_DASHBOARD_PORT selects the port the dashboard listens on.

    Starts the container with the dashboard enabled on port 9120 and,
    after a fixed delay, searches the listening-socket table (``ss``,
    falling back to ``netstat``) for that port.
    """
    launch_argv = [
        "docker", "run", "-d",
        "--name", container_name,
        "-e", "HERMES_DASHBOARD=1",
        "-e", "HERMES_DASHBOARD_PORT=9120",
        built_image, "sleep", "30",
    ]
    subprocess.run(launch_argv, check=True, capture_output=True, timeout=30)

    # Fixed settle delay before inspecting sockets.
    time.sleep(5)

    # Try ss first; fall back to netstat when ss yields no match.
    socket_query = (
        "ss -tlnp 2>/dev/null | grep ':9120' "
        "|| netstat -tln 2>/dev/null | grep ':9120'"
    )
    listing = subprocess.run(
        ["docker", "exec", container_name, "sh", "-c", socket_query],
        capture_output=True, text=True, timeout=10,
    )
    port_seen = "9120" in listing.stdout
    assert port_seen, (
        f"Dashboard not listening on port 9120: stdout={listing.stdout!r}"
    )

View file

@ -0,0 +1,79 @@
"""Harness: docker run <image> [cmd...] invocation patterns.
These tests MUST pass on the current tini-based image AND continue to
pass after the Phase 2 s6 migration. Any behavior drift is a regression.
The harness expects ``built_image`` and ``container_name`` fixtures from
``tests/docker/conftest.py``. When Docker isn't available every test
here is skipped at collection time.
"""
from __future__ import annotations
import subprocess
def test_no_args_starts_hermes(built_image: str) -> None:
    """The image's default entrypoint must launch hermes without crashing.

    ``--version`` is passed so the run terminates without a configured
    model. Exit status 0 (version printed) and 1 (config bootstrapping
    failure on a fresh volume) are both accepted; a Python traceback on
    stderr is not.
    """
    argv = ["docker", "run", "--rm", built_image, "--version"]
    proc = subprocess.run(argv, capture_output=True, text=True, timeout=60)

    acceptable_exits = (0, 1)
    assert proc.returncode in acceptable_exits, (
        f"Unexpected exit {proc.returncode}: stderr={proc.stderr!r}"
    )
    assert "Traceback" not in proc.stderr
def test_chat_subcommand_passthrough(built_image: str) -> None:
    """``docker run <image> chat --help`` must reach ``hermes chat --help``.

    ``--help`` keeps the invocation independent of any configured
    upstream model.
    """
    argv = ["docker", "run", "--rm", built_image, "chat", "--help"]
    proc = subprocess.run(argv, capture_output=True, text=True, timeout=60)

    assert proc.returncode == 0
    # Help text may land on either stream, so search both.
    transcript = f"{proc.stdout}{proc.stderr}".lower()
    assert any(word in transcript for word in ("chat", "usage"))
def test_bare_executable_passthrough(built_image: str) -> None:
    """``docker run <image> sleep 1`` must run ``sleep`` itself.

    Per the entrypoint contract, a first argument that resolves on PATH
    bypasses the hermes wrapper; this supports long-lived sandbox mode
    and testing.
    """
    argv = ["docker", "run", "--rm", built_image, "sleep", "1"]
    proc = subprocess.run(argv, capture_output=True, text=True, timeout=30)
    assert proc.returncode == 0
def test_bash_pattern(built_image: str) -> None:
    """``docker run <image> bash -c 'echo ok'`` must run bash directly."""
    argv = ["docker", "run", "--rm", built_image, "bash", "-c", "echo ok"]
    proc = subprocess.run(argv, capture_output=True, text=True, timeout=30)

    exited_cleanly = proc.returncode == 0
    assert exited_cleanly
    assert "ok" in proc.stdout
def test_container_exit_code_matches_inner_exit(built_image: str) -> None:
    """The inner process's exit status must become the container's.

    CI relies on this: ``docker run <image> hermes batch ...`` has to
    report non-zero when the batch fails, and the Phase 2 (s6) migration
    must keep that behavior.
    """
    expected_status = 42
    argv = ["docker", "run", "--rm", built_image, "sh", "-c", "exit 42"]
    proc = subprocess.run(argv, capture_output=True, text=True, timeout=30)
    assert proc.returncode == expected_status

View file

@ -0,0 +1,97 @@
"""Harness: per-profile gateway start/stop inside the container.
Phase 4 will change the *implementation* of these commands inside the
container: they'll talk to s6 instead of refusing. The user-visible
surface that should result is locked here.
NOTE: These tests are marked ``xfail(strict=True)`` until Phase 4 lands.
The current tini image deliberately refuses gateway start/stop inside
containers, so ``pgrep`` finds nothing and the tests fail. After Phase 4
they should flip to passing automatically; ``strict=True`` means an
unexpected pass also fails the test, protecting against side-channel
fixes outside the planned Phase 4 mechanism.
"""
from __future__ import annotations
import subprocess
import time
import pytest
# Name of the throwaway profile the tests in this module create inside
# the container.
PROFILE = "test-harness-profile"
# Shared ``reason=`` text for the ``xfail`` markers on the tests below.
_PHASE4_REASON = (
"Phase 4 not yet landed: container-side `hermes gateway start` "
"currently exits 0 with an informational message instead of "
"spawning/supervising a gateway. Remove this marker after Task 4.3."
)
def _sh(
    container: str, command: str, timeout: int = 30,
) -> subprocess.CompletedProcess[str]:
    """Run *command* through ``sh -c`` inside *container* via ``docker exec``.

    Output is captured as text. The exit status is not checked here;
    callers inspect ``returncode`` on the returned object.
    """
    argv = ["docker", "exec", container, "sh", "-c", command]
    completed = subprocess.run(
        argv,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    return completed
@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True)
def test_profile_create_then_gateway_start(
    built_image: str, container_name: str,
) -> None:
    """Create a profile, start its gateway, then stop it again.

    Each ``hermes`` command must exit 0, a process matching
    ``gateway.*<PROFILE>`` must exist after ``gateway start``, and none
    may remain after ``gateway stop``.
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "120"],
        check=True, capture_output=True, timeout=30,
    )
    time.sleep(3)

    def _pgrep_gateway() -> subprocess.CompletedProcess[str]:
        # Exec pgrep directly rather than through ``sh -c``: a wrapping
        # shell's own command line contains the pattern text, and
        # ``pgrep -f`` only excludes itself, so a shell that does not
        # exec its final command would be reported as a "gateway".
        return subprocess.run(
            ["docker", "exec", container_name,
             "pgrep", "-f", f"gateway.*{PROFILE}"],
            capture_output=True, text=True, timeout=30,
        )

    r = _sh(container_name, f"hermes profile create {PROFILE}")
    assert r.returncode == 0, f"profile create failed: {r.stderr}"

    r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
    assert r.returncode == 0, (
        f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )
    time.sleep(3)
    r = _pgrep_gateway()
    assert r.returncode == 0, "gateway process not running"

    r = _sh(container_name, f"hermes -p {PROFILE} gateway stop", timeout=30)
    assert r.returncode == 0
    time.sleep(2)
    r = _pgrep_gateway()
    assert r.returncode != 0, "gateway process still running after stop"
@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True)
def test_profile_delete_stops_gateway(
    built_image: str, container_name: str,
) -> None:
    """Deleting a profile should stop its gateway if running.

    The setup steps are asserted as well: unless the gateway is
    confirmed running *before* the delete, the final "not running"
    check would hold trivially and say nothing about the delete.
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "120"],
        check=True, capture_output=True, timeout=30,
    )
    time.sleep(3)

    def _pgrep_gateway() -> subprocess.CompletedProcess[str]:
        # Exec pgrep directly rather than through ``sh -c``: a wrapping
        # shell's own command line contains the pattern text, and
        # ``pgrep -f`` only excludes itself, so a shell that does not
        # exec its final command would be reported as a "gateway".
        return subprocess.run(
            ["docker", "exec", container_name,
             "pgrep", "-f", f"gateway.*{PROFILE}"],
            capture_output=True, text=True, timeout=30,
        )

    r = _sh(container_name, f"hermes profile create {PROFILE}")
    assert r.returncode == 0, f"profile create failed: {r.stderr}"
    r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
    assert r.returncode == 0, (
        f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )
    time.sleep(3)
    # Precondition for the real assertion below.
    r = _pgrep_gateway()
    assert r.returncode == 0, "gateway process not running before delete"

    r = _sh(
        container_name,
        f"hermes profile delete {PROFILE} --yes",
        timeout=30,
    )
    assert r.returncode == 0
    time.sleep(2)
    r = _pgrep_gateway()
    assert r.returncode != 0, "gateway still running after profile delete"

View file

@ -0,0 +1,51 @@
"""Harness: interactive TUI TTY passthrough.
Uses ``script -qc`` on the host to allocate a PTY for the docker client,
which then allocates a container-side PTY via ``-t``. The probe inside
the container is ``tput cols``, which returns a real column count when
stdout is a TTY and either prints ``80`` (the terminfo fallback) or
nothing when it is not.
These tests MUST pass on the current tini-based image AND continue to
pass after the Phase 2 s6 migration. Any drift is a regression.
"""
from __future__ import annotations
import shlex
import shutil
import subprocess
import pytest
# Module-wide skip: the tests in this module shell out to the host's
# ``script`` binary, so they are skipped when it is not on PATH.
# NOTE(review): the tests invoke it as ``script -qc <cmd> /dev/null``,
# which looks like util-linux syntax; confirm whether hosts with a
# different ``script`` implementation should be skipped as well.
pytestmark = pytest.mark.skipif(
shutil.which("script") is None,
reason="`script` command not available on this host",
)
def test_tty_passthrough_to_container(built_image: str) -> None:
    """A container started with ``docker run -t`` must see a TTY on stdout.

    The host's ``script`` supplies a PTY to the docker client. Inside the
    container a shell snippet prints ``tput cols`` when stdout is a
    terminal and the sentinel ``NO_TTY`` otherwise.
    """
    probe = "if [ -t 1 ]; then tput cols; else echo NO_TTY; fi"
    docker_cmd = (
        f"docker run --rm -t -e COLUMNS=123 {built_image} "
        f"sh -c {shlex.quote(probe)}"
    )
    script_argv = ["script", "-qc", docker_cmd, "/dev/null"]
    completed = subprocess.run(
        script_argv, capture_output=True, text=True, timeout=120,
    )

    output = completed.stdout.strip()
    assert "NO_TTY" not in output, f"TTY passthrough failed: {output!r}"

    # Whitespace-separated tokens made only of digits are candidate widths.
    widths = [token for token in output.split() if token.isdigit()]
    assert widths, f"No numeric width in output: {output!r}"
    first_width = int(widths[0])
    assert first_width > 0
def test_tui_flag_recognized(built_image: str) -> None:
    """``docker run -t <image> --help`` under a PTY should exit 0.

    The command runs beneath the host's ``script`` so the docker client
    has a PTY. ``script`` is invoked with ``-e`` (``--return``) so that
    its own exit status is the child's; without that flag util-linux
    ``script`` reports success regardless of how ``docker run`` exited
    and the return-code assertion would be vacuous.
    """
    cmd = f"docker run --rm -t {built_image} --help"
    r = subprocess.run(
        ["script", "-qec", cmd, "/dev/null"],
        capture_output=True, text=True, timeout=60,
    )
    assert r.returncode == 0, (
        f"--help under a TTY exited {r.returncode}: output={r.stdout!r}"
    )

View file

@ -0,0 +1,44 @@
"""Harness: PID 1 must reap orphaned zombie processes.
tini (current PID 1) reaps zombies via its built-in subreaper behavior.
s6-overlay's ``/init`` (Phase 2 PID 1) does the same. This invariant is
required for long-running containers spawning subprocesses (subagents,
dashboard, dynamic gateways); otherwise the process table fills with
defunct entries and eventually exhausts the kernel PID space.
"""
from __future__ import annotations
import subprocess
import time
def test_orphan_zombies_reaped(
    built_image: str, container_name: str,
) -> None:
    """Spawn an orphan child that exits immediately. PID 1 must reap it.

    Both helper ``docker exec`` calls are checked for success: if the
    orphan was never spawned, or ``ps`` could not run, the process
    listing would be empty and "no zombies" would hold for the wrong
    reason.
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "60"],
        check=True, capture_output=True, timeout=30,
    )
    time.sleep(2)
    # `( ( sleep 0.1 & ) & ); sleep 1` creates a grandchild detached from
    # the original docker exec session — it becomes an orphan reparented
    # to PID 1 in the container. When it exits, PID 1 must reap it.
    spawn = subprocess.run(
        ["docker", "exec", container_name, "sh", "-c",
         "( ( sleep 0.1 & ) & ); sleep 1"],
        capture_output=True, text=True, timeout=10,
    )
    assert spawn.returncode == 0, (
        f"Failed to spawn orphan: exit={spawn.returncode} "
        f"stderr={spawn.stderr!r}"
    )
    time.sleep(1)
    r = subprocess.run(
        ["docker", "exec", container_name, "ps", "axo", "stat,pid,comm"],
        capture_output=True, text=True, timeout=10,
    )
    assert r.returncode == 0, (
        f"Could not list processes: exit={r.returncode} "
        f"stderr={r.stderr!r}"
    )
    # STAT is the first column; a leading "Z" marks a defunct process.
    zombies = [
        line for line in r.stdout.splitlines()
        if line.strip().startswith("Z")
    ]
    assert not zombies, f"Zombies not reaped by PID 1: {zombies}"