From 6e6acdea2a128f700a4940d9998c31eac8126f5e Mon Sep 17 00:00:00 2001
From: Ben
Date: Thu, 21 May 2026 12:53:05 +1000
Subject: [PATCH] test(docker): lock baseline behavior for Phase 0 harness

Tasks 0.2-0.6 of the s6-overlay supervision plan. Locks the user-visible
behavior we must preserve through the Phase 2 init-system swap:

- test_main_invocation.py (Task 0.2): docker run with no args, chat
  subcommand passthrough, bare executable passthrough, bash pattern,
  exit-code propagation
- test_tui_passthrough.py (Task 0.3): TTY allocation via docker -t using
  the host's script(1) for a PTY
- test_dashboard.py (Task 0.4): HERMES_DASHBOARD=1 opt-in,
  HERMES_DASHBOARD_PORT override
- test_profile_gateway.py (Task 0.5): per-profile gateway start/stop and
  profile-delete-stops-gateway. Both marked xfail(strict=True) because
  the current tini image refuses gateway lifecycle commands inside the
  container; Phase 4 Task 4.3 flips them to passing.
- test_zombie_reaping.py (Task 0.6): PID 1 reaps orphaned zombies. tini
  does this today; s6-overlay's /init must continue to.

Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md
---
 tests/docker/test_dashboard.py       |  78 ++++++++++++++++++++++
 tests/docker/test_main_invocation.py |  79 ++++++++++++++++++++++
 tests/docker/test_profile_gateway.py | 113 ++++++++++++++++++++++++++++++++
 tests/docker/test_tui_passthrough.py |  52 +++++++++++++++
 tests/docker/test_zombie_reaping.py  |  47 +++++++++++++
 5 files changed, 369 insertions(+)
 create mode 100644 tests/docker/test_dashboard.py
 create mode 100644 tests/docker/test_main_invocation.py
 create mode 100644 tests/docker/test_profile_gateway.py
 create mode 100644 tests/docker/test_tui_passthrough.py
 create mode 100644 tests/docker/test_zombie_reaping.py

diff --git a/tests/docker/test_dashboard.py b/tests/docker/test_dashboard.py
new file mode 100644
index 00000000000..ff2d2e42e0d
--- /dev/null
+++ b/tests/docker/test_dashboard.py
@@ -0,0 +1,78 @@
+"""Harness: dashboard opt-in via HERMES_DASHBOARD.
+
+Today (tini): dashboard starts once when HERMES_DASHBOARD=1; if it crashes
+it stays dead. After Phase 2 (s6): dashboard starts once; if it crashes
+it is restarted under supervision. The restart-after-crash test lives in
+Phase 2 Task 2.5; this file only locks the opt-in surface (which must
+not change between tini and s6).
+"""
+from __future__ import annotations
+
+import subprocess
+import time
+
+
+def test_dashboard_not_running_by_default(
+    built_image: str, container_name: str,
+) -> None:
+    """Without HERMES_DASHBOARD, no dashboard process should be running."""
+    subprocess.run(
+        ["docker", "run", "-d", "--name", container_name, built_image,
+         "sleep", "30"],
+        check=True, capture_output=True, timeout=30,
+    )
+    time.sleep(3)
+    r = subprocess.run(
+        ["docker", "exec", container_name,
+         "pgrep", "-f", "hermes dashboard"],
+        capture_output=True, text=True, timeout=10,
+    )
+    # pgrep exits 1 when nothing matched. Any other non-zero status (pgrep
+    # usage/fatal error, or pgrep missing from the image) means the probe
+    # itself broke and must not be read as "dashboard not running".
+    assert r.returncode == 1, (
+        "Dashboard should not be running without HERMES_DASHBOARD "
+        f"(pgrep exit={r.returncode}, stderr={r.stderr!r})"
+    )
+
+
+def test_dashboard_opt_in_starts(
+    built_image: str, container_name: str,
+) -> None:
+    """With HERMES_DASHBOARD=1, a dashboard process should be visible."""
+    subprocess.run(
+        ["docker", "run", "-d", "--name", container_name,
+         "-e", "HERMES_DASHBOARD=1", built_image, "sleep", "30"],
+        check=True, capture_output=True, timeout=30,
+    )
+    time.sleep(5)
+    r = subprocess.run(
+        ["docker", "exec", container_name,
+         "pgrep", "-f", "hermes dashboard"],
+        capture_output=True, text=True, timeout=10,
+    )
+    assert r.returncode == 0, (
+        "Dashboard should be running with HERMES_DASHBOARD=1"
+    )
+
+
+def test_dashboard_port_override(
+    built_image: str, container_name: str,
+) -> None:
+    """HERMES_DASHBOARD_PORT changes the dashboard's listen port."""
+    subprocess.run(
+        ["docker", "run", "-d", "--name", container_name,
+         "-e", "HERMES_DASHBOARD=1", "-e", "HERMES_DASHBOARD_PORT=9120",
+         built_image, "sleep", "30"],
+        check=True, capture_output=True, timeout=30,
+    )
+    time.sleep(5)
+    r = subprocess.run(
+        ["docker", "exec", container_name, "sh", "-c",
+         "ss -tlnp 2>/dev/null | grep ':9120' "
+         "|| netstat -tln 2>/dev/null | grep ':9120'"],
+        capture_output=True, text=True, timeout=10,
+    )
+    assert "9120" in r.stdout, (
+        f"Dashboard not listening on port 9120: stdout={r.stdout!r}"
+    )
diff --git a/tests/docker/test_main_invocation.py b/tests/docker/test_main_invocation.py
new file mode 100644
index 00000000000..884b939153d
--- /dev/null
+++ b/tests/docker/test_main_invocation.py
@@ -0,0 +1,79 @@
+"""Harness: docker run IMAGE [cmd...] invocation patterns.
+
+These tests MUST pass on the current tini-based image AND continue to
+pass after the Phase 2 s6 migration. Any behavior drift is a regression.
+
+The harness expects ``built_image`` and ``container_name`` fixtures from
+``tests/docker/conftest.py``. When Docker isn't available every test
+here is skipped at collection time.
+"""
+from __future__ import annotations
+
+import subprocess
+
+
+def test_no_args_starts_hermes(built_image: str) -> None:
+    """``docker run IMAGE`` should start hermes cleanly.
+
+    We invoke ``--version`` so the call exits without needing a configured
+    model. Exit code may be 0 (printed version) or 1 (config bootstrapping
+    failure on a fresh volume), but never a stack trace.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "--version"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode in (0, 1), (
+        f"Unexpected exit {r.returncode}: stderr={r.stderr!r}"
+    )
+    assert "Traceback" not in r.stderr
+
+
+def test_chat_subcommand_passthrough(built_image: str) -> None:
+    """``docker run IMAGE chat --help`` should exec ``hermes chat --help``.
+
+    Uses ``--help`` so the call doesn't need an upstream model configured.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "chat", "--help"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0
+    combined = (r.stdout + r.stderr).lower()
+    assert "chat" in combined or "usage" in combined
+
+
+def test_bare_executable_passthrough(built_image: str) -> None:
+    """``docker run IMAGE sleep 1`` should exec ``sleep`` directly.
+
+    The entrypoint detects that ``sleep`` is on PATH and routes around the
+    hermes wrapper. Useful for long-lived sandbox mode and for testing.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "sleep", "1"],
+        capture_output=True, text=True, timeout=30,
+    )
+    assert r.returncode == 0
+
+
+def test_bash_pattern(built_image: str) -> None:
+    """``docker run IMAGE bash -c 'echo ok'`` should exec bash directly."""
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "bash", "-c", "echo ok"],
+        capture_output=True, text=True, timeout=30,
+    )
+    assert r.returncode == 0
+    assert "ok" in r.stdout
+
+
+def test_container_exit_code_matches_inner_exit(built_image: str) -> None:
+    """The container exit code must match the inner process's exit code.
+
+    Critical for CI: ``docker run hermes batch ...`` returns a
+    non-zero status when batch fails. Phase 2 (s6) must preserve this.
+    """
+    r = subprocess.run(
+        ["docker", "run", "--rm", built_image, "sh", "-c", "exit 42"],
+        capture_output=True, text=True, timeout=30,
+    )
+    assert r.returncode == 42
diff --git a/tests/docker/test_profile_gateway.py b/tests/docker/test_profile_gateway.py
new file mode 100644
index 00000000000..2e93f1f3b7b
--- /dev/null
+++ b/tests/docker/test_profile_gateway.py
@@ -0,0 +1,113 @@
+"""Harness: per-profile gateway start/stop inside the container.
+
+Phase 4 will change the *implementation* of these commands inside the
+container — they'll talk to s6 instead of refusing. The user-visible
+surface that should result is locked here.
+
+NOTE: These tests are marked ``xfail(strict=True)`` until Phase 4 lands.
+The current tini image deliberately refuses gateway start/stop inside
+containers — ``pgrep`` finds nothing and the tests fail. After Phase 4
+they will start passing; ``strict=True`` reports that unexpected pass as
+a failure, which forces the marker to be removed (Task 4.3) and also
+protects against side-channel fixes outside the planned mechanism.
+"""
+from __future__ import annotations
+
+import subprocess
+import time
+
+import pytest
+
+PROFILE = "test-harness-profile"
+
+_PHASE4_REASON = (
+    "Phase 4 not yet landed: container-side `hermes gateway start` "
+    "currently exits 0 with an informational message instead of "
+    "spawning/supervising a gateway. Remove this marker after Task 4.3."
+)
+
+# These probes run as ``sh -c "pgrep -f ..."``, so the wrapping shell's own
+# command line contains the pattern text. The ``[g]`` bracket stops the
+# pattern from matching itself when that shell forks pgrep (no exec).
+_GATEWAY_PGREP = f"pgrep -f '[g]ateway.*{PROFILE}'"
+
+
+def _sh(
+    container: str, command: str, timeout: int = 30,
+) -> subprocess.CompletedProcess[str]:
+    return subprocess.run(
+        ["docker", "exec", container, "sh", "-c", command],
+        capture_output=True, text=True, timeout=timeout,
+    )
+
+
+@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True)
+def test_profile_create_then_gateway_start(
+    built_image: str, container_name: str,
+) -> None:
+    subprocess.run(
+        ["docker", "run", "-d", "--name", container_name, built_image,
+         "sleep", "120"],
+        check=True, capture_output=True, timeout=30,
+    )
+    time.sleep(3)
+
+    r = _sh(container_name, f"hermes profile create {PROFILE}")
+    assert r.returncode == 0, f"profile create failed: {r.stderr}"
+
+    r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
+    assert r.returncode == 0, (
+        f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}"
+    )
+
+    time.sleep(3)
+
+    r = _sh(container_name, _GATEWAY_PGREP)
+    assert r.returncode == 0, "gateway process not running"
+
+    r = _sh(container_name, f"hermes -p {PROFILE} gateway stop", timeout=30)
+    assert r.returncode == 0
+
+    time.sleep(2)
+
+    r = _sh(container_name, _GATEWAY_PGREP)
+    assert r.returncode != 0, "gateway process still running after stop"
+
+
+@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True)
+def test_profile_delete_stops_gateway(
+    built_image: str, container_name: str,
+) -> None:
+    """Deleting a profile should stop its gateway if running."""
+    subprocess.run(
+        ["docker", "run", "-d", "--name", container_name, built_image,
+         "sleep", "120"],
+        check=True, capture_output=True, timeout=30,
+    )
+    time.sleep(3)
+
+    r = _sh(container_name, f"hermes profile create {PROFILE}")
+    assert r.returncode == 0, f"profile create failed: {r.stderr}"
+
+    r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
+    assert r.returncode == 0, (
+        f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}"
+    )
+    time.sleep(3)
+
+    # Precondition: without a live gateway the final "not running" check is
+    # vacuous, and the test could pass (a strict-xfail failure) on an image
+    # that never started one.
+    r = _sh(container_name, _GATEWAY_PGREP)
+    assert r.returncode == 0, "gateway process not running before delete"
+
+    r = _sh(
+        container_name,
+        f"hermes profile delete {PROFILE} --yes",
+        timeout=30,
+    )
+    assert r.returncode == 0
+
+    time.sleep(2)
+    r = _sh(container_name, _GATEWAY_PGREP)
+    assert r.returncode != 0, "gateway still running after profile delete"
diff --git a/tests/docker/test_tui_passthrough.py b/tests/docker/test_tui_passthrough.py
new file mode 100644
index 00000000000..6de78216fd5
--- /dev/null
+++ b/tests/docker/test_tui_passthrough.py
@@ -0,0 +1,52 @@
+"""Harness: interactive TUI TTY passthrough.
+
+Uses ``script -qc`` on the host to allocate a PTY for the docker client,
+which then allocates a container-side PTY via ``-t``. The probe inside
+the container is ``tput cols``, which returns a real column count when
+stdout is a TTY and either prints ``80`` (the terminfo fallback) or
+nothing when it is not.
+
+These tests MUST pass on the current tini-based image AND continue to
+pass after the Phase 2 s6 migration. Any drift is a regression.
+"""
+from __future__ import annotations
+
+import shlex
+import shutil
+import subprocess
+
+import pytest
+
+pytestmark = pytest.mark.skipif(
+    shutil.which("script") is None,
+    reason="`script` command not available on this host",
+)
+
+
+def test_tty_passthrough_to_container(built_image: str) -> None:
+    """``docker run -t`` must deliver a real TTY to the container process."""
+    probe = "if [ -t 1 ]; then tput cols; else echo NO_TTY; fi"
+    cmd = (
+        f"docker run --rm -t -e COLUMNS=123 {shlex.quote(built_image)} "
+        f"sh -c {shlex.quote(probe)}"
+    )
+    r = subprocess.run(
+        ["script", "-qc", cmd, "/dev/null"],
+        capture_output=True, text=True, timeout=120,
+    )
+    output = r.stdout.strip()
+    assert "NO_TTY" not in output, f"TTY passthrough failed: {output!r}"
+    numeric_lines = [s for s in output.split() if s.strip().isdigit()]
+    assert numeric_lines, f"No numeric width in output: {output!r}"
+    assert int(numeric_lines[0]) > 0
+
+
+def test_tui_flag_recognized(built_image: str) -> None:
+    """``docker run -it IMAGE --help`` should run without crashing."""
+    cmd = f"docker run --rm -t {shlex.quote(built_image)} --help"
+    r = subprocess.run(
+        # -e makes script(1) return docker's exit status instead of its own.
+        ["script", "-qec", cmd, "/dev/null"],
+        capture_output=True, text=True, timeout=60,
+    )
+    assert r.returncode == 0
diff --git a/tests/docker/test_zombie_reaping.py b/tests/docker/test_zombie_reaping.py
new file mode 100644
index 00000000000..8aa797b57d1
--- /dev/null
+++ b/tests/docker/test_zombie_reaping.py
@@ -0,0 +1,47 @@
+"""Harness: PID 1 must reap orphaned zombie processes.
+
+tini (current PID 1) reaps zombies via its built-in subreaper behavior.
+s6-overlay's ``/init`` (Phase 2 PID 1) does the same. This invariant is
+required for long-running containers spawning subprocesses (subagents,
+dashboard, dynamic gateways) — otherwise the process table fills with
+defunct entries and eventually exhausts the kernel PID space.
+"""
+from __future__ import annotations
+
+import subprocess
+import time
+
+
+def test_orphan_zombies_reaped(
+    built_image: str, container_name: str,
+) -> None:
+    """Spawn an orphan child that exits immediately. PID 1 must reap it."""
+    subprocess.run(
+        ["docker", "run", "-d", "--name", container_name, built_image,
+         "sleep", "60"],
+        check=True, capture_output=True, timeout=30,
+    )
+    time.sleep(2)
+
+    # `( ( sleep 0.1 & ) & ); sleep 1` creates a grandchild detached from
+    # the original docker exec session — it becomes an orphan reparented
+    # to PID 1 in the container. When it exits, PID 1 must reap it.
+    subprocess.run(
+        ["docker", "exec", container_name, "sh", "-c",
+         "( ( sleep 0.1 & ) & ); sleep 1"],
+        capture_output=True, text=True, timeout=10,
+    )
+    time.sleep(1)
+
+    r = subprocess.run(
+        ["docker", "exec", container_name, "ps", "axo", "stat,pid,comm"],
+        capture_output=True, text=True, timeout=10,
+    )
+    # A failed or missing ``ps`` prints nothing to stdout; without this
+    # check that empty listing would read as "no zombies".
+    assert r.returncode == 0, f"ps failed: stderr={r.stderr!r}"
+    zombies = [
+        line for line in r.stdout.splitlines()
+        if line.strip().startswith("Z")
+    ]
+    assert not zombies, f"Zombies not reaped by PID 1: {zombies}"