From 1dfabe47b3b59b7def98efb72a4b5d62201ec3ff Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 23 May 2026 15:24:17 +1000 Subject: [PATCH] fix(docker): dashboard slot stays 'down' when HERMES_DASHBOARD unset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #30136 review caught a false positive: when HERMES_DASHBOARD was unset, the dashboard run script did `exec sleep infinity`, so `s6-svstat /run/service/dashboard` reported the slot as 'up'. `hermes doctor` and any other s6-svstat-based health check saw the dashboard as supervised-running even though no dashboard process existed. Add cont-init.d/03-dashboard-toggle: writes a `down` marker file into `/run/service/dashboard/` when HERMES_DASHBOARD is falsy, removes any leftover marker when it's truthy. s6-supervise honors `down` by not starting the service, so s6-svstat reports 'down' — matching reality. The run script's HERMES_DASHBOARD case-statement stays in place as a belt-and-suspenders guard, so the two layers can never disagree. Two new integration tests lock the behavior: slot reports down when unset; slot reports up when set to 1. --- Dockerfile | 1 + docker/cont-init.d/03-dashboard-toggle | 55 ++++++++++++++++++++++++++ tests/docker/test_dashboard.py | 54 +++++++++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100755 docker/cont-init.d/03-dashboard-toggle diff --git a/Dockerfile b/Dockerfile index eb5d9fb7e15..c51bca29e58 100644 --- a/Dockerfile +++ b/Dockerfile @@ -183,6 +183,7 @@ RUN mkdir -p /etc/cont-init.d && \ > /etc/cont-init.d/01-hermes-setup && \ chmod +x /etc/cont-init.d/01-hermes-setup COPY --chmod=0755 docker/cont-init.d/02-reconcile-profiles /etc/cont-init.d/02-reconcile-profiles +COPY --chmod=0755 docker/cont-init.d/03-dashboard-toggle /etc/cont-init.d/03-dashboard-toggle # ---------- Runtime ---------- ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist diff --git a/docker/cont-init.d/03-dashboard-toggle b/docker/cont-init.d/03-dashboard-toggle new file mode 100755 index 00000000000..59095f9c534 --- /dev/null +++ b/docker/cont-init.d/03-dashboard-toggle @@ -0,0 +1,55 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Toggle the dashboard s6-rc service slot based on HERMES_DASHBOARD. +# +# Runs as root in cont-init.d, after 01-hermes-setup (stage2) and +# 02-reconcile-profiles, BEFORE s6-rc starts user services. +# +# Background (PR #30136 review item I3): the dashboard service was +# always declared as an s6-rc longrun, with its run script checking +# HERMES_DASHBOARD and `exec sleep infinity` when unset. Trouble: +# s6-svstat then reports the dashboard slot as "up" (because sleep +# IS running) even though no dashboard process exists. `hermes +# doctor` and any other s6-svstat-based health check sees a +# false-positive up-state. +# +# Fix: write a `down` marker file into the live service-dir when +# HERMES_DASHBOARD is unset / falsy. s6-supervise honors `down` by +# not starting the service at all, so s6-svstat reports `down` — +# matching reality. +# +# The run script's HERMES_DASHBOARD case-statement stays in place +# as a belt-and-suspenders guard: even if the down marker is +# removed at runtime and the service is brought up, the run script +# still bails when HERMES_DASHBOARD is unset. Both layers agree. + +set -eu + +# Live service directory for the dashboard longrun. s6-overlay +# compiles /etc/s6-overlay/s6-rc.d/dashboard/ into this location +# at boot, before cont-init.d scripts run. +DASHBOARD_LIVE_DIR="/run/service/dashboard" + +# If the live directory hasn't materialized yet (e.g. running in a +# stripped-down test image), nothing to do — the run script's env +# check still keeps things safe. +if [ ! -d "$DASHBOARD_LIVE_DIR" ]; then + echo "[dashboard-toggle] $DASHBOARD_LIVE_DIR not present; skipping" + exit 0 +fi + +case "${HERMES_DASHBOARD:-}" in + 1|true|TRUE|True|yes|YES|Yes) + # Enabled — remove any leftover down marker from a previous boot. + if [ -e "$DASHBOARD_LIVE_DIR/down" ]; then + rm -f "$DASHBOARD_LIVE_DIR/down" + echo "[dashboard-toggle] HERMES_DASHBOARD enabled; removed down marker" + fi + ;; + *) + # Disabled — write a down marker so s6-supervise won't start + # the service. s6-svstat will report it as down, matching reality. + touch "$DASHBOARD_LIVE_DIR/down" + echo "[dashboard-toggle] HERMES_DASHBOARD unset; marked dashboard slot down" + ;; +esac diff --git a/tests/docker/test_dashboard.py b/tests/docker/test_dashboard.py index 652a2333851..56d4fa41c8a 100644 --- a/tests/docker/test_dashboard.py +++ b/tests/docker/test_dashboard.py @@ -52,6 +52,60 @@ def test_dashboard_not_running_by_default( ) +def test_dashboard_slot_reports_down_when_disabled( + built_image: str, container_name: str, +) -> None: + """Without HERMES_DASHBOARD, s6-svstat should report the dashboard + slot as DOWN (not up-with-sleep-infinity, which would + false-positive `hermes doctor` and any other health check). + + Locks the PR #30136 review item I3 fix: cont-init.d/03-dashboard-toggle + writes a `down` marker file in the live service-dir when + HERMES_DASHBOARD is unset, so the slot reflects reality. + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "60"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(5) + # /command/ isn't on PATH for docker-exec sessions, so call by + # absolute path. + r = docker_exec( + container_name, "/command/s6-svstat", "/run/service/dashboard", + ) + assert r.returncode == 0, f"s6-svstat failed: {r.stderr!r} / {r.stdout!r}" + assert "down" in r.stdout, ( + f"Dashboard slot should be 'down' without HERMES_DASHBOARD; " + f"svstat reports: {r.stdout!r}" + ) + + +def test_dashboard_slot_reports_up_when_enabled( + built_image: str, container_name: str, +) -> None: + """Symmetry: with HERMES_DASHBOARD=1, s6-svstat reports the slot as up.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_DASHBOARD=1", built_image, "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + # uvicorn takes a moment to bind; poll svstat. + deadline = time.monotonic() + 30.0 + last = "" + while time.monotonic() < deadline: + r = docker_exec( + container_name, "/command/s6-svstat", "/run/service/dashboard", + ) + last = r.stdout + if r.returncode == 0 and "up " in r.stdout: + return # success + time.sleep(0.5) + raise AssertionError( + f"Dashboard slot never reached up state; last svstat: {last!r}" + ) + + def test_dashboard_opt_in_starts( built_image: str, container_name: str, ) -> None: