From 4826ea7b413feaa05a27f85cb546ddbf2a099d21 Mon Sep 17 00:00:00 2001 From: Ben Date: Thu, 21 May 2026 15:33:25 +1000 Subject: [PATCH] feat(docker)!: replace tini with s6-overlay as PID 1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BREAKING CHANGE: the container ENTRYPOINT is now /init (s6-overlay) instead of /usr/bin/tini. Main hermes runs as the container CMD with TTY inherited (preserving --tui), dashboard runs as a supervised s6-rc service (HERMES_DASHBOARD=1 starts it; crashes auto-restart), and the ground is laid for per-profile gateway supervision (Phase 3+4). All five pre-s6 docker run invocation patterns continue to work identically — verified by the Phase 0 docker harness: docker run → `hermes` with no args docker run chat -q "..." → `hermes chat -q ...` passthrough docker run sleep infinity → `sleep infinity` direct docker run bash → interactive bash docker run -it --tui → interactive Ink TUI Phase 2 harness result: 12 passed, 2 xfailed (Phase 4 target). Hadolint + shellcheck pass cleanly. Architecture pivot from plan v3 (documented in main-hermes/run header): the plan called for main hermes to be an s6-supervised service, but two real s6-overlay v3 mechanics blocked that — cont-init.d scripts receive no arguments (CMD args are not visible to stage2-hook), and `/run/s6/basedir/bin/halt` after writing the exit code did not propagate the desired exit code (container exits 143). We use the s6-overlay-native CMD pattern instead: main-wrapper.sh is the container's main program (ENTRYPOINT prepends it so leading-dash args like --version aren't intercepted by /init), exec's the final program with stdin/stdout/stderr inherited, and the program's exit code becomes the container exit code. main-hermes is now a no-op `sleep infinity` slot kept for future supervised-gateway-container modes. 
This trades "supervised restart of main hermes" for arg-parity with the pre-s6 contract — main hermes was already unsupervised under tini, so we lose nothing functional. Dashboard supervision is the only new guarantee added by this phase. Files added: docker/main-wrapper.sh # arg routing + s6-setuidgid drop docker/stage2-hook.sh # gosu-equivalent + chown + seed docker/s6-rc.d/main-hermes/{type,run,dependencies.d/base} docker/s6-rc.d/dashboard/{type,run,dependencies.d/base} docker/s6-rc.d/user/contents.d/{main-hermes,dashboard} Files changed: Dockerfile: tini → s6-overlay install + ENTRYPOINT flip + service wiring docker/entrypoint.sh: thin shim to stage2-hook.sh for back-compat tests/docker/test_dashboard.py: add test_dashboard_restarts_after_crash Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md --- Dockerfile | 69 ++++++- docker/entrypoint.sh | 168 +----------------- docker/main-wrapper.sh | 30 ++++ docker/s6-rc.d/dashboard/dependencies.d/base | 0 docker/s6-rc.d/dashboard/run | 30 ++++ docker/s6-rc.d/dashboard/type | 1 + .../s6-rc.d/main-hermes/dependencies.d/base | 0 docker/s6-rc.d/main-hermes/run | 27 +++ docker/s6-rc.d/main-hermes/type | 1 + docker/s6-rc.d/user/contents.d/dashboard | 0 docker/s6-rc.d/user/contents.d/main-hermes | 0 docker/stage2-hook.sh | 105 +++++++++++ tests/docker/test_dashboard.py | 64 +++++++ 13 files changed, 331 insertions(+), 164 deletions(-) create mode 100755 docker/main-wrapper.sh create mode 100644 docker/s6-rc.d/dashboard/dependencies.d/base create mode 100755 docker/s6-rc.d/dashboard/run create mode 100644 docker/s6-rc.d/dashboard/type create mode 100644 docker/s6-rc.d/main-hermes/dependencies.d/base create mode 100755 docker/s6-rc.d/main-hermes/run create mode 100644 docker/s6-rc.d/main-hermes/type create mode 100644 docker/s6-rc.d/user/contents.d/dashboard create mode 100644 docker/s6-rc.d/user/contents.d/main-hermes create mode 100755 docker/stage2-hook.sh diff --git a/Dockerfile b/Dockerfile index 
6e8f0209636..1db0e1c8d5e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,14 +9,32 @@ ENV PYTHONUNBUFFERED=1 # install survives the /opt/data volume overlay at runtime. ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright -# Install system dependencies in one layer, clear APT cache -# tini reaps orphaned zombie processes (MCP stdio subprocesses, git, bun, etc.) -# that would otherwise accumulate when hermes runs as PID 1. See #15012. +# Install system dependencies in one layer, clear APT cache. +# tini was previously PID 1 to reap orphaned zombie processes (MCP stdio +# subprocesses, git, bun, etc.) that would otherwise accumulate when hermes +# ran as PID 1. See #15012. Phase 2 of the s6-overlay supervision plan +# replaces tini with s6-overlay's /init (PID 1 = s6-svscan), which reaps +# zombies non-blockingly on SIGCHLD and additionally supervises the +# dashboard; main hermes itself runs unsupervised as the container CMD. RUN apt-get update && \ apt-get install -y --no-install-recommends \ - build-essential curl nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli tini && \ + build-essential curl nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli xz-utils && \ rm -rf /var/lib/apt/lists/* +# ---------- s6-overlay install ---------- +# s6-overlay supervises the dashboard now and per-profile gateways in +# later phases (Phase 3+4). /init becomes PID 1 below — see ENTRYPOINT. +# x86_64 only for now; aarch64 (Apple Silicon, ARM servers) is a follow-up +# that needs TARGETARCH plumbing for the arch-specific tarball ADD. 
+ARG S6_OVERLAY_VERSION=3.2.3.0 +ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz /tmp/ +ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-x86_64.tar.xz /tmp/ +ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-symlinks-noarch.tar.xz /tmp/ +RUN tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz && \ + tar -C / -Jxpf /tmp/s6-overlay-x86_64.tar.xz && \ + tar -C / -Jxpf /tmp/s6-overlay-symlinks-noarch.tar.xz && \ + rm /tmp/s6-overlay-*.tar.xz + # Non-root user for runtime; UID can be overridden via HERMES_UID at runtime RUN useradd -u 10000 -m -d /opt/data hermes @@ -111,10 +129,51 @@ RUN chmod -R a+rX /opt/hermes && \ # this a fast (~1s) egg-link creation with no resolution or downloads. RUN uv pip install --no-cache-dir --no-deps -e "." +# ---------- s6-overlay service wiring ---------- +# Static services declared at build time: main-hermes + dashboard. +# Per-profile gateway services are registered dynamically at runtime by +# the profile create/delete hooks (Phase 4); they live under +# /run/service/ (tmpfs) and are reconciled on container restart by +# /etc/cont-init.d/02-reconcile-profiles (Phase 4 Task 4.0). +COPY docker/s6-rc.d/ /etc/s6-overlay/s6-rc.d/ + +# stage2-hook handles UID/GID remap, volume chown, config seeding, +# and skills sync — the setup work the old entrypoint.sh did before +# its final `exec hermes`, minus the dashboard launch. Wired in as +# cont-init.d/01-hermes-setup so it runs before any user services start. 
+RUN mkdir -p /etc/cont-init.d && \ + printf '#!/bin/sh\nexec /opt/hermes/docker/stage2-hook.sh\n' \ + > /etc/cont-init.d/01-hermes-setup && \ + chmod +x /etc/cont-init.d/01-hermes-setup + # ---------- Runtime ---------- ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist ENV HERMES_HOME=/opt/data ENV PATH="/opt/data/.local/bin:${PATH}" RUN mkdir -p /opt/data VOLUME [ "/opt/data" ] -ENTRYPOINT [ "/usr/bin/tini", "-g", "--", "/opt/hermes/docker/entrypoint.sh" ] + +# s6-overlay's /init is PID 1. It sets up the supervision tree, runs +# /etc/cont-init.d/* (our stage2 hook), starts s6-rc services +# declared in /etc/s6-overlay/s6-rc.d/, then exec's its remaining +# argv as the container's "main program" with stdin/stdout/stderr +# inherited (this is what makes interactive --tui work). When the +# main program exits, /init begins stage 3 shutdown and the container +# exits with the program's exit code. Replaces tini — see Phase 2 of +# docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md. +# +# We use the ENTRYPOINT+CMD split rather than CMD alone so the +# wrapper is prepended to user-supplied args automatically: +# +# docker run → /init main-wrapper.sh (CMD default) +# docker run chat -q "hi" → /init main-wrapper.sh chat -q hi +# docker run sleep infinity → /init main-wrapper.sh sleep infinity +# docker run --tui → /init main-wrapper.sh --tui +# +# main-wrapper.sh handles arg routing (bare-exec vs. hermes +# subcommand vs. no-args), drops to the hermes user via s6-setuidgid, +# and exec's the final program so its exit code becomes the container +# exit code. Without the wrapper-as-ENTRYPOINT, leading-dash args +# like `--version` would be intercepted by /init's POSIX shell. 
+ENTRYPOINT [ "/init", "/opt/hermes/docker/main-wrapper.sh" ] +CMD [ ] diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 9af045e226f..b1b44d8abf0 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -1,160 +1,10 @@ -#!/bin/bash -# Docker/Podman entrypoint: bootstrap config files into the mounted volume, then run hermes. -set -e - -HERMES_HOME="${HERMES_HOME:-/opt/data}" -INSTALL_DIR="/opt/hermes" - -# --- Privilege dropping via gosu --- -# When started as root (the default for Docker, or fakeroot in rootless Podman), -# optionally remap the hermes user/group to match host-side ownership, fix volume -# permissions, then re-exec as hermes. -if [ "$(id -u)" = "0" ]; then - if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then - echo "Changing hermes UID to $HERMES_UID" - usermod -u "$HERMES_UID" hermes - fi - - if [ -n "$HERMES_GID" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then - echo "Changing hermes GID to $HERMES_GID" - # -o allows non-unique GID (e.g. macOS GID 20 "staff" may already exist - # as "dialout" in the Debian-based container image) - groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true - fi - - # Fix ownership of the data volume. When HERMES_UID remaps the hermes user, - # files created by previous runs (under the old UID) become inaccessible. - # Always chown -R when UID was remapped; otherwise only if top-level is wrong. - actual_hermes_uid=$(id -u hermes) - needs_chown=false - if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "10000" ]; then - needs_chown=true - elif [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then - needs_chown=true - fi - if [ "$needs_chown" = true ]; then - echo "Fixing ownership of $HERMES_HOME to hermes ($actual_hermes_uid)" - # In rootless Podman the container's "root" is mapped to an unprivileged - # host UID — chown will fail. That's fine: the volume is already owned - # by the mapped user on the host side. 
- chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \ - echo "Warning: chown failed (rootless container?) — continuing anyway" - # The .venv must also be re-chowned when UID is remapped, otherwise - # lazy_deps.py cannot install platform packages (discord.py, etc.). - chown -R hermes:hermes "$INSTALL_DIR/.venv" 2>/dev/null || \ - echo "Warning: chown .venv failed (rootless container?) — continuing anyway" - fi - - # Ensure config.yaml is readable by the hermes runtime user even if it was - # edited on the host after initial ownership setup. Must run here (as root) - # rather than after the gosu drop, otherwise a non-root caller like - # `docker run -u $(id -u):$(id -g)` hits "Operation not permitted" (#15865). - if [ -f "$HERMES_HOME/config.yaml" ]; then - chown hermes:hermes "$HERMES_HOME/config.yaml" 2>/dev/null || true - chmod 640 "$HERMES_HOME/config.yaml" 2>/dev/null || true - fi - - echo "Dropping root privileges" - exec gosu hermes "$0" "$@" -fi - -# --- Running as hermes from here --- -source "${INSTALL_DIR}/.venv/bin/activate" - -# Stamp install method for detect_install_method() -echo "docker" > "${HERMES_HOME:=/opt/data}/.install_method" 2>/dev/null || true - -# Create essential directory structure. Cache and platform directories -# (cache/images, cache/audio, platforms/whatsapp, etc.) are created on -# demand by the application — don't pre-create them here so new installs -# get the consolidated layout from get_hermes_dir(). -# The "home/" subdirectory is a per-profile HOME for subprocesses (git, -# ssh, gh, npm …). Without it those tools write to /root which is -# ephemeral and shared across profiles. See issue #4426. -mkdir -p "$HERMES_HOME"/{cron,sessions,logs,hooks,memories,skills,skins,plans,workspace,home} - -# .env -if [ ! -f "$HERMES_HOME/.env" ]; then - cp "$INSTALL_DIR/.env.example" "$HERMES_HOME/.env" -fi - -# config.yaml -if [ ! 
-f "$HERMES_HOME/config.yaml" ]; then - cp "$INSTALL_DIR/cli-config.yaml.example" "$HERMES_HOME/config.yaml" -fi - -# SOUL.md -if [ ! -f "$HERMES_HOME/SOUL.md" ]; then - cp "$INSTALL_DIR/docker/SOUL.md" "$HERMES_HOME/SOUL.md" -fi - -# auth.json: bootstrap from env on first boot only. Used by orchestrators -# (e.g. provisioning a Hermes VPS from an account-management service) that -# need to seed the OAuth refresh credential non-interactively, instead of -# walking the user through `hermes setup` + the device-flow login dance. -# Subsequent token rotations write back to the same file, which lives on a -# persistent volume — so this env var is consumed exactly once at first -# boot. The `[ ! -f ... ]` guard is critical: without it, a container -# restart would clobber a rotated refresh token with the now-stale value -# the orchestrator originally seeded. -if [ ! -f "$HERMES_HOME/auth.json" ] && [ -n "$HERMES_AUTH_JSON_BOOTSTRAP" ]; then - printf '%s' "$HERMES_AUTH_JSON_BOOTSTRAP" > "$HERMES_HOME/auth.json" - chmod 600 "$HERMES_HOME/auth.json" -fi - -# Sync bundled skills (manifest-based so user edits are preserved) -if [ -d "$INSTALL_DIR/skills" ]; then - python3 "$INSTALL_DIR/tools/skills_sync.py" -fi - -# Optionally start `hermes dashboard` as a side-process. +#!/bin/sh +# s6-overlay shim. The real logic lives in docker/stage2-hook.sh, invoked +# by /etc/cont-init.d/01-hermes-setup (installed by the Dockerfile). This +# file exists so external references to docker/entrypoint.sh still work, +# but it's no longer the ENTRYPOINT — /init is. # -# Toggled by HERMES_DASHBOARD=1 (also accepts "true"/"yes", case-insensitive). -# Host/port/TUI can be overridden via: -# HERMES_DASHBOARD_HOST (default 0.0.0.0 — exposed outside the container) -# HERMES_DASHBOARD_PORT (default 9119, matches `hermes dashboard` default) -# HERMES_DASHBOARD_TUI (already honored by `hermes dashboard` itself) -# -# The dashboard is a long-lived server. 
We background it *before* the final -# `exec hermes "$@"` so the user's chosen foreground command (chat, gateway, -# sleep infinity, …) remains PID-of-interest for the container runtime. When -# the container stops the whole process tree is torn down, so no explicit -# cleanup is needed. -case "${HERMES_DASHBOARD:-}" in - 1|true|TRUE|True|yes|YES|Yes) - dash_host="${HERMES_DASHBOARD_HOST:-0.0.0.0}" - dash_port="${HERMES_DASHBOARD_PORT:-9119}" - dash_args=(--host "$dash_host" --port "$dash_port" --no-open) - # Binding to anything other than localhost requires --insecure — the - # dashboard refuses otherwise because it exposes API keys. Inside a - # container this is the expected deployment (host reaches it via - # published port), so opt in automatically. - if [ "$dash_host" != "127.0.0.1" ] && [ "$dash_host" != "localhost" ]; then - dash_args+=(--insecure) - fi - echo "Starting hermes dashboard on ${dash_host}:${dash_port} (background)" - # Prefix dashboard output so it's distinguishable from the main - # process in `docker logs`. stdbuf keeps the pipe line-buffered. - ( - stdbuf -oL -eL hermes dashboard "${dash_args[@]}" 2>&1 \ - | sed -u 's/^/[dashboard] /' - ) & - ;; -esac - -# Final exec: two supported invocation patterns. -# -# docker run -> exec `hermes` with no args (legacy default) -# docker run chat -q "..." -> exec `hermes chat -q "..."` (legacy wrap) -# docker run sleep infinity -> exec `sleep infinity` directly -# docker run bash -> exec `bash` directly -# -# If the first positional arg resolves to an executable on PATH, we assume the -# caller wants to run it directly (needed by the launcher which runs long-lived -# `sleep infinity` sandbox containers — see tools/environments/docker.py). -# Otherwise we treat the args as a hermes subcommand and wrap with `hermes`, -# preserving the documented `docker run ` behavior. -if [ $# -gt 0 ] && command -v "$1" >/dev/null 2>&1; then - exec "$@" -fi -exec hermes "$@" +# When called directly (e.g. 
by an old wrapper script that hard-coded +# docker/entrypoint.sh), forward to the stage2 hook for parity with the +# pre-s6 entrypoint behavior. +exec /opt/hermes/docker/stage2-hook.sh "$@" diff --git a/docker/main-wrapper.sh b/docker/main-wrapper.sh new file mode 100755 index 00000000000..8a430ba6b06 --- /dev/null +++ b/docker/main-wrapper.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# /opt/hermes/docker/main-wrapper.sh — wraps the container's CMD with +# the same argument-routing logic the pre-s6 entrypoint.sh used. Runs +# as /init's "main program" (Docker CMD) so it inherits stdin/stdout/ +# stderr from the container. +# +# Routing: +# no args → exec `hermes` (the default) +# first arg is an executable → exec it directly (sleep, bash, sh, …) +# first arg is anything else → exec `hermes ` (subcommand passthrough) +# +# We drop to the hermes user via `s6-setuidgid` — running as that +# user matches the pre-s6 contract (gosu drop). +set -e + +cd /opt/data +# shellcheck disable=SC1091 +. /opt/hermes/.venv/bin/activate + +if [ $# -eq 0 ]; then + exec s6-setuidgid hermes hermes +fi + +if command -v "$1" >/dev/null 2>&1; then + # Bare executable — pass through directly. + exec s6-setuidgid hermes "$@" +fi + +# Hermes subcommand pass-through. +exec s6-setuidgid hermes hermes "$@" diff --git a/docker/s6-rc.d/dashboard/dependencies.d/base b/docker/s6-rc.d/dashboard/dependencies.d/base new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/s6-rc.d/dashboard/run b/docker/s6-rc.d/dashboard/run new file mode 100755 index 00000000000..62ffac37a87 --- /dev/null +++ b/docker/s6-rc.d/dashboard/run @@ -0,0 +1,30 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Dashboard service. Always declared so s6 has a supervised slot; if +# HERMES_DASHBOARD isn't set to a truthy value we sleep forever and do +# nothing. See OQ3-A in the plan. 
+ +case "${HERMES_DASHBOARD:-}" in + 1|true|TRUE|True|yes|YES|Yes) ;; + *) exec sleep infinity ;; +esac + +cd /opt/data +# shellcheck disable=SC1091 +. /opt/hermes/.venv/bin/activate + +dash_host="${HERMES_DASHBOARD_HOST:-0.0.0.0}" +dash_port="${HERMES_DASHBOARD_PORT:-9119}" + +# Binding to anything other than localhost requires --insecure — the +# dashboard refuses otherwise because it exposes API keys. Inside a +# container this is the expected deployment. +insecure="" +case "$dash_host" in + 127.0.0.1|localhost) ;; + *) insecure="--insecure" ;; +esac + +# shellcheck disable=SC2086 # word-splitting of $insecure is intentional +exec s6-setuidgid hermes hermes dashboard \ + --host "$dash_host" --port "$dash_port" --no-open $insecure diff --git a/docker/s6-rc.d/dashboard/type b/docker/s6-rc.d/dashboard/type new file mode 100644 index 00000000000..5883cff0cd1 --- /dev/null +++ b/docker/s6-rc.d/dashboard/type @@ -0,0 +1 @@ +longrun diff --git a/docker/s6-rc.d/main-hermes/dependencies.d/base b/docker/s6-rc.d/main-hermes/dependencies.d/base new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/s6-rc.d/main-hermes/run b/docker/s6-rc.d/main-hermes/run new file mode 100755 index 00000000000..488e5251415 --- /dev/null +++ b/docker/s6-rc.d/main-hermes/run @@ -0,0 +1,27 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Main hermes service. +# +# IMPORTANT — this is NOT how the user's CMD runs. +# +# We chose Architecture B from the plan: the container's CMD (the bare +# command the user passes to `docker run …`) runs as /init's +# "main program" via Docker's CMD mechanism, NOT as an s6-supervised +# service. This is the canonical s6-overlay pattern for "container +# exits when the program exits" semantics, and it lets us preserve +# every pre-s6 invocation contract (chat passthrough, sleep infinity, +# bash, --tui) without re-implementing argument routing through +# /run/s6/container_environment. +# +# So why does this service exist at all? 
Two reasons: +# 1. s6-rc requires at least one user service for the "user" bundle +# to be valid. We can't ship an empty bundle. +# 2. Future work may want to supervise a long-lived hermes process +# (e.g. for gateway-server containers); having the slot already +# wired in keeps that change small. +# +# For now this service is a no-op: it sleeps forever, doing nothing. +# The dashboard runs as a real s6 service alongside it (see +# ../dashboard/run) and per-profile gateways register dynamically via +# /run/service/ at runtime (Phase 4). +exec sleep infinity diff --git a/docker/s6-rc.d/main-hermes/type b/docker/s6-rc.d/main-hermes/type new file mode 100644 index 00000000000..5883cff0cd1 --- /dev/null +++ b/docker/s6-rc.d/main-hermes/type @@ -0,0 +1 @@ +longrun diff --git a/docker/s6-rc.d/user/contents.d/dashboard b/docker/s6-rc.d/user/contents.d/dashboard new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/s6-rc.d/user/contents.d/main-hermes b/docker/s6-rc.d/user/contents.d/main-hermes new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/stage2-hook.sh b/docker/stage2-hook.sh new file mode 100755 index 00000000000..f8c964801ad --- /dev/null +++ b/docker/stage2-hook.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# s6-overlay stage2 hook — runs as root after the supervision tree is +# up but before user services start. Handles UID/GID remap, volume +# chown, config seeding, and skills sync. +# +# Per-service privilege drop happens inside each service's `run` script +# (and in main-wrapper.sh) via s6-setuidgid, not here. +# +# Wired into the image as /etc/cont-init.d/01-hermes-setup by the +# Dockerfile. The shim at docker/entrypoint.sh forwards to this script +# so external references to docker/entrypoint.sh still work. +# +# NB: cont-init.d scripts run with no arguments — the user's CMD args +# are NOT visible here. 
That's fine: we use Architecture B (s6-overlay +# main-program model), so main-wrapper.sh runs the CMD with full +# stdin/stdout/stderr access and handles arg parsing there. + +set -eu + +HERMES_HOME="${HERMES_HOME:-/opt/data}" +INSTALL_DIR="/opt/hermes" + +# --- UID/GID remap --- +if [ -n "${HERMES_UID:-}" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then + echo "[stage2] Changing hermes UID to $HERMES_UID" + usermod -u "$HERMES_UID" hermes +fi +if [ -n "${HERMES_GID:-}" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then + echo "[stage2] Changing hermes GID to $HERMES_GID" + # -o allows non-unique GID (e.g. macOS GID 20 "staff" may already + # exist as "dialout" in the Debian-based container image). + groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true +fi + +# --- Fix ownership of data volume --- +actual_hermes_uid=$(id -u hermes) +needs_chown=false +if [ -n "${HERMES_UID:-}" ] && [ "$HERMES_UID" != "10000" ]; then + needs_chown=true +elif [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then + needs_chown=true +fi +if [ "$needs_chown" = true ]; then + echo "[stage2] Fixing ownership of $HERMES_HOME to hermes ($actual_hermes_uid)" + # In rootless Podman the container's "root" is mapped to an + # unprivileged host UID — chown will fail. That's fine: the volume + # is already owned by the mapped user on the host side. + chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \ + echo "[stage2] Warning: chown failed (rootless container?) — continuing" + # The .venv must also be re-chowned when UID is remapped, otherwise + # lazy_deps.py cannot install platform packages (discord.py, etc.). + chown -R hermes:hermes "$INSTALL_DIR/.venv" 2>/dev/null || \ + echo "[stage2] Warning: chown .venv failed (rootless container?) — continuing" +fi + +# --- config.yaml permissions --- +# Ensure config.yaml is readable by the hermes runtime user even if it +# was edited on the host after initial ownership setup. 
+if [ -f "$HERMES_HOME/config.yaml" ]; then + chown hermes:hermes "$HERMES_HOME/config.yaml" 2>/dev/null || true + chmod 640 "$HERMES_HOME/config.yaml" 2>/dev/null || true +fi + +# --- Seed directory structure as hermes user --- +# Run as hermes via s6-setuidgid so dirs end up owned correctly (matters +# under rootless Podman where chown back to root would fail). +s6-setuidgid hermes sh -c "mkdir -p \"$HERMES_HOME\"/cron \ + \"$HERMES_HOME\"/sessions \"$HERMES_HOME\"/logs \"$HERMES_HOME\"/hooks \ + \"$HERMES_HOME\"/memories \"$HERMES_HOME\"/skills \"$HERMES_HOME\"/skins \ + \"$HERMES_HOME\"/plans \"$HERMES_HOME\"/workspace \"$HERMES_HOME\"/home" + +# --- Install-method stamp (read by detect_install_method() in hermes status) --- +# Preserved from the tini-era entrypoint (PR #27843). Must be written as +# the hermes user so ownership matches the file's documented owner. +s6-setuidgid hermes sh -c "echo docker > \"$HERMES_HOME/.install_method\"" 2>/dev/null || true + +# --- Seed config files (only on first boot) --- +seed_one() { + dest=$1 + src=$2 + if [ ! -f "$HERMES_HOME/$dest" ] && [ -f "$INSTALL_DIR/$src" ]; then + s6-setuidgid hermes cp "$INSTALL_DIR/$src" "$HERMES_HOME/$dest" + fi +} +seed_one ".env" ".env.example" +seed_one "config.yaml" "cli-config.yaml.example" +seed_one "SOUL.md" "docker/SOUL.md" + +# auth.json: bootstrap from env on first boot only. Same semantics as the +# pre-s6 entrypoint — the [ ! -f ] guard is critical to avoid clobbering +# rotated refresh tokens on container restart. +if [ ! -f "$HERMES_HOME/auth.json" ] && [ -n "${HERMES_AUTH_JSON_BOOTSTRAP:-}" ]; then + printf '%s' "$HERMES_AUTH_JSON_BOOTSTRAP" > "$HERMES_HOME/auth.json" + chown hermes:hermes "$HERMES_HOME/auth.json" 2>/dev/null || true + chmod 600 "$HERMES_HOME/auth.json" +fi + +# --- Sync bundled skills --- +if [ -d "$INSTALL_DIR/skills" ]; then + s6-setuidgid hermes sh -c \ + ". 
$INSTALL_DIR/.venv/bin/activate && python3 $INSTALL_DIR/tools/skills_sync.py" \ + || echo "[stage2] Warning: skills_sync.py failed; continuing" +fi + +echo "[stage2] Setup complete; starting user services" diff --git a/tests/docker/test_dashboard.py b/tests/docker/test_dashboard.py index d68c81b2525..8f965d5bf05 100644 --- a/tests/docker/test_dashboard.py +++ b/tests/docker/test_dashboard.py @@ -92,3 +92,67 @@ def test_dashboard_port_override( deadline_s=60.0, ) assert ok, f"Dashboard not listening on port 9120: stdout={stdout!r}" + + +def test_dashboard_restarts_after_crash( + built_image: str, container_name: str, +) -> None: + """Phase 2 invariant: under s6 supervision, killing the dashboard + process should be recovered automatically. + + Pre-s6 (tini) behavior was "stays dead" — the test wouldn't have + passed against that image. After the s6-overlay migration the + dashboard runs as a longrun s6-rc service and s6-supervise restarts + it after a ~1s backoff (the default). + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_DASHBOARD=1", built_image, "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + # Wait for the first dashboard to come up. + ok, _ = _poll( + container_name, "pgrep -f 'hermes dashboard'", deadline_s=30.0, + ) + assert ok, "Dashboard never started initially" + + # Grab the initial PID. s6 may briefly transition through restart + # state between our poll-success and the follow-up pgrep, so retry + # a couple of times before giving up. + first_pid: str | None = None + for _attempt in range(10): + first_pid_result = subprocess.run( + ["docker", "exec", container_name, + "pgrep", "-f", "hermes dashboard"], + capture_output=True, text=True, timeout=10, + ) + first_pids = first_pid_result.stdout.strip().split() + if first_pids: + first_pid = first_pids[0] + break + time.sleep(0.5) + assert first_pid is not None, "Could not capture initial dashboard PID" + + # Kill the dashboard. 
+ subprocess.run( + ["docker", "exec", container_name, "kill", "-9", first_pid], + capture_output=True, timeout=10, + ) + + # s6 backs off ~1s before restart; allow up to 15s for the new + # process to appear with a different PID. + deadline = time.monotonic() + 15.0 + while time.monotonic() < deadline: + r = subprocess.run( + ["docker", "exec", container_name, + "pgrep", "-f", "hermes dashboard"], + capture_output=True, text=True, timeout=10, + ) + pids = r.stdout.strip().split() if r.returncode == 0 else [] + if pids and pids[0] != first_pid: + return # success + time.sleep(0.5) + + raise AssertionError( + f"Dashboard not restarted after kill (first_pid={first_pid})" + )