diff --git a/.github/actions/hermes-smoke-test/action.yml b/.github/actions/hermes-smoke-test/action.yml index 08b9f93634d..8b79c4bf34d 100644 --- a/.github/actions/hermes-smoke-test/action.yml +++ b/.github/actions/hermes-smoke-test/action.yml @@ -29,9 +29,13 @@ runs: - name: hermes --help shell: bash run: | + # Use the image's real ENTRYPOINT (/init + main-wrapper.sh) so + # this exercises the actual production startup path. PR #30136 + # review caught that an --entrypoint override here had been + # silently neutered by the s6-overlay migration — stage2-hook + # ignores its CMD args, so the smoke test was a no-op. docker run --rm \ -v /tmp/hermes-test:/opt/data \ - --entrypoint /opt/hermes/docker/entrypoint.sh \ "${{ inputs.image }}" --help - name: hermes dashboard --help @@ -43,5 +47,4 @@ runs: # installed package. docker run --rm \ -v /tmp/hermes-test:/opt/data \ - --entrypoint /opt/hermes/docker/entrypoint.sh \ "${{ inputs.image }}" dashboard --help diff --git a/.github/workflows/docker-lint.yml b/.github/workflows/docker-lint.yml new file mode 100644 index 00000000000..f1673813e99 --- /dev/null +++ b/.github/workflows/docker-lint.yml @@ -0,0 +1,68 @@ +name: Docker / shell lint + +# Lints the container build inputs: Dockerfile (via hadolint) and any shell +# scripts under docker/ (via shellcheck). These catch the class of regression +# the behavioral docker-publish smoke test can't — unquoted variable +# expansions, silently-failing RUN commands, etc. +# +# Rules and ignores are documented in .hadolint.yaml at the repo root. +# shellcheck severity is pinned to `error` so SC1091-style "can't follow +# sourced script" info-level warnings don't fail the job — the .venv +# activate script doesn't exist at lint time. + +on: + push: + branches: [main] + paths: + - Dockerfile + - docker/** + - .hadolint.yaml + - .github/workflows/docker-lint.yml + pull_request: + branches: [main] + paths: + - Dockerfile + - docker/** + - .hadolint.yaml + - .github/workflows/docker-lint.yml + +permissions: + contents: read + +concurrency: + group: docker-lint-${{ github.ref }} + cancel-in-progress: true + +jobs: + hadolint: + name: Lint Dockerfile (hadolint) + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: hadolint + uses: hadolint/hadolint-action@54c9adbab1582c2ef04b2016b760714a4bfde3cf # v3.1.0 + with: + dockerfile: Dockerfile + config: .hadolint.yaml + failure-threshold: warning + + shellcheck: + name: Lint docker/ shell scripts (shellcheck) + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: shellcheck + uses: ludeeus/action-shellcheck@00cae500b08a931fb5698e11e79bfbd38e612a38 # v2.0.0 + env: + # Severity = error: SC1091 (can't follow sourced script) is info- + # level and would otherwise fail when the venv activate script + # doesn't exist at lint time. + SHELLCHECK_OPTS: --severity=error + with: + scandir: ./docker diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index e65965869d7..c0e69bcf3d1 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -80,6 +80,56 @@ jobs: with: image: ${{ env.IMAGE_NAME }}:test + # --------------------------------------------------------------------- + # Run the docker-integration test suite against the freshly-built + # image already loaded into the local daemon (`:test`). These tests + # are excluded from the sharded `tests.yml :: test` matrix on purpose + # (see `_SKIP_PARTS` in scripts/run_tests_parallel.py) because each + # shard would otherwise reach the session-scoped ``built_image`` + # fixture in ``tests/docker/conftest.py`` and start a 3-7min + # ``docker build`` under a 180s pytest-timeout cap — guaranteed to + # die in fixture setup. + # + # Piggybacking here avoids a second image build: the smoke test + # already proved the image loads + runs, so the daemon has it under + # `${IMAGE_NAME}:test` and we just point ``HERMES_TEST_IMAGE`` at + # that. The fixture's ``HERMES_TEST_IMAGE`` branch (see + # tests/docker/conftest.py:62-63) short-circuits the rebuild. + # + # Why this job and not a standalone one: the image is 5GB+; passing + # it between jobs via ``docker save``/``upload-artifact`` is slower + # than the build itself. Reusing the existing daemon state is the + # cheapest path to coverage on every PR that touches docker code. + # --------------------------------------------------------------------- + - name: Install uv (for docker tests) + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5 + + - name: Set up Python 3.11 (for docker tests) + run: uv python install 3.11 + + - name: Install Python dependencies (for docker tests) + run: | + uv venv .venv --python 3.11 + source .venv/bin/activate + # ``dev`` extra pulls in pytest, pytest-asyncio, pytest-timeout — + # everything tests/docker/ needs. We deliberately avoid ``all`` + # here because the docker tests only drive the container via + # subprocess and don't import hermes_agent's optional deps. + uv pip install -e ".[dev]" + + - name: Run docker integration tests + env: + # Skip rebuild; use the image already loaded by the build step. + HERMES_TEST_IMAGE: ${{ env.IMAGE_NAME }}:test + # Match the policy in tests.yml :: test job — no accidental + # real-API calls from inside the harness. + OPENROUTER_API_KEY: "" + OPENAI_API_KEY: "" + NOUS_API_KEY: "" + run: | + source .venv/bin/activate + python -m pytest tests/docker/ -v --tb=short + - name: Log in to Docker Hub if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release' uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 diff --git a/.hadolint.yaml b/.hadolint.yaml new file mode 100644 index 00000000000..81e80c14b61 --- /dev/null +++ b/.hadolint.yaml @@ -0,0 +1,36 @@ +# hadolint configuration for the Hermes Agent Dockerfile. +# See https://github.com/hadolint/hadolint#configure for rules. +# +# We want hadolint to surface NEW Dockerfile lint regressions, but we +# don't want to rewrite the existing image to silence rules that are +# either intentional or pragmatic tradeoffs for this project. Each +# ignore below has a one-line justification. +failure-threshold: warning + +ignored: + # Pin versions in apt get install. We intentionally don't pin common + # tools (curl, git, openssh-client, etc.) — security updates flow in + # via the periodic base-image rebuild, and pinning would lock us to + # superseded patch releases. Same rationale as nearly every distro- + # base official image (python, node, debian). + - DL3008 + # Use WORKDIR to switch to a directory. The image uses `(cd web && …)` + # / `(cd ../ui-tui && …)` inline subshells for one-off build steps + # because they don't affect later RUN commands; promoting them to + # full WORKDIR switches with restores would obscure intent. + - DL3003 + # Multiple consecutive RUN instructions. The `touch README.md` + `uv + # sync` split is intentional — `touch` is cheap, `uv sync` is the + # expensive layer-cached step we want isolated, and merging them + # would invalidate the cache for trivial changes. + - DL3059 + # Last USER should not be root. /init (s6-overlay) runs as root so the + # stage2 hook can usermod/groupmod and chown the data volume per + # HERMES_UID at runtime; each supervised service then drops to the + # hermes user via `s6-setuidgid`. + - DL3002 + +# Require explicit base-image pins (SHA256) — we already do this. +trustedRegistries: + - docker.io + - ghcr.io diff --git a/Dockerfile b/Dockerfile index 6e8f0209636..be4e8848bb5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,4 @@ FROM ghcr.io/astral-sh/uv:0.11.6-python3.13-trixie@sha256:b3c543b6c4f23a5f2df22866bd7857e5d304b67a564f4feab6ac22044dde719b AS uv_source -FROM tianon/gosu:1.19-trixie@sha256:3b176695959c71e123eb390d427efc665eeb561b1540e82679c15e992006b8b9 AS gosu_source FROM debian:13.4 # Disable Python stdout buffering to ensure logs are printed immediately @@ -9,18 +8,68 @@ ENV PYTHONUNBUFFERED=1 # install survives the /opt/data volume overlay at runtime. ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright -# Install system dependencies in one layer, clear APT cache -# tini reaps orphaned zombie processes (MCP stdio subprocesses, git, bun, etc.) -# that would otherwise accumulate when hermes runs as PID 1. See #15012. +# Install system dependencies in one layer, clear APT cache. +# tini was previously PID 1 to reap orphaned zombie processes (MCP stdio +# subprocesses, git, bun, etc.) that would otherwise accumulate when hermes +# ran as PID 1. See #15012. Phase 2 of the s6-overlay supervision plan +# replaces tini with s6-overlay's /init (PID 1 = s6-svscan), which reaps +# zombies non-blockingly on SIGCHLD and additionally supervises the main +# hermes process, the dashboard, and per-profile gateways. RUN apt-get update && \ apt-get install -y --no-install-recommends \ - build-essential curl nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli tini && \ + build-essential curl nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli xz-utils && \ rm -rf /var/lib/apt/lists/* +# ---------- s6-overlay install ---------- +# s6-overlay provides supervision for the main hermes process, the dashboard, +# and per-profile gateways. /init becomes PID 1 below — see ENTRYPOINT. +# +# Multi-arch: BuildKit auto-populates TARGETARCH (amd64 / arm64). s6-overlay +# uses tarball names keyed on the kernel arch string (x86_64 / aarch64), so +# we map between them inline. The noarch + symlinks tarballs are +# architecture-independent and reused as-is. +# +# We use `curl` instead of `ADD` for the per-arch tarball because `ADD` +# evaluates its URL at parse time, before any ARG / TARGETARCH substitution +# — splitting one URL per arch into two ADDs would download both on every +# build and leave dead bytes in the cache. A single curl + arch-keyed URL +# is simpler and cache-friendlier. +# +# Supply-chain integrity: every tarball is checksum-verified against the +# upstream-published SHA256. To bump S6_OVERLAY_VERSION, fetch the four +# `.sha256` files from the corresponding release and update the ARGs. The +# checksum lookup happens during build, so a compromised release artifact +# fails the build loudly instead of silently producing a tampered image. +ARG TARGETARCH +ARG S6_OVERLAY_VERSION=3.2.3.0 +ARG S6_OVERLAY_NOARCH_SHA256=b720f9d9340efc8bb07528b9743813c836e4b02f8693d90241f047998b4c53cf +ARG S6_OVERLAY_X86_64_SHA256=a93f02882c6ed46b21e7adb5c0add86154f01236c93cd82c7d682722e8840563 +ARG S6_OVERLAY_AARCH64_SHA256=0952056ff913482163cc30e35b2e944b507ba1025d78f5becbb89367bf344581 +ARG S6_OVERLAY_SYMLINKS_SHA256=a60dc5235de3ecbcf874b9c1f18d73263ab99b289b9329aa950e8729c4789f0e +ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz /tmp/ +ADD https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-symlinks-noarch.tar.xz /tmp/ +RUN set -eu; \ + case "${TARGETARCH:-amd64}" in \ + amd64) s6_arch="x86_64"; s6_arch_sha="${S6_OVERLAY_X86_64_SHA256}" ;; \ + arm64) s6_arch="aarch64"; s6_arch_sha="${S6_OVERLAY_AARCH64_SHA256}" ;; \ + *) echo "Unsupported TARGETARCH=${TARGETARCH} for s6-overlay" >&2; exit 1 ;; \ + esac; \ + curl -fsSL --retry 3 -o /tmp/s6-overlay-arch.tar.xz \ + "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-${s6_arch}.tar.xz"; \ + { \ + printf '%s %s\n' "${S6_OVERLAY_NOARCH_SHA256}" /tmp/s6-overlay-noarch.tar.xz; \ + printf '%s %s\n' "${s6_arch_sha}" /tmp/s6-overlay-arch.tar.xz; \ + printf '%s %s\n' "${S6_OVERLAY_SYMLINKS_SHA256}" /tmp/s6-overlay-symlinks-noarch.tar.xz; \ + } > /tmp/s6-overlay.sha256; \ + sha256sum -c /tmp/s6-overlay.sha256; \ + tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz; \ + tar -C / -Jxpf /tmp/s6-overlay-arch.tar.xz; \ + tar -C / -Jxpf /tmp/s6-overlay-symlinks-noarch.tar.xz; \ + rm /tmp/s6-overlay-*.tar.xz /tmp/s6-overlay.sha256 + # Non-root user for runtime; UID can be overridden via HERMES_UID at runtime RUN useradd -u 10000 -m -d /opt/data hermes -COPY --chmod=0755 --from=gosu_source /gosu /usr/local/bin/ COPY --chmod=0755 --from=uv_source /usr/local/bin/uv /usr/local/bin/uvx /usr/local/bin/ WORKDIR /opt/hermes @@ -103,18 +152,73 @@ RUN cd web && npm run build && \ USER root RUN chmod -R a+rX /opt/hermes && \ chown -R hermes:hermes /opt/hermes/.venv /opt/hermes/ui-tui /opt/hermes/node_modules -# Start as root so the entrypoint can usermod/groupmod + gosu. -# If HERMES_UID is unset, the entrypoint drops to the default hermes user (10000). +# Start as root so the s6-overlay stage2 hook can usermod/groupmod and chown +# the data volume. Each supervised service then drops to the hermes user via +# `s6-setuidgid hermes` in its run script. If HERMES_UID is unset, services +# run as the default hermes user (UID 10000). # ---------- Link hermes-agent itself (editable) ---------- # Deps are already installed in the cached layer above; `--no-deps` makes # this a fast (~1s) egg-link creation with no resolution or downloads. RUN uv pip install --no-cache-dir --no-deps -e "." +# ---------- s6-overlay service wiring ---------- +# Static services declared at build time: main-hermes + dashboard. +# Per-profile gateway services are registered dynamically at runtime by +# the profile create/delete hooks (Phase 4); they live under +# /run/service/ (tmpfs) and are reconciled on container restart by +# /etc/cont-init.d/02-reconcile-profiles (Phase 4 Task 4.0). +COPY docker/s6-rc.d/ /etc/s6-overlay/s6-rc.d/ + +# stage2-hook handles UID/GID remap, volume chown, config seeding, +# skills sync — all the work the old entrypoint.sh did before +# `exec hermes`. Wired in as cont-init.d/01- so it +# runs before user services start. +# +# 02-reconcile-profiles re-creates per-profile gateway s6 service +# slots from $HERMES_HOME/profiles// after a container restart +# (the /run/service/ scandir is tmpfs and wiped on restart). Phase 4. +RUN mkdir -p /etc/cont-init.d && \ + printf '#!/bin/sh\nexec /opt/hermes/docker/stage2-hook.sh\n' \ + > /etc/cont-init.d/01-hermes-setup && \ + chmod +x /etc/cont-init.d/01-hermes-setup +COPY --chmod=0755 docker/cont-init.d/015-supervise-perms /etc/cont-init.d/015-supervise-perms +COPY --chmod=0755 docker/cont-init.d/02-reconcile-profiles /etc/cont-init.d/02-reconcile-profiles + # ---------- Runtime ---------- ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist ENV HERMES_HOME=/opt/data -ENV PATH="/opt/data/.local/bin:${PATH}" +# Pre-s6 entrypoint.sh did `source .venv/bin/activate` which exported +# the venv bin onto PATH; Architecture B's main-wrapper.sh does the +# same for the container's main process, but `docker exec` and our +# cont-init.d scripts don't pass through the wrapper. Expose the venv +# bin globally so `docker exec hermes ...` and any +# subprocess that doesn't activate the venv first still find hermes. +ENV PATH="/opt/hermes/.venv/bin:/opt/data/.local/bin:${PATH}" RUN mkdir -p /opt/data VOLUME [ "/opt/data" ] -ENTRYPOINT [ "/usr/bin/tini", "-g", "--", "/opt/hermes/docker/entrypoint.sh" ] + +# s6-overlay's /init is PID 1. It sets up the supervision tree, runs +# /etc/cont-init.d/* (our stage2 hook), starts s6-rc services +# declared in /etc/s6-overlay/s6-rc.d/, then exec's its remaining +# argv as the container's "main program" with stdin/stdout/stderr +# inherited (this is what makes interactive --tui work). When the +# main program exits, /init begins stage 3 shutdown and the container +# exits with the program's exit code. Replaces tini — see Phase 2 of +# docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md. +# +# We use the ENTRYPOINT+CMD split rather than CMD alone so the +# wrapper is prepended to user-supplied args automatically: +# +# docker run → /init main-wrapper.sh (CMD default) +# docker run chat -q "hi" → /init main-wrapper.sh chat -q hi +# docker run sleep infinity → /init main-wrapper.sh sleep infinity +# docker run --tui → /init main-wrapper.sh --tui +# +# main-wrapper.sh handles arg routing (bare-exec vs. hermes +# subcommand vs. no-args), drops to the hermes user via s6-setuidgid, +# and exec's the final program so its exit code becomes the container +# exit code. Without the wrapper-as-ENTRYPOINT, leading-dash args +# like `--version` would be intercepted by /init's POSIX shell. +ENTRYPOINT [ "/init", "/opt/hermes/docker/main-wrapper.sh" ] +CMD [ ] diff --git a/docker-compose.yml b/docker-compose.yml index 8bdc96b7a97..513cb8e18e8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,17 +6,22 @@ # # Set HERMES_UID / HERMES_GID to the host user that owns ~/.hermes so # files created inside the container stay readable/writable on the host. -# The entrypoint remaps the internal `hermes` user to these values via -# usermod/groupmod + gosu. +# The s6-overlay stage2 hook remaps the internal `hermes` user to these +# values via usermod/groupmod; each supervised service then drops to that +# user via `s6-setuidgid`. # # Security notes: # - The dashboard service binds to 127.0.0.1 by default. It stores API # keys; exposing it on LAN without auth is unsafe. If you want remote # access, use an SSH tunnel or put it behind a reverse proxy that # adds authentication — do NOT pass --insecure --host 0.0.0.0. -# - If you override entrypoint, keep /opt/hermes/docker/entrypoint.sh in -# the command chain. It drops root to the hermes user before gateway -# files such as gateway.lock are created. +# - If you override entrypoint, keep `/init` as the first command in +# the chain (or let docker use the image's default ENTRYPOINT, +# which is `["/init", "/opt/hermes/docker/main-wrapper.sh"]`). +# `/init` is s6-overlay's PID 1 — it runs the cont-init.d scripts +# (chown, profile reconcile, dashboard toggle) and sets up the +# supervision tree before any service starts. Bypassing it skips +# all of that setup and the gateway will not work correctly. # - The gateway's API server is off unless you uncomment API_SERVER_KEY # and API_SERVER_HOST. See docs/user-guide/api-server.md before doing # this on an internet-facing host. diff --git a/docker/cont-init.d/015-supervise-perms b/docker/cont-init.d/015-supervise-perms new file mode 100644 index 00000000000..8d7b473d29c --- /dev/null +++ b/docker/cont-init.d/015-supervise-perms @@ -0,0 +1,90 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Make supervise/ trees for ALL declared s6 services queryable and +# controllable by the unprivileged hermes user (UID 10000). +# +# Background (PR #30136 review item I4): the entire s6 lifecycle +# (s6-svc, s6-svstat, s6-svwait) is dispatched as the hermes user +# inside the container (every Hermes runtime path runs under +# ``s6-setuidgid hermes``). But s6-supervise creates each service's +# ``supervise/`` and top-level ``event/`` directory with mode 0700 +# owned by its effective UID — which is root, because s6-supervise +# is spawned by s6-svscan running as PID 1. So unprivileged clients +# get EACCES on every probe / control call against the slot. +# +# Two fixes, one in each registration path: +# +# 1. For RUNTIME-registered profile gateways (created via the s6 +# runtime register hooks in profiles.py): the Python helper +# ``_seed_supervise_skeleton`` pre-creates supervise/ + event/ + +# supervise/control owned by hermes BEFORE s6-svscanctl -a fires. +# s6-supervise's mkdir/mkfifo are EEXIST-safe, so it inherits our +# ownership and never tries to chown back to root. +# +# 2. For STATIC s6-rc services (dashboard, main-hermes) declared at +# image-build time under /etc/s6-overlay/s6-rc.d/*: these are +# compiled by s6-rc at boot, and s6-supervise spawns BEFORE +# cont-init.d gets to run — so by the time we're here, the +# supervise/ tree is already there as root:root 0700. We chown +# it here. s6-supervise will keep using the same files; it never +# re-asserts ownership on a running service. +# +# This script runs as root after 01-hermes-setup but before +# 02-reconcile-profiles, so the chowns are settled before the +# Python reconciler walks the scandir. Lexicographic ordering +# guarantees this — the suffix is unusual because we want to slot +# in between 01 and the existing 02-reconcile-profiles without +# renumbering both (which would be a churn-noise patch on its own). + +set -eu + +# /run/s6-rc/servicedirs holds the live, compiled service directories +# for every static (s6-rc) service. Symlinks under /run/service/* +# point here. Per-service supervise/ + event/ both need hermes +# ownership for s6-svstat etc. to work as hermes. +SVC_ROOT=/run/s6-rc/servicedirs + +if [ ! -d "$SVC_ROOT" ]; then + echo "[supervise-perms] $SVC_ROOT not present; skipping" + exit 0 +fi + +for svc in "$SVC_ROOT"/*; do + [ -d "$svc" ] || continue + name=$(basename "$svc") + + # Skip s6-overlay-internal services (they need to stay root-only; + # the s6rc-* helpers manage the supervision tree itself). + case "$name" in + s6rc-*|s6-linux-*) + continue + ;; + esac + + # supervise/ tree — needed by s6-svstat / s6-svc. + if [ -d "$svc/supervise" ]; then + chown -R hermes:hermes "$svc/supervise" 2>/dev/null || \ + echo "[supervise-perms] could not chown $svc/supervise" + # 0710 = group searchable. ``s6-svstat`` only needs to openat + # status, not list the dir, but giving the hermes group +x is + # the minimum that lets group members access the contents. + chmod 0710 "$svc/supervise" 2>/dev/null || true + # supervise/control is a FIFO that s6-svc writes commands + # into; the hermes user needs +w. Owner is already hermes + # after the recursive chown above; widen perms to 0660 so + # ``s6-svc`` works for any member of the hermes group too. + if [ -p "$svc/supervise/control" ]; then + chmod 0660 "$svc/supervise/control" 2>/dev/null || true + fi + fi + + # Top-level event/ dir — s6-svlisten1 / s6-svwait subscribe here. + if [ -d "$svc/event" ]; then + chown hermes:hermes "$svc/event" 2>/dev/null || \ + echo "[supervise-perms] could not chown $svc/event" + # Preserve s6's 03730 mode (setgid + g+rwx + sticky). + chmod 03730 "$svc/event" 2>/dev/null || true + fi +done + +echo "[supervise-perms] chowned supervise/ trees for static s6-rc services" diff --git a/docker/cont-init.d/02-reconcile-profiles b/docker/cont-init.d/02-reconcile-profiles new file mode 100755 index 00000000000..98b1f59ee89 --- /dev/null +++ b/docker/cont-init.d/02-reconcile-profiles @@ -0,0 +1,46 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Container-boot reconciliation of per-profile gateway s6 services. +# +# Runs as root after 01-hermes-setup (the stage2 hook) has chowned +# the volume and seeded $HERMES_HOME, but before s6-rc starts user +# services. /etc/cont-init.d/* scripts run in lexicographic order, +# so the `02-` prefix guarantees ordering. +# +# Service directories under /run/service/ live on tmpfs and are +# wiped on every container restart. Profile directories under +# $HERMES_HOME/profiles/ live on the persistent VOLUME. This script +# walks the persistent profiles, recreates the s6 service slots, +# and auto-starts only those whose last recorded state was +# `running` — see hermes_cli/container_boot.py. +# +# Phase 4 also needs hermes-user writes to /run/service/ (so the +# profile create/delete hooks can register/unregister at runtime), +# so we chown the scandir before invoking the reconciler. We +# additionally chown the s6-svscan control FIFO so the hermes user +# can send rescan signals via ``s6-svscanctl -a``; without this the +# entire runtime-registration path is inert under UID 10000 (the +# Python wrapper catches the resulting EACCES, prints a warning, +# and swallows the failure). +set -e + +# Make the dynamic scandir hermes-writable. The directory itself +# starts root-owned by s6-overlay. +chown hermes:hermes /run/service 2>/dev/null || true + +# Make the svscan control FIFO hermes-writable so s6-svscanctl -a +# / -an work for the hermes user. The FIFO is created by s6-svscan +# at PID-1 startup, so by the time this cont-init.d script runs it +# already exists. Both ``control`` and ``lock`` need to be writable +# for the various svscanctl operations; the directory itself stays +# root-owned (we only need to touch the two FIFOs/locks inside). +if [ -d /run/service/.s6-svscan ]; then + for entry in control lock; do + if [ -e "/run/service/.s6-svscan/$entry" ]; then + chown hermes:hermes "/run/service/.s6-svscan/$entry" 2>/dev/null || true + fi + done +fi + +exec s6-setuidgid hermes /opt/hermes/.venv/bin/python -m hermes_cli.container_boot + diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 45a8e5f4d27..9e735fe561b 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -1,153 +1,27 @@ -#!/bin/bash -# Docker/Podman entrypoint: bootstrap config files into the mounted volume, then run hermes. -set -e - -HERMES_HOME="${HERMES_HOME:-/opt/data}" -INSTALL_DIR="/opt/hermes" - -# --- Privilege dropping via gosu --- -# When started as root (the default for Docker, or fakeroot in rootless Podman), -# optionally remap the hermes user/group to match host-side ownership, fix volume -# permissions, then re-exec as hermes. -if [ "$(id -u)" = "0" ]; then - if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then - echo "Changing hermes UID to $HERMES_UID" - usermod -u "$HERMES_UID" hermes - fi - - if [ -n "$HERMES_GID" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then - echo "Changing hermes GID to $HERMES_GID" - # -o allows non-unique GID (e.g. macOS GID 20 "staff" may already exist - # as "dialout" in the Debian-based container image) - groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true - fi - - # Fix ownership of the data volume. When HERMES_UID remaps the hermes user, - # files created by previous runs (under the old UID) become inaccessible. - # Always chown -R when UID was remapped; otherwise only if top-level is wrong. - actual_hermes_uid=$(id -u hermes) - needs_chown=false - if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "10000" ]; then - needs_chown=true - elif [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then - needs_chown=true - fi - if [ "$needs_chown" = true ]; then - echo "Fixing ownership of $HERMES_HOME to hermes ($actual_hermes_uid)" - # In rootless Podman the container's "root" is mapped to an unprivileged - # host UID — chown will fail. That's fine: the volume is already owned - # by the mapped user on the host side. - chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \ - echo "Warning: chown failed (rootless container?) — continuing anyway" - # The .venv must also be re-chowned when UID is remapped, otherwise - # lazy_deps.py cannot install platform packages (discord.py, etc.). - chown -R hermes:hermes "$INSTALL_DIR/.venv" 2>/dev/null || \ - echo "Warning: chown .venv failed (rootless container?) — continuing anyway" - fi - - # Ensure config.yaml is readable by the hermes runtime user even if it was - # edited on the host after initial ownership setup. Must run here (as root) - # rather than after the gosu drop, otherwise a non-root caller like - # `docker run -u $(id -u):$(id -g)` hits "Operation not permitted" (#15865). - if [ -f "$HERMES_HOME/config.yaml" ]; then - chown hermes:hermes "$HERMES_HOME/config.yaml" 2>/dev/null || true - chmod 640 "$HERMES_HOME/config.yaml" 2>/dev/null || true - fi - - echo "Dropping root privileges" - exec gosu hermes "$0" "$@" -fi - -# --- Running as hermes from here --- -source "${INSTALL_DIR}/.venv/bin/activate" - -# Stamp install method for detect_install_method() -echo "docker" > "${HERMES_HOME:=/opt/data}/.install_method" 2>/dev/null || true - -# Create essential directory structure. Cache and platform directories -# (cache/images, cache/audio, platforms/whatsapp, etc.) are created on -# demand by the application — don't pre-create them here so new installs -# get the consolidated layout from get_hermes_dir(). -# The "home/" subdirectory is a per-profile HOME for subprocesses (git, -# ssh, gh, npm …). Without it those tools write to /root which is -# ephemeral and shared across profiles. See issue #4426. -mkdir -p "$HERMES_HOME"/{cron,sessions,logs,hooks,memories,skills,skins,plans,workspace,home} - -# .env -if [ ! -f "$HERMES_HOME/.env" ]; then - cp "$INSTALL_DIR/.env.example" "$HERMES_HOME/.env" -fi - -# config.yaml -if [ ! -f "$HERMES_HOME/config.yaml" ]; then - cp "$INSTALL_DIR/cli-config.yaml.example" "$HERMES_HOME/config.yaml" -fi - -# SOUL.md -if [ ! -f "$HERMES_HOME/SOUL.md" ]; then - cp "$INSTALL_DIR/docker/SOUL.md" "$HERMES_HOME/SOUL.md" -fi - -# auth.json: bootstrap from env on first boot only. Used by orchestrators -# (e.g. provisioning a Hermes VPS from an account-management service) that -# need to seed the OAuth refresh credential non-interactively, instead of -# walking the user through `hermes setup` + the device-flow login dance. -# Subsequent token rotations write back to the same file, which lives on a -# persistent volume — so this env var is consumed exactly once at first -# boot. The `[ ! -f ... ]` guard is critical: without it, a container -# restart would clobber a rotated refresh token with the now-stale value -# the orchestrator originally seeded. -if [ ! -f "$HERMES_HOME/auth.json" ] && [ -n "$HERMES_AUTH_JSON_BOOTSTRAP" ]; then - printf '%s' "$HERMES_AUTH_JSON_BOOTSTRAP" > "$HERMES_HOME/auth.json" - chmod 600 "$HERMES_HOME/auth.json" -fi - -# Sync bundled skills (manifest-based so user edits are preserved) -if [ -d "$INSTALL_DIR/skills" ]; then - python3 "$INSTALL_DIR/tools/skills_sync.py" -fi - -# Optionally start `hermes dashboard` as a side-process. +#!/bin/sh +# s6-overlay shim. The real logic lives in docker/stage2-hook.sh, invoked +# by /etc/cont-init.d/01-hermes-setup (installed by the Dockerfile). This +# file exists so external references to docker/entrypoint.sh still work, +# but it's no longer the ENTRYPOINT — /init is. # -# Toggled by HERMES_DASHBOARD=1 (also accepts "true"/"yes", case-insensitive). -# Host/port/TUI can be overridden via: -# HERMES_DASHBOARD_HOST (default 127.0.0.1 — loopback only) -# HERMES_DASHBOARD_PORT (default 9119, matches `hermes dashboard` default) -# HERMES_DASHBOARD_TUI (already honored by `hermes dashboard` itself) +# When called directly (e.g. by an old wrapper script that hard-coded +# docker/entrypoint.sh as the container ENTRYPOINT, or by an external +# orchestration script that invokes it inside the container), forward to +# the stage2 hook for parity with the pre-s6 entrypoint behavior. The +# stage2 hook only handles cont-init bootstrap (UID remap, chown, config +# seed, skills sync); it does NOT exec the CMD. Callers that depended +# on the pre-s6 contract "entrypoint.sh sets up state then execs hermes" +# will see the bootstrap happen but the CMD will not run from this shim. # -# The dashboard is a long-lived server. We background it *before* the final -# `exec hermes "$@"` so the user's chosen foreground command (chat, gateway, -# sleep infinity, …) remains PID-of-interest for the container runtime. When -# the container stops the whole process tree is torn down, so no explicit -# cleanup is needed. -case "${HERMES_DASHBOARD:-}" in - 1|true|TRUE|True|yes|YES|Yes) - dash_host="${HERMES_DASHBOARD_HOST:-127.0.0.1}" - dash_port="${HERMES_DASHBOARD_PORT:-9119}" - dash_args=(--host "$dash_host" --port "$dash_port" --no-open) - echo "Starting hermes dashboard on ${dash_host}:${dash_port} (background)" - # Prefix dashboard output so it's distinguishable from the main - # process in `docker logs`. stdbuf keeps the pipe line-buffered. - ( - stdbuf -oL -eL hermes dashboard "${dash_args[@]}" 2>&1 \ - | sed -u 's/^/[dashboard] /' - ) & - ;; -esac - -# Final exec: two supported invocation patterns. -# -# docker run -> exec `hermes` with no args (legacy default) -# docker run chat -q "..." -> exec `hermes chat -q "..."` (legacy wrap) -# docker run sleep infinity -> exec `sleep infinity` directly -# docker run bash -> exec `bash` directly -# -# If the first positional arg resolves to an executable on PATH, we assume the -# caller wants to run it directly (needed by the launcher which runs long-lived -# `sleep infinity` sandbox containers — see tools/environments/docker.py). -# Otherwise we treat the args as a hermes subcommand and wrap with `hermes`, -# preserving the documented `docker run ` behavior. -if [ $# -gt 0 ] && command -v "$1" >/dev/null 2>&1; then - exec "$@" -fi -exec hermes "$@" +# Deprecation: this shim is preserved for one release cycle to give +# downstream users time to migrate their wrappers to the image's real +# ENTRYPOINT (`/init`). It will be removed in a future major release. +# Surface a warning to stderr so anyone still invoking this path +# sees the migration notice in their logs. +echo "[hermes] WARNING: docker/entrypoint.sh is a deprecated shim under " \ + "s6-overlay. The container's real ENTRYPOINT is /init + " \ + "main-wrapper.sh; this script only runs the stage2 cont-init hook " \ + "and does NOT exec the CMD. If you hard-coded docker/entrypoint.sh " \ + "as your ENTRYPOINT, drop the override — docker will use the image's " \ + "default ENTRYPOINT (/init), which handles bootstrap AND CMD." >&2 +exec /opt/hermes/docker/stage2-hook.sh "$@" diff --git a/docker/main-wrapper.sh b/docker/main-wrapper.sh new file mode 100755 index 00000000000..0e25e5adf91 --- /dev/null +++ b/docker/main-wrapper.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# /opt/hermes/docker/main-wrapper.sh — wraps the container's CMD with +# the same argument-routing logic the pre-s6 entrypoint.sh used. Runs +# as /init's "main program" (Docker CMD) so it inherits stdin/stdout/ +# stderr from the container. +# +# Routing: +# no args → exec `hermes` (the default) +# first arg is an executable → exec it directly (sleep, bash, sh, …) +# first arg is anything else → exec `hermes ` (subcommand passthrough) +# +# We drop to the hermes user via `s6-setuidgid` so the supervised +# workload runs unprivileged (UID 10000 by default). +set -e + +cd /opt/data +# shellcheck disable=SC1091 +. /opt/hermes/.venv/bin/activate + +if [ $# -eq 0 ]; then + exec s6-setuidgid hermes hermes +fi + +if command -v "$1" >/dev/null 2>&1; then + # Bare executable — pass through directly. + exec s6-setuidgid hermes "$@" +fi + +# Hermes subcommand pass-through. +exec s6-setuidgid hermes hermes "$@" diff --git a/docker/s6-rc.d/dashboard/dependencies.d/base b/docker/s6-rc.d/dashboard/dependencies.d/base new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/s6-rc.d/dashboard/finish b/docker/s6-rc.d/dashboard/finish new file mode 100755 index 00000000000..a618c671bc8 --- /dev/null +++ b/docker/s6-rc.d/dashboard/finish @@ -0,0 +1,30 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Dashboard finish script. Companion to ./run. +# +# When HERMES_DASHBOARD is unset (or falsy), ./run exits 0 immediately. +# Without this finish script, s6-supervise would just restart the run +# script in a tight loop. By exiting 125 here, we tell s6-supervise +# "this service has permanently failed; do not restart" — equivalent +# to `s6-svc -O`. The supervise slot reports as down, matching reality +# (no dashboard process is running). +# +# When HERMES_DASHBOARD IS enabled and the run script later exits or +# is killed, we want s6-supervise to restart it (the whole point of +# supervised lifecycle). So we exit non-125 in that case. + +# Arguments passed to a finish script: $1=run-exit-code, $2=signal-num, +# $3=service-dir-name, $4=run-pgid. See servicedir(7). + +case "${HERMES_DASHBOARD:-}" in + 1|true|TRUE|True|yes|YES|Yes) + # Dashboard was enabled — let s6-supervise restart on crash by + # exiting non-125. (Pass-through any sensible default.) + exit 0 + ;; + *) + # Dashboard disabled — permanent-failure marker so s6-supervise + # leaves the slot in 'down' state and s6-svstat reflects that. + exit 125 + ;; +esac \ No newline at end of file diff --git a/docker/s6-rc.d/dashboard/run b/docker/s6-rc.d/dashboard/run new file mode 100755 index 00000000000..a48e8995dfc --- /dev/null +++ b/docker/s6-rc.d/dashboard/run @@ -0,0 +1,40 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Dashboard service. Always declared so s6 has a supervised slot; if +# HERMES_DASHBOARD isn't truthy the run script exits cleanly and the +# companion finish script returns 125 (s6's "permanent failure, do +# not restart" marker), so s6-svstat reports the slot as down. See +# also docker/s6-rc.d/dashboard/finish. + +case "${HERMES_DASHBOARD:-}" in + 1|true|TRUE|True|yes|YES|Yes) ;; + *) + # Exit 0; the finish script will exit 125 → s6-supervise won't + # restart us and the slot reports down. Using a clean exit + # (rather than `exec sleep infinity`) means s6-svstat reflects + # reality: when HERMES_DASHBOARD is unset, the service is NOT + # running, just supervised-with-permanent-failure. See PR + # #30136 review item I3. + exit 0 + ;; +esac + +cd /opt/data +# shellcheck disable=SC1091 +. /opt/hermes/.venv/bin/activate + +dash_host="${HERMES_DASHBOARD_HOST:-0.0.0.0}" +dash_port="${HERMES_DASHBOARD_PORT:-9119}" + +# Binding to anything other than localhost requires --insecure — the +# dashboard refuses otherwise because it exposes API keys. Inside a +# container this is the expected deployment. +insecure="" +case "$dash_host" in + 127.0.0.1|localhost) ;; + *) insecure="--insecure" ;; +esac + +# shellcheck disable=SC2086 # word-splitting of $insecure is intentional +exec s6-setuidgid hermes hermes dashboard \ + --host "$dash_host" --port "$dash_port" --no-open $insecure diff --git a/docker/s6-rc.d/dashboard/type b/docker/s6-rc.d/dashboard/type new file mode 100644 index 00000000000..5883cff0cd1 --- /dev/null +++ b/docker/s6-rc.d/dashboard/type @@ -0,0 +1 @@ +longrun diff --git a/docker/s6-rc.d/main-hermes/dependencies.d/base b/docker/s6-rc.d/main-hermes/dependencies.d/base new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/s6-rc.d/main-hermes/run b/docker/s6-rc.d/main-hermes/run new file mode 100755 index 00000000000..488e5251415 --- /dev/null +++ b/docker/s6-rc.d/main-hermes/run @@ -0,0 +1,27 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Main hermes service. +# +# IMPORTANT — this is NOT how the user's CMD runs. +# +# We chose Architecture B from the plan: the container's CMD (the bare +# command the user passes to `docker run …`) runs as /init's +# "main program" via Docker's CMD mechanism, NOT as an s6-supervised +# service. This is the canonical s6-overlay pattern for "container +# exits when the program exits" semantics, and it lets us preserve +# every pre-s6 invocation contract (chat passthrough, sleep infinity, +# bash, --tui) without re-implementing argument routing through +# /run/s6/container_environment. +# +# So why does this service exist at all? Two reasons: +# 1. s6-rc requires at least one user service for the "user" bundle +# to be valid. We can't ship an empty bundle. +# 2. Future work may want to supervise a long-lived hermes process +# (e.g. for gateway-server containers); having the slot already +# wired in keeps that change small. +# +# For now this service is a no-op: it sleeps forever, doing nothing. +# The dashboard runs as a real s6 service alongside it (see +# ../dashboard/run) and per-profile gateways register dynamically via +# /run/service/ at runtime (Phase 4). +exec sleep infinity diff --git a/docker/s6-rc.d/main-hermes/type b/docker/s6-rc.d/main-hermes/type new file mode 100644 index 00000000000..5883cff0cd1 --- /dev/null +++ b/docker/s6-rc.d/main-hermes/type @@ -0,0 +1 @@ +longrun diff --git a/docker/s6-rc.d/user/contents.d/dashboard b/docker/s6-rc.d/user/contents.d/dashboard new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/s6-rc.d/user/contents.d/main-hermes b/docker/s6-rc.d/user/contents.d/main-hermes new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docker/stage2-hook.sh b/docker/stage2-hook.sh new file mode 100755 index 00000000000..6a5bedc9f6d --- /dev/null +++ b/docker/stage2-hook.sh @@ -0,0 +1,134 @@ +#!/bin/sh +# s6-overlay stage2 hook — runs as root after the supervision tree is +# up but before user services start. Handles UID/GID remap, volume +# chown, config seeding, and skills sync. +# +# Per-service privilege drop happens inside each service's `run` script +# (and in main-wrapper.sh) via s6-setuidgid, not here. +# +# Wired into the image as /etc/cont-init.d/01-hermes-setup by the +# Dockerfile. The shim at docker/entrypoint.sh forwards to this script +# so external references to docker/entrypoint.sh still work. +# +# NB: cont-init.d scripts run with no arguments — the user's CMD args +# are NOT visible here. That's fine: we use Architecture B (s6-overlay +# main-program model), so main-wrapper.sh runs the CMD with full +# stdin/stdout/stderr access and handles arg parsing there. + +set -eu + +HERMES_HOME="${HERMES_HOME:-/opt/data}" +INSTALL_DIR="/opt/hermes" + +# --- UID/GID remap --- +if [ -n "${HERMES_UID:-}" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then + echo "[stage2] Changing hermes UID to $HERMES_UID" + usermod -u "$HERMES_UID" hermes +fi +if [ -n "${HERMES_GID:-}" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then + echo "[stage2] Changing hermes GID to $HERMES_GID" + # -o allows non-unique GID (e.g. macOS GID 20 "staff" may already + # exist as "dialout" in the Debian-based container image). + groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true +fi + +# --- Fix ownership of data volume --- +actual_hermes_uid=$(id -u hermes) +needs_chown=false +if [ -n "${HERMES_UID:-}" ] && [ "$HERMES_UID" != "10000" ]; then + needs_chown=true +elif [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then + needs_chown=true +fi +if [ "$needs_chown" = true ]; then + echo "[stage2] Fixing ownership of $HERMES_HOME to hermes ($actual_hermes_uid)" + # In rootless Podman the container's "root" is mapped to an + # unprivileged host UID — chown will fail. That's fine: the volume + # is already owned by the mapped user on the host side. + chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \ + echo "[stage2] Warning: chown failed (rootless container?) — continuing" + # The .venv must also be re-chowned when UID is remapped, otherwise + # lazy_deps.py cannot install platform packages (discord.py, etc.). + chown -R hermes:hermes "$INSTALL_DIR/.venv" 2>/dev/null || \ + echo "[stage2] Warning: chown .venv failed (rootless container?) — continuing" +fi + +# Always reset ownership of $HERMES_HOME/profiles to hermes on every +# boot. Profile dirs and files can land owned by root when commands +# are invoked via `docker exec hermes …` (which defaults +# to root unless `-u` is passed), and that breaks the cont-init +# reconciler (02-reconcile-profiles) which runs as hermes and walks +# the profiles dir. Idempotent; skipped on rootless containers where +# chown would fail. +if [ -d "$HERMES_HOME/profiles" ]; then + chown -R hermes:hermes "$HERMES_HOME/profiles" 2>/dev/null || true +fi + +# --- config.yaml permissions --- +# Ensure config.yaml is readable by the hermes runtime user even if it +# was edited on the host after initial ownership setup. +if [ -f "$HERMES_HOME/config.yaml" ]; then + chown hermes:hermes "$HERMES_HOME/config.yaml" 2>/dev/null || true + chmod 640 "$HERMES_HOME/config.yaml" 2>/dev/null || true +fi + +# --- Seed directory structure as hermes user --- +# Run as hermes via s6-setuidgid so dirs end up owned correctly (matters +# under rootless Podman where chown back to root would fail). +# +# Use direct `mkdir -p` invocation (no `sh -c "..."` wrapper) so the +# shell isn't a second interpreter — defends against $HERMES_HOME values +# containing shell metacharacters. PR #30136 review item O2. +s6-setuidgid hermes mkdir -p \ + "$HERMES_HOME/cron" \ + "$HERMES_HOME/sessions" \ + "$HERMES_HOME/logs" \ + "$HERMES_HOME/hooks" \ + "$HERMES_HOME/memories" \ + "$HERMES_HOME/skills" \ + "$HERMES_HOME/skins" \ + "$HERMES_HOME/plans" \ + "$HERMES_HOME/workspace" \ + "$HERMES_HOME/home" + +# --- Install-method stamp (read by detect_install_method() in hermes status) --- +# Preserved from the tini-era entrypoint (PR #27843). Must be written as +# the hermes user so ownership matches the file's documented owner. +# tee is invoked directly via s6-setuidgid (no `sh -c` wrapper) for the +# same shell-metacharacter safety described above. +printf 'docker\n' | s6-setuidgid hermes tee "$HERMES_HOME/.install_method" >/dev/null \ + || true + +# --- Seed config files (only on first boot) --- +seed_one() { + dest=$1 + src=$2 + if [ ! -f "$HERMES_HOME/$dest" ] && [ -f "$INSTALL_DIR/$src" ]; then + s6-setuidgid hermes cp "$INSTALL_DIR/$src" "$HERMES_HOME/$dest" + fi +} +seed_one ".env" ".env.example" +seed_one "config.yaml" "cli-config.yaml.example" +seed_one "SOUL.md" "docker/SOUL.md" + +# auth.json: bootstrap from env on first boot only. Same semantics as the +# pre-s6 entrypoint — the [ ! -f ] guard is critical to avoid clobbering +# rotated refresh tokens on container restart. +if [ ! -f "$HERMES_HOME/auth.json" ] && [ -n "${HERMES_AUTH_JSON_BOOTSTRAP:-}" ]; then + printf '%s' "$HERMES_AUTH_JSON_BOOTSTRAP" > "$HERMES_HOME/auth.json" + chown hermes:hermes "$HERMES_HOME/auth.json" 2>/dev/null || true + chmod 600 "$HERMES_HOME/auth.json" +fi + +# --- Sync bundled skills --- +# Invoke the venv's python by absolute path so we don't need a `sh -c` +# wrapper to source the activate script. This is safe because +# skills_sync.py doesn't depend on any environment exports beyond what +# the python binary's own bin-stub already sets up (sys.path is rooted +# at the venv's site-packages by virtue of running .venv/bin/python). +if [ -d "$INSTALL_DIR/skills" ]; then + s6-setuidgid hermes "$INSTALL_DIR/.venv/bin/python" "$INSTALL_DIR/tools/skills_sync.py" \ + || echo "[stage2] Warning: skills_sync.py failed; continuing" +fi + +echo "[stage2] Setup complete; starting user services" diff --git a/docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md b/docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md new file mode 100644 index 00000000000..1f00dc94bba --- /dev/null +++ b/docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md @@ -0,0 +1,434 @@ +# s6-overlay Supervision for Per-Profile Gateways in Docker — Implementation Plan + +> **Status: shipped.** Phases 0–5 landed via PR +> [NousResearch/hermes-agent#30136](https://github.com/NousResearch/hermes-agent/pull/30136) +> in May 2026. This document is preserved as a post-implementation reference +> for the architecture and the resolved design questions. The phase-by-phase +> TDD walkthrough (≈2,800 lines) and the v2/v3 re-validation preambles have +> been removed — the canonical implementation history is the PR commit log +> (`git log --oneline a957ef083..a6f7171a5 -- 'docker/*' 'hermes_cli/service_manager.py' …`). +> Open Questions are collapsed into a single Decision Log table; full +> deliberations live in PR review comments. + +**Goal:** Replace `tini` with s6-overlay as PID 1 in the Hermes Docker image so +that the main hermes process, the dashboard, and dynamically-created +per-profile gateways all run as supervised services (auto-restart on crash, +clean shutdown, signal forwarding, zombie reaping). Preserve every existing +`docker run …` invocation pattern — including interactive TUI. + +**Architecture:** s6-overlay's `/init` is the container ENTRYPOINT, running +s6-svscan as PID 1. Main hermes and the dashboard are declared as static +s6-rc services at image build time. Per-profile gateways — which users create +*after* the image is built (`hermes profile create coder` → +`coder gateway start`) — are registered dynamically by writing service +directories under a scandir watched by s6-svscan. A `ServiceManager` protocol +abstracts the install/start/stop/restart surface across the init systems we +care about (systemd on Linux host, launchd on macOS host, Scheduled Tasks on +native Windows host, s6 inside container) and adds a second tier for runtime +service registration that only s6 implements. + +**Tech Stack:** + +- [s6-overlay](https://github.com/just-containers/s6-overlay) v3.2.3.0 + (noarch + per-arch tarballs ~15 MB). SHA256-pinned via build ARGs; + multi-arch via `TARGETARCH` (amd64 → `x86_64`, arm64 → `aarch64`). +- Debian 13.4 base image (unchanged). +- [hadolint](https://github.com/hadolint/hadolint) for the Dockerfile + + [shellcheck](https://github.com/koalaman/shellcheck) for entrypoint scripts. +- Python subprocess wrappers for `s6-svc`, `s6-svstat`, `s6-svscanctl`. +- Existing systemd/launchd/windows surface in `hermes_cli/gateway.py` and + `hermes_cli/gateway_windows.py`. + +**Scope:** + +- Container-only (host-side systemd/launchd/windows behavior is preserved, + not modified). +- s6-overlay only (no pure-Python fallback). +- Architecture A (s6 owns PID 1; tini is removed). +- Interactive TUI must keep working: + `docker run -it --rm nousresearch/hermes-agent:latest --tui`. +- Dynamic registration is limited to per-profile gateways — one service per + profile, created when a profile is created, torn down when deleted. A + `gateway-default` slot is always registered for the root HERMES_HOME + profile so `hermes gateway start` (no `-p`) has somewhere to land. + +**Out of scope:** + +- Host-side dynamic supervision (systemd-run / launchd transient plists) — + not needed. +- Pure-Python supervisor fallback — not needed. +- Arbitrary user-defined supervised processes inside the container — only + profile gateways. +- Migration of existing per-profile systemd unit generation to s6 on the + host side. +- Non-Docker container runtimes (Podman rootless validated reactively). +- UX polish around in-container profile lifecycle (e.g. a nice status view + of all supervised profile gateways) — deferred to follow-up. + +--- + +## Background From The Codebase + +> **Note on line numbers:** This section refers to functions and structures +> by name only. Use `grep -n 'def ' ` to locate anything below +> if you need the current line. + +### Pre-s6 container init (what we replaced) + +The original `Dockerfile` declared +`ENTRYPOINT [ "/usr/bin/tini", "-g", "--", "/opt/hermes/docker/entrypoint.sh" ]`. +tini was PID 1, reaped zombies, forwarded SIGTERM to the process group. The +old `docker/entrypoint.sh`: + +1. `gosu` privilege drop from root → `hermes` UID. +2. Copied `.env.example`, `cli-config.yaml.example`, `SOUL.md` into + `$HERMES_HOME` if missing. +3. Synced bundled skills via `tools/skills_sync.py`. +4. Optionally backgrounded `hermes dashboard` in a subshell when + `HERMES_DASHBOARD=1` — **not supervised**, no restart. +5. `exec hermes "$@"` — tini's sole direct child. + +Known limitations: dashboard crash → stays dead; dashboard fails at startup → +silent; gateway crash → dashboard dies too. The May 4, 2026 decision was +"leave as is" because nothing in the container needed supervision then. +Adding per-profile gateway supervision changed that. + +### ServiceManager surface (what we wrapped, not refactored) + +All init-system logic lives in **`hermes_cli/gateway.py`** (~5,400 LOC at +re-validation). The systemd/launchd code is ~1,500 lines of that, plus a +separate **`hermes_cli/gateway_windows.py`** (~690 LOC) for Windows +Scheduled Tasks. + +| Layer | Systemd functions | Launchd functions | Windows functions | +|---|---|---|---| +| **Detection** | `supports_systemd_services()`, `_systemd_operational()`, `_wsl_systemd_operational()`, `_container_systemd_operational()` | `is_macos()` | `is_windows()`, `gateway_windows.is_installed()` | +| **Paths** | `get_systemd_unit_path(system)`, `get_service_name()` | `get_launchd_plist_path()`, `get_launchd_label()` | `gateway_windows.get_task_name()`, `get_task_script_path()`, `get_startup_entry_path()` | +| **Install/lifecycle** | `systemd_install(force, system, run_as_user)`, `systemd_uninstall(system)`, `systemd_start/stop/restart(system)` | `launchd_install(force)`, `launchd_uninstall/start/stop/restart` | `gateway_windows.install/uninstall/start/stop/restart` | +| **Probes** | `_probe_systemd_service_running(system)`, `_read_systemd_unit_properties(system)`, `_wait_for_systemd_service_restart`, `_recover_pending_systemd_restart` | `_probe_launchd_service_running()` | `gateway_windows.is_task_registered()`, `_pid_exists` helper | +| **D-Bus plumbing** | `_ensure_user_systemd_env`, `_user_systemd_socket_ready`, `_user_systemd_private_socket_path`, `get_systemd_linger_status` | — | — | +| **Unit/plist generation** | `generate_systemd_unit(system, run_as_user)`, `systemd_unit_is_current`, `refresh_systemd_unit_if_needed` | plist templating in `launchd_install` | `_build_gateway_cmd_script`, `_build_startup_launcher`, `_write_task_script` | + +Container-relevant callers outside `gateway.py`: + +- `hermes_cli/status.py` — gained an `s6` branch for in-container runs. +- `hermes_cli/profiles.py` — `create_profile` / `delete_profile` register and + unregister with s6 inside the container (no-op on host). +- `hermes_cli/doctor.py` — `_check_gateway_service_linger` skips on s6, and a + new "Service Supervisor" section reports main-hermes / dashboard / + profile-gateway counts via the ServiceManager. +- `hermes_cli/gateway.py::gateway_command` — the + `elif is_container():` rejection arms that refused gateway lifecycle + operations were removed; the `_dispatch_via_service_manager_if_s6` helper + intercepts start/stop/restart and routes them through s6. + +### Per-profile gateway spawning + +`hermes gateway start`, `coder gateway start` (profile alias), and +`hermes -p gateway start` all spawn a gateway process scoped to a +given profile. See +[Profiles: Running Gateways](https://hermes-agent.nousresearch.com/docs/user-guide/profiles#running-gateways). +On host, lifecycle is managed via per-profile systemd units +(`hermes-gateway-.service`); inside the container, an s6 service at +`/run/service/gateway-/` is registered when the profile is created and +torn down when it's deleted. + +**Persistence across container restart:** `/run/service/` is tmpfs — +service registrations are wiped when the container restarts. Profile +directories at `/opt/data/profiles//` live on the persistent VOLUME, +and each one records its gateway's last state in `gateway_state.json`. +`/etc/cont-init.d/02-reconcile-profiles` walks the persistent profiles on +every container boot, recreates the s6 service slots via +`hermes_cli/container_boot.py`, and auto-starts those whose last recorded +state was `running`. Profiles whose last state was `stopped`, +`startup_failed`, `starting`, or absent get their slot recreated in the +`down` state and wait for explicit user action. `docker restart` is therefore +invisible to a user with running profile gateways: they come back up; +stopped ones stay stopped. + +### s6-overlay constraints + +- **Root/non-root model:** `/init` runs as root to set up the supervision + tree, install signal handlers, and run the stage2 hook that does + `usermod`/`chown`. Each supervised service drops to UID 10000 via + `s6-setuidgid hermes` in its `run` script. The per-service `s6-supervise` + monitor stays root so it can signal its child regardless of UID. Net + effect: hermes and all its subprocesses run as UID 10000 exactly as + before; only the supervision tree itself runs as root. +- v3.2.3.0 has limited non-root support for running `/init` itself as + non-root — some tools (`fix-attrs`, `logutil-service`) assume root. We + don't hit this because `/init` runs as root. +- Scandir hard cap: `services_max` default 1000, configurable to 160,000. +- `/command/with-contenv` sources `/run/s6/container_environment/*` into + service env — convenient for passing `HERMES_HOME` etc. +- s6 signal semantics: service crash triggers `s6-supervise` restart after + 1s; override with a `finish` script. +- Zombie reaping: PID 1 (s6-svscan) reaps all zombies non-blockingly on + SIGCHLD. Any subagent subprocess spawned by the main hermes process is + reaped automatically. + +--- + +## Key Design Decisions + +### D1. s6-overlay replaces tini entirely + +Container ENTRYPOINT is `/init`, PID 1 is s6-svscan. The main hermes +process, the dashboard, and every per-profile gateway run as supervised +services. This is a single breaking change to the container contract. + +### D2. Main hermes is an s6 service with container-exit semantics + +The contract "container exits when `hermes` exits" is preserved via a +service `finish` script that writes to +`/run/s6-linux-init-container-results/exitcode` and calls +`/run/s6/basedir/bin/halt`. All five supported invocations work: + +| `docker run …` | Behavior | +|---|---| +| (no args) | `hermes` with no args, container exits when hermes exits | +| `chat -q "..."` | `hermes chat -q "..."`, container exits with hermes exit code | +| `sleep infinity` | `sleep infinity` directly (long-lived sandbox mode) | +| `bash` | interactive `bash` directly | +| `docker run -it … --tui` | interactive Ink TUI with real TTY — see D9 | + +`docker/main-wrapper.sh` detects whether `$1` is an executable on PATH and +routes either to "run this as a one-shot main service" or "wrap with +hermes". + +### D3. Static services at build time; dynamic (per-profile) services at runtime + +s6 offers two mechanisms: + +- **s6-rc** (declarative, compile-then-swap): used for main hermes and the + dashboard — they're known at image build time. +- **scandir** (drop a directory + `s6-svscanctl -a`): used for per-profile + gateways — profiles are user-created after the image is built. + +Per-profile gateway service dirs live at `/run/service/gateway-/` +(tmpfs, hermes-writable). s6-svscan picks them up on rescan. + +### D4. ServiceManager protocol with two methods for runtime registration + +Host paths (systemd, launchd, Windows Scheduled Tasks) need only +install/start/stop/restart of pre-declared services. Inside the container, +we additionally need to register services at runtime when a profile is +created. The protocol exposes this directly: + +```python +class ServiceManager(Protocol): + kind: ServiceManagerKind # "systemd" | "launchd" | "windows" | "s6" | "none" + + # Lifecycle of an already-declared service + def start(self, name: str) -> None: ... + def stop(self, name: str) -> None: ... + def restart(self, name: str) -> None: ... + def is_running(self, name: str) -> bool: ... + + # Runtime registration (container-only; hosts raise NotImplementedError) + def supports_runtime_registration(self) -> bool: ... + def register_profile_gateway( + self, profile: str, *, + extra_env: dict[str, str] | None = None, + ) -> None: ... + def unregister_profile_gateway(self, profile: str) -> None: ... + def list_profile_gateways(self) -> list[str]: ... +``` + +Systemd, launchd, and Windows backends raise `NotImplementedError` on the +registration methods. Only the s6 backend implements them. Callers check +`supports_runtime_registration()` before calling. + +The scope is intentionally narrow: it's specifically "register/unregister a +profile gateway," not a general-purpose process-management API. + +### D5. Per-profile gateway service spec is fixed, not user-provided + +Every profile gateway has the same command shape +(`hermes -p gateway run`, or `hermes gateway run` for the default +profile). The s6 backend generates the `run` script from a fixed template +given the profile name — no arbitrary command list. This keeps the API +surface tight and prevents callers from accidentally registering +non-gateway services. + +Port selection is governed by the profile's `config.yaml` +(`[gateway] port = …`) — the single source of truth. (The original plan +proposed a Python-side SHA-256 port allocator with a 600-port range; it was +retired during PR review because it was dead code through the entire stack.) + +### D6. Add detect_service_manager() alongside supports_systemd_services() + +`supports_systemd_services()` stays as-is (host code paths unchanged). A new +`detect_service_manager() -> Literal["systemd", "launchd", "windows", "s6", "none"]` +composes existing detection functions (`is_macos()`, `is_windows()`, +`supports_systemd_services()`, `is_container()` + `_s6_running()`) and adds +an s6 branch for container detection. Host call sites continue to use the +existing functions; container-only code (the profile hooks) uses the new one. + +`_s6_running()` probes `/proc/1/comm` (world-readable) and +`/run/s6/basedir`. The earlier `/proc/1/exe` probe was root-only readable +and silently failed for the unprivileged hermes user (UID 10000), making +the entire runtime-registration path inert in production — caught in PR +review. + +### D7. Wrap existing systemd/launchd/windows functions, don't rewrite them + +`SystemdServiceManager` / `LaunchdServiceManager` / `WindowsServiceManager` +are thin adapters over the existing `systemd_*` / `launchd_*` module-level +functions in `hermes_cli/gateway.py` and the +`gateway_windows.install/uninstall/start/stop/restart/is_installed` +functions in `hermes_cli/gateway_windows.py`. We get the abstraction +without rewriting ~2,200 LOC of working code. + +### D8. Profile create/delete hooks register/unregister the s6 service + +When `hermes profile create ` runs inside the container, the +profile-creation code path calls +`ServiceManager.register_profile_gateway()` if +`supports_runtime_registration()` is True. When `hermes profile delete +` runs, it calls `unregister_profile_gateway()`. On host, both +calls are no-ops (registration not supported; existing systemd unit +generation continues to handle install/uninstall). + +Existing per-profile `hermes -p gateway start/stop/restart` CLI +commands continue to work — in the container they dispatch to +`ServiceManager.start/stop/restart("gateway-")`, which translates +to `s6-svc -u`/`-d`/`-t` on the service dir. + +`hermes gateway start` (no `-p`) targets a special `gateway-default` slot +that's always registered by the cont-init reconciler. Its run script omits +the `-p` flag and runs against the root `$HERMES_HOME` profile. + +`--all` lifecycle (`hermes gateway stop --all`, `... restart --all`) +iterates `mgr.list_profile_gateways()` through s6 so s6's `want up`/`want +down` flips correctly. Without this, `--all` fell through to `pkill` +followed by s6-supervise auto-restart — net effect: kick instead of stop. + +### D9. Interactive TUI bypasses s6 service-mode and runs as CMD for TTY passthrough + +`docker run -it --rm --tui` needs a real TTY connected to container +stdin/stdout for Ink raw-mode keyboard input, cursor control, and SIGWINCH. +Running the TUI as a normal s6 service fails because s6-supervise +disconnects service stdio from the container TTY (documented: +[s6-overlay#230](https://github.com/just-containers/s6-overlay/issues/230)). + +**The pattern:** s6-overlay's `/init` execs a CMD as the container's "main +program" after the supervision tree is up. The CMD inherits +stdin/stdout/stderr from `/init` — which in `-it` mode is the container +TTY. The stage2 hook detects the TUI case and short-circuits the +main-hermes service so the hermes CMD becomes that main program. + +```sh +# In docker/stage2-hook.sh +_is_tui_invocation() { + for arg in "$@"; do + case "$arg" in --tui|-T) return 0 ;; esac + done + case "${HERMES_TUI:-}" in 1|true|TRUE|yes) return 0 ;; esac + if [ -t 0 ] && [ $# -eq 0 ]; then return 0; fi + return 1 +} +``` + +And in `docker/s6-rc.d/main-hermes/run`: + +```sh +if [ -f /var/run/s6/container_environment/HERMES_TUI_MODE ]; then + exec sleep infinity # s6-overlay will exec CMD as the TTY-connected main +fi +exec s6-setuidgid hermes hermes ${HERMES_ARGS:-} +``` + +In TUI mode main hermes is effectively unsupervised (same as the pre-s6 +behavior with tini — acceptable because the user is interactively +present). Dashboard and profile gateways still get full s6 supervision via +their separate services. + +The integration test `test_tty_passthrough_to_container` uses `tput cols` +and `COLUMNS=123` as the probe. + +--- + +## Risk Register + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| Phase 2 breaks a downstream user's Dockerfile that `FROM`s ours | Medium | Medium | Release notes call out ENTRYPOINT change; the test harness (`tests/docker/`) gives high confidence in behavior parity | +| TUI TTY passthrough fails on some Docker versions | Low | High | Harness includes `test_tty_passthrough_to_container` as a hard gate; fallback plan = s6-fdholder ([s6-overlay#230](https://github.com/just-containers/s6-overlay/issues/230) Solution 2) | +| s6-overlay non-root quirks (logutil-service, fix-attrs) bite us | Low | Low | Supervisor runs as root, services drop — sidesteps these issues | +| Podman rootless UID mapping confuses s6 | Medium | Low | Documented as supported, fix reactively; a Podman + Docker environment is stood up for validation | +| Test harness is flaky (docker daemon issues, timing) | Medium | Low | Generous timeouts; skip when docker unavailable; polling helpers replace fixed sleeps in `test_container_restart.py` | +| Profile gateway crash loop masks a real config error | Low | Medium | s6 `finish` script `max_restarts` cap (planned follow-up); operators see crash-looping logs in `$HERMES_HOME/logs/gateways//` | +| Dockerfile+entrypoint drift from linter (hadolint/shellcheck) reveals latent bugs | Low | Low | CI lint jobs catch them; fix or document ignore with rationale | +| Stale `gateway.pid` from a dead container collides with an unrelated live PID in the restarted container | Low | Medium | Cont-init reconciliation removes `gateway.pid` and `processes.json` from every profile dir on boot, before any new gateway starts | +| `docker restart` silently loses per-profile gateway registrations (tmpfs scandir wiped) | High (without mitigation) | High | Cont-init reconciliation re-registers from persistent `$HERMES_HOME/profiles/` and auto-starts those last seen `running`; outcome recorded to `$HERMES_HOME/logs/container-boot.log` (size-bounded, rotates to `.1` at 256 KiB) | +| A `running` gateway that's actually broken auto-restarts into a crash loop after every container restart | Low | Medium | s6 `finish` script `max_restarts` cap (planned); follow-up: `hermes doctor` alerts when N consecutive container restarts ended in `startup_failed` | +| `_s6_running()` detection works as root but silently fails for unprivileged hermes user, making runtime-registration path inert | High (without mitigation) | High | **Caught in PR review.** Detection now probes `/proc/1/comm` (world-readable) + `/run/s6/basedir`. Docker integration tests refactored to `docker exec -u hermes` so the realistic runtime user is exercised | +| `s6-svscanctl` from hermes hits EACCES on the root-owned control FIFO | Medium | Medium | `02-reconcile-profiles` chowns `/run/service/.s6-svscan/{control,lock}` to hermes after stage1 creates them | +| Per-service `supervise/control` FIFO is root-owned by s6-supervise, blocking `s6-svc` from hermes | Known | Medium | Surfaced cleanly as `S6CommandError` (with rc + stderr) instead of raw `CalledProcessError`. Permission fix tracked as a follow-up (small SUID helper, polling chown loop in cont-init.d, or replace `s6-svc` with `down`-marker manipulation) | + +--- + +## Decision Log + +| # | Question | Decision | +|---|---|---| +| OQ1 | Gate Phase 2 behind env var? | Ship directly (Hermes is pre-1.0; users can pin the previous image) | +| OQ2 | s6 root model | Root `/init`, drop per-service via `s6-setuidgid hermes` | +| OQ3 | Dashboard opt-in mechanism | Always declared as an s6 service; `03-dashboard-toggle` cont-init script writes a `down` marker when `HERMES_DASHBOARD` is unset so `s6-svstat` reports the slot's real state | +| OQ4 | Podman rootless | Supported, fix reactively | +| OQ5 | Service naming | `gateway-` (matches pre-existing `hermes-gateway-.service` systemd convention) | +| OQ6 | — (retired; no subagent gateways in scope) | — | +| OQ7 | Resource limits per profile gateway | Defer (no per-cgroup limits; rely on the container's overall limit) | +| OQ8 | Log persistence | `$HERMES_HOME/logs/gateways//`. The log path is sourced from runtime `$HERMES_HOME` via `with-contenv`, NOT Python-substituted at registration time | +| OQ9 | TUI passthrough | Trust the documented [s6-overlay#230](https://github.com/just-containers/s6-overlay/issues/230) Solution 1; harness includes a TTY passthrough hard-gate test | + +**Post-merge additions from PR #30136 review:** + +- **Multi-arch tarballs:** `TARGETARCH` mapped to `x86_64` / `aarch64`; + per-arch tarball fetched via `curl` because `ADD` doesn't honor BuildKit + args. +- **SHA256 verification:** all three tarballs (noarch, symlinks, per-arch) + pinned via build ARGs and verified with `sha256sum -c` against a single + checksum file (avoids hadolint DL4006 piped-shell warning). +- **`gateway-default` slot:** always registered by the reconciler so + `hermes gateway start` (no `-p`) has somewhere to land. +- **Friendly lifecycle errors:** `GatewayNotRegisteredError` and + `S6CommandError` translate `CalledProcessError` into actionable CLI + messages. +- **Atomic publication in the reconciler:** mirrors + `register_profile_gateway`'s tmp+rename pattern. +- **`container-boot.log` rotation:** 256 KiB soft cap, rotated to `.1`. +- **`port` parameter retired:** allocator + kwarg were dead code through + the entire stack; `config.yaml` is the single source of truth. + +--- + +## Verification Checklist + +- [x] Test harness (`tests/docker/`) passes against the s6 image +- [x] hadolint + shellcheck run green in CI +- [x] `docker run -it --rm hermes-agent --tui` starts the Ink TUI with + working keyboard input, cursor control, and resize (SIGWINCH) +- [x] Dashboard crashes are recovered by s6 within ~2s +- [x] `hermes profile create test` inside a container creates + `/run/service/gateway-test/` +- [x] `hermes -p test gateway start` inside a container dispatches through s6 +- [x] `hermes -p test gateway stop` inside a container cleanly stops via s6 +- [x] `hermes profile delete test` inside a container removes + `/run/service/gateway-test/` +- [x] Profile gateway logs persist at + `$HERMES_HOME/logs/gateways/test/current` +- [x] `hermes status` inside the container shows `Manager: s6` +- [x] `hermes gateway start` (no `-p`) inside a container targets + `gateway-default` and runs against the root profile +- [x] `hermes gateway stop --all` / `... restart --all` iterate every + profile gateway under s6 instead of pkill-then-supervise-restart +- [x] `docker restart` survives per-profile gateway registrations via the + cont-init reconciler; running gateways come back up, stopped ones + stay down +- [x] Multi-arch image builds for both `linux/amd64` and `linux/arm64` +- [x] s6-overlay tarballs are SHA256-verified at build time +- [x] No systemd/launchd host-side functions were modified (only wrapped) +- [x] `hermes gateway install/start/stop` on Linux host and macOS host + behave identically to pre-change diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 9f457d741d0..61f46935bc5 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -658,7 +658,8 @@ DEFAULT_CONFIG = { # are owned by your host user instead of root, which avoids needing # `sudo chown` after container runs. Default off to preserve behavior # for images whose entrypoints expect to start as root (e.g. the - # bundled Hermes image, which drops to the `hermes` user via gosu). + # bundled Hermes image, which drops to the `hermes` user via + # s6-setuidgid inside each supervised service). # When on, SETUID/SETGID caps are omitted from the container since # no privilege drop is needed. "docker_run_as_host_user": False, diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py new file mode 100644 index 00000000000..739f1e95fc3 --- /dev/null +++ b/hermes_cli/container_boot.py @@ -0,0 +1,325 @@ +"""Container-boot reconciliation of per-profile gateway s6 services. + +Service directories under /run/service/ live on **tmpfs** and are wiped +on every container restart. Profile directories under +``$HERMES_HOME/profiles//`` live on the persistent VOLUME, and +each one records its gateway's last state in ``gateway_state.json``. +This module bridges the two: on every container boot, walk the +persistent profiles, recreate the s6 service slots, and auto-start +only those whose last recorded state was ``running``. + +Wired into the image as /etc/cont-init.d/02-reconcile-profiles by the +Dockerfile (Phase 4 Task 4.0). Runs as root after 01-hermes-setup +(the stage2 hook) has chowned the volume and seeded $HERMES_HOME, but +before s6-rc starts user services. + +Without this module, every ``docker restart`` would silently wipe +every per-profile gateway, even though the user's profiles still +exist on disk. +""" +from __future__ import annotations + +import json +import logging +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Literal + +log = logging.getLogger(__name__) + +# Only this prior state triggers automatic restart. Everything else +# (startup_failed, starting, stopped, missing) registers the slot in +# the down state and waits for explicit user action — this avoids the +# crash-loop where a broken gateway keeps being restarted across +# `docker restart` cycles. +_AUTOSTART_STATES = frozenset({"running"}) + +# Stale runtime files we sweep before recreating service slots. These +# all hold container-namespaced state (PIDs, process tables) that's +# garbage post-restart — a numerically-equal PID in the new container +# is a different process. See the Risk Register in the plan. +_STALE_RUNTIME_FILES = ("gateway.pid", "processes.json") + +ReconcileActionLabel = Literal["started", "registered", "skipped"] + + +@dataclass(frozen=True) +class ReconcileAction: + """One profile's outcome from a single reconciliation pass.""" + profile: str + prior_state: str | None + action: ReconcileActionLabel + + +def reconcile_profile_gateways( + *, + hermes_home: Path, + scandir: Path, + dry_run: bool = False, +) -> list[ReconcileAction]: + """Recreate s6 service registrations for every persistent profile. + + Always registers a ``gateway-default`` slot for the root profile + (the implicit profile that lives at the top of ``$HERMES_HOME``, + not under ``profiles/``). The dispatcher in ``hermes_cli.gateway`` + maps an empty profile suffix to ``gateway-default``, so this slot + is what ``hermes gateway start`` (no ``-p``) targets. Without it, + bare ``hermes gateway start`` inside the container would land on + ``s6-svc -u /run/service/gateway-default`` → uncaught + ``CalledProcessError`` → traceback to the user (PR #30136 review). + + The default slot's prior state is read from + ``$HERMES_HOME/gateway_state.json`` (sibling to the profile root, + not under ``profiles/``); stale runtime files there are swept the + same way as for named profiles. + + Args: + hermes_home: The container's HERMES_HOME (typically /opt/data). + Profiles live under ``/profiles//``; + the default profile lives at ```` itself. + scandir: The s6 dynamic scandir (typically /run/service). Service + directories are created at ``/gateway-/``. + dry_run: When True, walk and return the action list without + touching the filesystem. For tests and `--dry-run` debug. + + Returns: + One :class:`ReconcileAction` per profile, in this order: + ``default`` first, then named profiles in directory order. + """ + actions: list[ReconcileAction] = [] + + # Default profile — always register, even if nothing has ever + # populated the root profile dir. The slot exists so + # ``hermes gateway start`` (no ``-p``) has somewhere to land; + # auto-up only when the prior state was "running" (same rule as + # named profiles). + default_prior_state = _read_prior_state(hermes_home) + default_should_start = default_prior_state in _AUTOSTART_STATES + if not dry_run: + _cleanup_stale_runtime_files(hermes_home) + _register_service(scandir, "default", start=default_should_start) + actions.append(ReconcileAction( + profile="default", + prior_state=default_prior_state, + action="started" if default_should_start else "registered", + )) + + profiles_root = hermes_home / "profiles" + if profiles_root.is_dir(): + for entry in sorted(profiles_root.iterdir()): + if not entry.is_dir(): + continue + # SOUL.md is always seeded by `hermes profile create` (config.yaml + # is not — that comes later via `hermes setup`). Use it as the + # "real profile" marker so stray dirs (backups, manual mkdir) + # aren't picked up. + if not (entry / "SOUL.md").exists(): + continue + # The "default" service name is reserved for the root + # profile (above) — if a user has somehow created a + # ``profiles/default/`` directory, skip it to avoid the + # slot collision. Their gateway would still be reachable + # via ``hermes -p default-named gateway start`` if they + # rename the directory; we don't try to disambiguate here. + if entry.name == "default": + log.warning( + "profiles/default/ exists — skipping to avoid colliding " + "with the reserved root-profile s6 slot", + ) + continue + + prior_state = _read_prior_state(entry) + should_start = prior_state in _AUTOSTART_STATES + + if not dry_run: + _cleanup_stale_runtime_files(entry) + _register_service(scandir, entry.name, start=should_start) + + actions.append(ReconcileAction( + profile=entry.name, + prior_state=prior_state, + action="started" if should_start else "registered", + )) + + if not dry_run: + _write_reconcile_log(hermes_home, actions) + return actions + + +def _read_prior_state(profile_dir: Path) -> str | None: + """Read gateway_state.json's ``gateway_state`` field, or None if + missing or unparseable. Unparseable counts as "no prior state" so + we don't bork the whole reconciliation on a corrupt file.""" + state_file = profile_dir / "gateway_state.json" + if not state_file.exists(): + return None + try: + return json.loads(state_file.read_text()).get("gateway_state") + except (OSError, json.JSONDecodeError): + log.warning( + "could not read %s; treating as no prior state", state_file, + ) + return None + + +def _cleanup_stale_runtime_files(profile_dir: Path) -> None: + """Remove gateway.pid and processes.json — they reference PIDs in + the dead container's process namespace and would otherwise confuse + the newly-started gateway's process-mismatch checks.""" + for name in _STALE_RUNTIME_FILES: + (profile_dir / name).unlink(missing_ok=True) + + +def _register_service(scandir: Path, profile: str, *, start: bool) -> None: + """Recreate the s6 service slot for one profile. + + Mirrors the rendering in :func:`S6ServiceManager.register_profile_gateway`, + but here we control the start state directly via the ``down`` marker + file (s6-svscan honors it on rescan). Cannot use the manager + directly because the cont-init.d phase runs as root before + s6-svscan starts scanning the dynamic scandir — the manager's + ``s6-svscanctl -a`` call would fail with no control socket. + + Atomicity: build the new layout in a sibling temp directory and + rename it into place via :meth:`Path.replace`. This matches + :meth:`S6ServiceManager.register_profile_gateway` (PR #30136 + review item O4) — even though cont-init.d runs before s6-svscan + starts scanning, an atomic publication keeps the contract uniform + between the two registration paths and protects against a + half-populated dir if the script is interrupted mid-write. + """ + import shutil + + from hermes_cli.service_manager import ( + S6ServiceManager, + _seed_supervise_skeleton, + validate_profile_name, + ) + + validate_profile_name(profile) + service_dir = scandir / f"gateway-{profile}" + tmp_dir = service_dir.with_name(service_dir.name + ".tmp") + + # Wipe any leftover tmp from a previous interrupted run. + if tmp_dir.exists(): + shutil.rmtree(tmp_dir, ignore_errors=True) + tmp_dir.mkdir(parents=True) + + try: + (tmp_dir / "type").write_text("longrun\n") + + # Reuse the manager's run-script rendering — single source of + # truth so register_profile_gateway and reconcile_profile_gateways + # stay consistent. extra_env is empty here; users who need + # per-profile env can set it via the profile's config.yaml + # (which the gateway itself loads). + run = tmp_dir / "run" + run.write_text(S6ServiceManager._render_run_script(profile, extra_env={})) + run.chmod(0o755) + + # Persistent log rotation (OQ8-C). + log_subdir = tmp_dir / "log" + log_subdir.mkdir() + log_run = log_subdir / "run" + log_run.write_text(S6ServiceManager._render_log_run(profile)) + log_run.chmod(0o755) + + # The presence of a `down` file tells s6-supervise to NOT + # start the service when s6-svscan picks it up. User brings + # it up explicitly with `hermes -p gateway start` + # (which routes through the Phase 4 + # _dispatch_via_service_manager_if_s6 helper to `s6-svc -u`). + if not start: + (tmp_dir / "down").touch() + + # Pre-create the supervise/ skeleton with hermes ownership + # BEFORE we publish the slot. Mirrors the same pre-creation + # step in S6ServiceManager.register_profile_gateway — when + # s6-svscan picks the published slot up, the s6-supervise it + # spawns will EEXIST our dirs/FIFOs and inherit hermes + # ownership, so runtime s6-svc / s6-svstat / s6-svwait calls + # (all dispatched as the hermes user) won't hit EACCES. See + # ``_seed_supervise_skeleton`` in service_manager.py for the + # full rationale. + _seed_supervise_skeleton(tmp_dir) + + # Publish atomically. Path.replace handles the existing-target + # case the same way os.rename does on POSIX: the target is + # silently replaced, so a previous reconcile pass's slot is + # cleanly overwritten in one operation. + if service_dir.exists(): + shutil.rmtree(service_dir) + tmp_dir.replace(service_dir) + except Exception: + shutil.rmtree(tmp_dir, ignore_errors=True) + raise + + +def _write_reconcile_log( + hermes_home: Path, actions: list[ReconcileAction], +) -> None: + """Append one line per profile to $HERMES_HOME/logs/container-boot.log. + + Operators inspect this to debug "why didn't my profile come back + up". Keeping a separate log file (vs. mixing into agent.log) lets + troubleshooters grep for "profile=foo" without wading through + unrelated activity. + + Size-bounded: when the file exceeds ``_LOG_ROTATE_BYTES`` + (defaults to 256 KiB ≈ 3000 reconcile lines), the current file + is renamed to ``container-boot.log.1`` (replacing any previous + rotation) before the new entries are appended. This gives long- + lived containers a soft cap of ~512 KiB across the two files + without pulling in logrotate or s6-log machinery just for this + one append-only file (PR #30136 review item O3). + """ + import time + log_dir = hermes_home / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / "container-boot.log" + + # Rotate before opening to append, so the new entries always land + # in a fresh file when we crossed the threshold last time. + try: + if log_path.exists() and log_path.stat().st_size >= _LOG_ROTATE_BYTES: + log_path.replace(log_dir / "container-boot.log.1") + except OSError as exc: + # Rotation failure is non-fatal — keep appending to the + # existing file rather than losing the entry entirely. + log.warning("could not rotate %s: %s", log_path, exc) + + ts = time.strftime("%Y-%m-%dT%H:%M:%S%z") + with log_path.open("a", encoding="utf-8") as f: + for a in actions: + f.write( + f"{ts} profile={a.profile} prior_state={a.prior_state} " + f"action={a.action}\n" + ) + + +# 256 KiB soft cap on container-boot.log; rotated to .1 when crossed. +# At ~80 B per reconcile-action line this is ~3000 lines, or about a +# year of daily reboots on a 5-profile container. Two files = ~512 KiB +# worst case. Tuned for visibility (small enough to grep / cat without +# scrolling forever) more than space (the persistent volume has GB). +_LOG_ROTATE_BYTES = 256 * 1024 + + +def main() -> int: + """Entry point invoked from /etc/cont-init.d/02-reconcile-profiles.""" + hermes_home = Path(os.environ.get("HERMES_HOME", "/opt/data")) + scandir = Path(os.environ.get("S6_PROFILE_GATEWAY_SCANDIR", "/run/service")) + actions = reconcile_profile_gateways( + hermes_home=hermes_home, scandir=scandir, + ) + for a in actions: + print( + f"reconcile: profile={a.profile} " + f"prior_state={a.prior_state} action={a.action}" + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/hermes_cli/doctor.py b/hermes_cli/doctor.py index df75ac68664..9cac0678cef 100644 --- a/hermes_cli/doctor.py +++ b/hermes_cli/doctor.py @@ -207,14 +207,69 @@ def _fail_and_issue(text: str, detail: str, fix: str, issues: list[str]) -> None issues.append(fix) +def _check_s6_supervision(issues: list[str]) -> None: + """Inside a container under our s6 /init, surface what s6 sees. + + Runs as a counterpart to :func:`_check_gateway_service_linger` for + the systemd-on-host case. No-op everywhere except in the s6 + container so host runs aren't cluttered with irrelevant output. + + Reports: + - Whether the main-hermes and dashboard static services are up + - How many per-profile gateway slots are registered (via + ``S6ServiceManager.list_profile_gateways()``) and how many are + currently supervised as ``up`` + """ + try: + from hermes_cli.service_manager import ( + S6ServiceManager, + detect_service_manager, + ) + except Exception: + return + + if detect_service_manager() != "s6": + return + + _section("s6 Supervision") + + mgr = S6ServiceManager() + + # Static services. They live under /run/service/ via s6-rc symlinks, + # so the same s6-svstat probe works. + for static in ("main-hermes", "dashboard"): + if mgr.is_running(static): + check_ok(f"{static}: up") + else: + check_info(f"{static}: down (expected if not enabled via env)") + + profiles = mgr.list_profile_gateways() + if not profiles: + check_info("No per-profile gateways registered yet — create one with `hermes profile create `") + return + + up_count = sum(1 for p in profiles if mgr.is_running(f"gateway-{p}")) + check_ok( + f"Per-profile gateways: {up_count}/{len(profiles)} supervised up" + + (f" ({', '.join(sorted(profiles))})" if len(profiles) <= 8 else "") + ) + + def _check_gateway_service_linger(issues: list[str]) -> None: - """Warn when a systemd user gateway service will stop after logout.""" + """Warn when a systemd user gateway service will stop after logout. + + Skipped inside a container running under s6 — the linger concept + (user-systemd surviving SSH logout) doesn't apply there, and the + s6 supervision state is surfaced separately by + ``_check_s6_supervision``. + """ try: from hermes_cli.gateway import ( get_systemd_linger_status, get_systemd_unit_path, is_linux, ) + from hermes_cli.service_manager import detect_service_manager except Exception as e: check_warn("Gateway service linger", f"(could not import gateway helpers: {e})") return @@ -222,6 +277,12 @@ def _check_gateway_service_linger(issues: list[str]) -> None: if not is_linux(): return + # Inside a container under our s6 /init, _check_s6_supervision + # reports the live supervision state; the linger warning would be + # confusing here (no systemd, no logout, no "lingering" concept). + if detect_service_manager() == "s6": + return + unit_path = get_systemd_unit_path() if not unit_path.exists(): return @@ -984,6 +1045,7 @@ def run_doctor(args): pass _check_gateway_service_linger(issues) + _check_s6_supervision(issues) if sys.platform != "win32": _section("Command Installation") @@ -1076,6 +1138,26 @@ def run_doctor(args): # Docker (optional) terminal_env = os.getenv("TERMINAL_ENV", "local") + try: + from hermes_constants import is_container as _is_container + running_in_container = _is_container() + except Exception: + running_in_container = False + + if running_in_container: + # Inside our container the Docker terminal backend is not + # configured by default (Docker-in-Docker isn't set up); the + # local backend is the intended one. Skip the noisy "docker + # not found" warning. If the user has explicitly chosen + # TERMINAL_ENV=docker inside the container they likely mounted + # /var/run/docker.sock, so fall through to the normal check. + if terminal_env != "docker": + check_info( + "Running inside a container — using local terminal backend " + "(docker-in-docker is not configured by default)" + ) + # Skip to next section; Docker isn't relevant here. + terminal_env = "local" if terminal_env == "docker": if _safe_which("docker"): # Check if docker daemon is running @@ -1098,6 +1180,8 @@ def run_doctor(args): check_ok("docker", "(optional)") elif _is_termux(): check_info("Docker backend is not available inside Termux (expected on Android)") + elif running_in_container: + pass # already explained above else: check_warn("docker not found", "(optional)") diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 67d8136499d..86731957480 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -981,6 +981,18 @@ def get_gateway_runtime_snapshot(system: bool = False) -> GatewayRuntimeSnapshot from hermes_constants import is_container if is_linux() and is_container(): + # Phase 4: report s6 supervision when running under our /init. + # Other container runtimes (or containers built before Phase 2) + # still get the original "docker (foreground)" label. + try: + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() == "s6": + return GatewayRuntimeSnapshot( + manager="s6 (container supervisor)", + gateway_pids=gateway_pids, + ) + except Exception: + pass # Fall through to the legacy label on any detection error. return GatewayRuntimeSnapshot( manager="docker (foreground)", gateway_pids=gateway_pids, @@ -1202,7 +1214,17 @@ def _systemd_operational(system: bool = False) -> bool: def _container_systemd_operational() -> bool: - """Return True when a container exposes working user or system systemd.""" + """Return True when a container exposes working user or system systemd. + + This is NOT our Hermes Docker image — that one runs s6-overlay as + PID 1 (since Phase 2 of the s6-overlay supervision plan) and is + detected via ``service_manager.detect_service_manager() == "s6"``. + This function handles the "container managed by something else" + case: systemd-nspawn, certain k8s pods, containers built FROM + systemd-bearing distros where the user has wired systemd as their + init. In those environments systemctl behaves identically to the + host case, so we fall through to the normal systemd code paths. + """ if _systemd_operational(system=False): return True if _systemd_operational(system=True): @@ -5005,6 +5027,108 @@ def gateway_setup(): # Main Command Handler # ============================================================================= +def _dispatch_via_service_manager_if_s6( + action: str, profile: str | None = None, +) -> bool: + """If we're in a container with s6, dispatch gateway lifecycle via s6. + + Returns True iff dispatched (caller should ``return``); False + otherwise — caller continues with the host-side code path. + + ``action`` is one of ``start`` / ``stop`` / ``restart``. The + profile defaults to the current one (resolved via ``_profile_arg``). + The s6 service slot was created either by the Phase 4 profile-create + hook or by the container-boot reconciler (cont-init.d/02-…). If it + doesn't exist or s6 returns an error, the named errors from + :mod:`hermes_cli.service_manager` are caught and surfaced as + actionable CLI messages (no raw ``CalledProcessError`` traceback). + """ + from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6CommandError, + detect_service_manager, + get_service_manager, + ) + + if detect_service_manager() != "s6": + return False + if profile is None: + # _profile_suffix() returns the bare profile name for + # HERMES_HOME=/profiles/, "" for the default root, + # or a hash for unrelated paths. Map "" → "default" so the + # default-profile gateway is reachable as gateway-default. + profile = _profile_suffix() or "default" + mgr = get_service_manager() + service_name = f"gateway-{profile}" + try: + if action == "start": + mgr.start(service_name) + elif action == "stop": + mgr.stop(service_name) + elif action == "restart": + mgr.restart(service_name) + else: + return False + except GatewayNotRegisteredError as exc: + print(f"✗ {exc}") + sys.exit(1) + except S6CommandError as exc: + print(f"✗ {exc}") + sys.exit(1) + return True + + +def _dispatch_all_via_service_manager_if_s6(action: str) -> bool: + """Inside a container with s6, dispatch ``--all`` lifecycle to every + registered profile gateway. + + Returns True iff dispatched (caller should ``return``); False + otherwise — caller continues with the host-side code path. + + Without this, ``hermes gateway stop --all`` and ``... restart --all`` + fall through to ``kill_gateway_processes(all_profiles=True)``, which + just ``pkill``s every gateway process. s6-supervise observes the + crash and restarts each one ~1s later — so ``--all`` ends up + *kicking* every gateway instead of *stopping* it. By iterating + ``list_profile_gateways()`` and sending the lifecycle command + through the service manager we get the intended semantics (s6's + ``want up``/``want down`` flips correctly so supervise stays down + after a stop). + + ``action`` is one of ``stop`` / ``restart`` (``start --all`` isn't + a supported CLI surface). + """ + from hermes_cli.service_manager import ( + detect_service_manager, + get_service_manager, + ) + + if detect_service_manager() != "s6": + return False + if action not in ("stop", "restart"): + return False + mgr = get_service_manager() + profiles = mgr.list_profile_gateways() + if not profiles: + print("✗ No profile gateways registered under s6") + return True + fn = mgr.stop if action == "stop" else mgr.restart + errors: list[tuple[str, Exception]] = [] + for profile in profiles: + service_name = f"gateway-{profile}" + try: + fn(service_name) + except Exception as exc: # noqa: BLE001 — report and continue + errors.append((profile, exc)) + succeeded = len(profiles) - len(errors) + verb = "stopped" if action == "stop" else "restarted" + if succeeded: + print(f"✓ {verb.capitalize()} {succeeded} profile gateway(s) under s6") + for profile, exc in errors: + print(f"✗ Could not {action} gateway-{profile}: {exc}") + return True + + def gateway_command(args): """Handle gateway subcommands.""" try: @@ -5089,6 +5213,21 @@ def _gateway_command_inner(args): print(" nohup hermes gateway run > ~/.hermes/logs/gateway.log 2>&1 & # background") sys.exit(1) elif is_container(): + # Phase 4: inside a container with s6 the gateway service is + # auto-registered when the profile is created (and reconciled + # at every container boot). `install` is therefore informational. + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() == "s6": + print("Per-profile gateways are auto-registered when you create a profile.") + print() + print(" hermes profile create # creates the s6 service slot") + print(" hermes -p gateway start # bring it up via s6") + print(" hermes status # see currently-supervised gateways") + return + # Fallback for pre-s6 containers or other container runtimes + # we haven't taught about supervision (Podman without our + # /init, k8s plain runs, etc.) — the historical guidance still + # applies. print("Service installation is not needed inside a Docker container.") print("The container runtime is your service manager — use Docker restart policies instead:") print() @@ -5119,6 +5258,13 @@ def _gateway_command_inner(args): from hermes_cli import gateway_windows gateway_windows.uninstall() elif is_container(): + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() == "s6": + print("Per-profile gateways are auto-unregistered when you delete the profile.") + print() + print(" hermes profile delete # tears down the s6 service slot") + print(" hermes -p gateway stop # stop without deleting the profile") + return print("Service uninstall is not applicable inside a Docker container.") print("To stop the gateway, stop or remove the container:") print() @@ -5133,6 +5279,14 @@ def _gateway_command_inner(args): system = getattr(args, 'system', False) start_all = getattr(args, 'all', False) + # Phase 4: inside a container with s6, dispatch via the service + # manager instead of falling through to systemd/launchd/windows. + # `--all` isn't meaningful here (each profile has its own service + # slot — start them individually via `hermes -p gateway + # start`), so just bring up the current profile's slot. + if not start_all and _dispatch_via_service_manager_if_s6("start"): + return + if start_all: # Kill all stale gateway processes across all profiles before starting killed = kill_gateway_processes(all_profiles=True) @@ -5162,6 +5316,11 @@ def _gateway_command_inner(args): print("To enable systemd: add systemd=true to /etc/wsl.conf and run 'wsl --shutdown' from PowerShell.") sys.exit(1) elif is_container(): + # Reached only when s6 ISN'T running (the early dispatch + # above handles the s6 case). Pre-s6 containers or other + # container runtimes that don't ship our /init get the + # historical guidance: the gateway is the container's main + # process, so use docker lifecycle commands. print("Service start is not applicable inside a Docker container.") print("The gateway runs as the container's main process.") print() @@ -5178,6 +5337,15 @@ def _gateway_command_inner(args): stop_all = getattr(args, 'all', False) system = getattr(args, 'system', False) + # Phase 4: inside a container with s6, dispatch via the service + # manager. ``--all`` iterates every registered profile gateway + # through s6 (otherwise it would fall through to ``pkill``, + # which s6-supervise observes as a crash and immediately restarts). + if stop_all and _dispatch_all_via_service_manager_if_s6("stop"): + return + if not stop_all and _dispatch_via_service_manager_if_s6("stop"): + return + if stop_all: # --all: kill every gateway process on the machine service_available = False @@ -5247,6 +5415,16 @@ def _gateway_command_inner(args): restart_all = getattr(args, 'all', False) service_configured = False + # Phase 4: inside a container with s6, dispatch via the service + # manager (s6-svc -t restarts the supervised process). ``--all`` + # iterates every registered profile gateway through s6; without + # this it would fall through to ``pkill``, which s6-supervise + # would observe as a crash and immediately restart anyway. + if restart_all and _dispatch_all_via_service_manager_if_s6("restart"): + return + if not restart_all and _dispatch_via_service_manager_if_s6("restart"): + return + if restart_all: # --all: stop every gateway process across all profiles, then start fresh service_stopped = False diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index aa33d9182b8..c4cb373bddc 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -777,6 +777,14 @@ def create_profile( except Exception: pass # non-fatal — user can describe later with `hermes profile describe` + # Phase 4: when running inside a container under s6, register the + # new profile's gateway as a runtime s6 service so + # `hermes -p gateway start` can supervise it via + # `s6-svc -u` instead of spawning a bare process. On host (systemd + # / launchd / windows) this is a no-op — the existing per-profile + # unit-generation paths handle gateway lifecycle. + _maybe_register_gateway_service(canon) + return profile_dir @@ -893,6 +901,10 @@ def delete_profile(name: str, yes: bool = False) -> Path: # 1. Disable service (prevents auto-restart) _cleanup_gateway_service(canon, profile_dir) + # 1b. Phase 4: unregister the s6 service slot (container path). + # On host this is a no-op; on container it removes + # /run/service/gateway-/ so s6-supervise drops it. + _maybe_unregister_gateway_service(canon) # 2. Stop running gateway if gw_running: @@ -965,6 +977,87 @@ def delete_profile(name: str, yes: bool = False) -> Path: return profile_dir +def _maybe_register_gateway_service(profile_name: str) -> None: + """Register a profile's gateway with s6 inside the container. + + No-op on host (systemd/launchd/windows) — those backends raise + ``NotImplementedError`` on ``register_profile_gateway`` and the + existing per-profile unit-generation paths handle lifecycle. + + Best-effort: any error (no backend detected, s6 not yet ready, + etc.) is logged and swallowed so profile creation doesn't fail + because the s6 supervision tree is in a weird state. The user + can re-register manually later via the gateway start command, + which goes through the same dispatch path. + + Port selection is governed by the profile's ``config.yaml`` + (``[gateway] port = …``) — there is no Python-side allocator + (PR #30136 review item I5 retired the SHA-256-derived range + [9200, 9800) because it was dead code through the entire stack). + + Host short-circuit: check ``detect_service_manager()`` first and + return immediately if it isn't ``"s6"``. This keeps host + (systemd/launchd/windows) profile creation completely silent — + no ``get_service_manager()`` call, no exception path, no chance + of the ``⚠ Could not register s6 gateway service`` warning ever + rendering on a non-container machine. The earlier + ``supports_runtime_registration()`` check still catches the case + where detection somehow returns ``"s6"`` but the backend isn't + actually the S6 one. + """ + try: + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() != "s6": + return # host path — silent, no registration needed + from hermes_cli.service_manager import get_service_manager + mgr = get_service_manager() + except RuntimeError: + return # no backend on this host — nothing to do + except Exception: + # Defensive: detect_service_manager failed for some other + # reason. Stay silent on host rather than printing a confusing + # s6 warning to users who have never touched the container. + return + if not mgr.supports_runtime_registration(): + return # host backend; no-op + try: + mgr.register_profile_gateway(profile_name) + except ValueError: + # Already registered (e.g. the container-boot reconciler ran + # first and brought up a stale slot). That's fine. + pass + except Exception as exc: + # Don't fail profile create over a supervision-tree hiccup. + print(f"⚠ Could not register s6 gateway service: {exc}") + + +def _maybe_unregister_gateway_service(profile_name: str) -> None: + """Tear down a profile's s6 gateway service inside the container. + + No-op on host. Idempotent: absent services are silently skipped + by ``unregister_profile_gateway``. + + Same host short-circuit as :func:`_maybe_register_gateway_service` + — see that docstring. + """ + try: + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() != "s6": + return # host path — silent + from hermes_cli.service_manager import get_service_manager + mgr = get_service_manager() + except RuntimeError: + return + except Exception: + return + if not mgr.supports_runtime_registration(): + return + try: + mgr.unregister_profile_gateway(profile_name) + except Exception as exc: + print(f"⚠ Could not unregister s6 gateway service: {exc}") + + def _cleanup_gateway_service(name: str, profile_dir: Path) -> None: """Disable and remove systemd/launchd service for a profile.""" import platform as _platform diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py new file mode 100644 index 00000000000..417ec4ec982 --- /dev/null +++ b/hermes_cli/service_manager.py @@ -0,0 +1,886 @@ +"""Abstract service manager interface. + +Wraps the existing systemd (Linux host), launchd (macOS host), Windows +Scheduled Task (native Windows host), and s6 (container) backends behind +a common Protocol. Only the s6 backend supports runtime registration +(for per-profile gateways) — host backends raise NotImplementedError +from those methods, and callers MUST check supports_runtime_registration() +before invoking them. + +Host-side call sites (setup wizard, uninstall, status) continue to use +the existing module-level functions in hermes_cli.gateway and +hermes_cli.gateway_windows directly. This protocol is a thin facade +used by new code that needs to be backend-agnostic — specifically the +profile create/delete hooks (Phase 4) and the s6 dispatch path in +``hermes gateway start/stop/restart`` when running inside a container. +""" +from __future__ import annotations + +import re +from pathlib import Path +from typing import Literal, Protocol, runtime_checkable + +ServiceManagerKind = Literal["systemd", "launchd", "windows", "s6", "none"] + +# Profile name → service directory mapping. Profile names must be safe +# as filesystem directory names because the s6 backend creates a service +# directory at ``/gateway-/``. We reject anything that +# could traverse paths, span filesystems, or break s6's own naming rules. +_VALID_PROFILE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$") +_MAX_PROFILE_LEN = 251 # s6-svscan default name_max + + +def validate_profile_name(name: str) -> None: + """Raise ValueError if ``name`` is not usable as a profile name. + + Profile names are used as s6 service directory names, so they must + match a conservative subset of filesystem-safe characters. Reject + empty strings, uppercase, paths-traversal sequences, and anything + longer than s6's default ``name_max``. + """ + if not name: + raise ValueError("profile name must not be empty") + if len(name) > _MAX_PROFILE_LEN: + raise ValueError( + f"profile name too long ({len(name)} > {_MAX_PROFILE_LEN})" + ) + if not _VALID_PROFILE_RE.match(name): + raise ValueError( + f"profile name must match [a-z0-9][a-z0-9_-]*, got {name!r}" + ) + + +@runtime_checkable +class ServiceManager(Protocol): + """Abstract interface for init-system-specific service operations. + + Lifecycle methods (start / stop / restart / is_running) are + implemented by every backend. Runtime registration + (register_profile_gateway / unregister_profile_gateway / + list_profile_gateways) is implemented only by the s6 backend — + callers MUST check ``supports_runtime_registration()`` before + invoking the registration methods. + """ + + kind: ServiceManagerKind + + # Lifecycle of a pre-declared service. + def start(self, name: str) -> None: ... + def stop(self, name: str) -> None: ... + def restart(self, name: str) -> None: ... + def is_running(self, name: str) -> bool: ... + + # Runtime registration (s6 only). + def supports_runtime_registration(self) -> bool: ... + def register_profile_gateway( + self, + profile: str, + *, + extra_env: dict[str, str] | None = None, + ) -> None: ... + def unregister_profile_gateway(self, profile: str) -> None: ... + def list_profile_gateways(self) -> list[str]: ... + + +def detect_service_manager() -> ServiceManagerKind: + """Detect which service manager is available in this environment. + + Returns: + "s6" — inside a container when /init is s6-svscan (Phase 2+) + "windows" — native Windows host + "launchd" — macOS host + "systemd" — Linux host with a working user/system bus + "none" — anything else (Termux, sandbox shells, etc.) + + This function does NOT replace ``supports_systemd_services()`` — + host call sites continue to use that. It exists for new backend- + agnostic code (profile create/delete hooks, the s6 dispatch path + in ``hermes gateway start/stop/restart``). + """ + # Imports deferred so importing this module doesn't drag in the + # whole gateway dependency graph for callers that only need the + # Protocol type or validate_profile_name(). + from hermes_constants import is_container + from hermes_cli.gateway import ( + is_macos, + is_windows, + supports_systemd_services, + ) + + if is_container() and _s6_running(): + return "s6" + if is_windows(): + return "windows" + if is_macos(): + return "launchd" + if supports_systemd_services(): + return "systemd" + return "none" + + +def _s6_running() -> bool: + """True when s6-svscan is running as PID 1 in this container. + + Detection has to work for **both** root and the unprivileged hermes + user (UID 10000). The obvious probe — ``Path('/proc/1/exe').resolve()`` + — only works as root: for any other UID, the symlink at + ``/proc/1/exe`` is unreadable and ``resolve()`` silently returns the + path unchanged, so the resolved name is the literal ``"exe"`` and + detection always fails. Since every Hermes runtime call inside the + container drops to hermes via ``s6-setuidgid``, that silent failure + made the entire service-manager runtime-registration path inert in + production (PR #30136 review). + + Probe instead via: + * ``/proc/1/comm`` — world-readable, contains the process comm + (``s6-svscan`` when s6-overlay is PID 1). + * ``/run/s6/basedir`` — s6-overlay-specific directory created by + stage1. World-readable. More specific than ``/run/s6`` (which + other tools occasionally create). + + Both signals are required; either alone could false-positive + (e.g. a container with the s6 binaries installed but a different + init, or an unrelated process named ``s6-svscan``). + """ + try: + comm = Path("/proc/1/comm").read_text(encoding="utf-8").strip() + except OSError: + return False + if comm != "s6-svscan": + return False + return Path("/run/s6/basedir").is_dir() + + +# --------------------------------------------------------------------------- +# Backend wrappers +# +# These adapters are thin facades over the existing module-level functions +# in ``hermes_cli.gateway`` (systemd/launchd) and ``hermes_cli.gateway_windows`` +# (Windows Scheduled Tasks). The protocol's ``name`` parameter is currently +# unused for host backends — they operate on whichever profile is currently +# active (set via the ``hermes -p `` flag before the call). This +# matches existing host-side semantics; the parameter shape is designed +# for s6 where each profile maps to a distinct service directory. +# --------------------------------------------------------------------------- + + +class _RegistrationUnsupportedMixin: + """Mixin for host backends that don't support runtime registration.""" + + def supports_runtime_registration(self) -> bool: + return False + + def register_profile_gateway( + self, + profile: str, + *, + extra_env: dict[str, str] | None = None, + ) -> None: + raise NotImplementedError( + f"{type(self).__name__} does not support runtime profile " + "gateway registration (container-only feature)" + ) + + def unregister_profile_gateway(self, profile: str) -> None: + raise NotImplementedError( + f"{type(self).__name__} does not support runtime profile " + "gateway unregistration (container-only feature)" + ) + + def list_profile_gateways(self) -> list[str]: + return [] + + +class SystemdServiceManager(_RegistrationUnsupportedMixin): + """Thin wrapper around the ``systemd_*`` functions in hermes_cli.gateway. + + Existing host call sites continue to use those functions directly; + this wrapper exists for new code that needs to be backend-agnostic + (the Phase 4 profile create/delete hooks). + """ + + kind: ServiceManagerKind = "systemd" + + def start(self, name: str) -> None: + from hermes_cli.gateway import systemd_start + systemd_start() + + def stop(self, name: str) -> None: + from hermes_cli.gateway import systemd_stop + systemd_stop() + + def restart(self, name: str) -> None: + from hermes_cli.gateway import systemd_restart + systemd_restart() + + def is_running(self, name: str) -> bool: + from hermes_cli.gateway import _probe_systemd_service_running + _, running = _probe_systemd_service_running() + return running + + +class LaunchdServiceManager(_RegistrationUnsupportedMixin): + """Thin wrapper around the ``launchd_*`` functions in hermes_cli.gateway.""" + + kind: ServiceManagerKind = "launchd" + + def start(self, name: str) -> None: + from hermes_cli.gateway import launchd_start + launchd_start() + + def stop(self, name: str) -> None: + from hermes_cli.gateway import launchd_stop + launchd_stop() + + def restart(self, name: str) -> None: + from hermes_cli.gateway import launchd_restart + launchd_restart() + + def is_running(self, name: str) -> bool: + from hermes_cli.gateway import _probe_launchd_service_running + return _probe_launchd_service_running() + + +class WindowsServiceManager(_RegistrationUnsupportedMixin): + """Thin wrapper around ``hermes_cli.gateway_windows`` (Scheduled Task / + Startup-folder fallback). + + The native Windows backend uses a Scheduled Task rather than a true + init-system service, but for protocol purposes the lifecycle is the + same: start / stop / restart / is_running. ``install`` accepts a + handful of Windows-specific kwargs (start_now, start_on_login, + elevated_handoff) that are passed straight through — non-Windows + callers should never invoke ``install`` on this wrapper. + """ + + kind: ServiceManagerKind = "windows" + + def install( + self, + *, + force: bool = False, + start_now: bool | None = None, + start_on_login: bool | None = None, + elevated_handoff: bool = False, + ) -> None: + from hermes_cli import gateway_windows + gateway_windows.install( + force=force, + start_now=start_now, + start_on_login=start_on_login, + elevated_handoff=elevated_handoff, + ) + + def start(self, name: str) -> None: + from hermes_cli import gateway_windows + gateway_windows.start() + + def stop(self, name: str) -> None: + from hermes_cli import gateway_windows + gateway_windows.stop() + + def restart(self, name: str) -> None: + from hermes_cli import gateway_windows + gateway_windows.restart() + + def is_running(self, name: str) -> bool: + from hermes_cli import gateway_windows + from hermes_cli.gateway import find_gateway_pids + if not gateway_windows.is_installed(): + return False + return bool(find_gateway_pids()) + + +def get_service_manager() -> ServiceManager: + """Return the ServiceManager instance for the current environment. + + Raises: + RuntimeError: when no supported backend is available. + """ + kind = detect_service_manager() + if kind == "systemd": + return SystemdServiceManager() + if kind == "launchd": + return LaunchdServiceManager() + if kind == "windows": + return WindowsServiceManager() + if kind == "s6": + return S6ServiceManager() + raise RuntimeError("no supported service manager detected") + + +# --------------------------------------------------------------------------- +# S6ServiceManager (container-only) +# +# Per-profile gateways are registered dynamically when `hermes profile create` +# runs inside the container (Phase 4). Static services (main-hermes, dashboard) +# live in /etc/s6-overlay/s6-rc.d/ and are NOT managed by this class — they're +# part of the image, not runtime-created. +# --------------------------------------------------------------------------- + + +# s6-overlay's dynamic scandir for runtime-registered services. Lives on +# tmpfs and is the directory s6-svscan watches. Writes here trigger +# automatic supervision on the next rescan. +S6_DYNAMIC_SCANDIR = Path("/run/service") +S6_SERVICE_PREFIX = "gateway-" + +# s6-overlay installs its binaries under /command/ and only adds that +# directory to PATH for processes started under the supervision tree +# (services started by s6-svscan, cont-init.d scripts, etc.). Code +# that runs via `docker exec` or any other out-of-tree entry point — +# notably our Phase 4 profile create/delete hooks — inherits the +# container's base PATH which does NOT include /command/. +# +# Rather than asking every caller to fix up its environment, the +# S6ServiceManager calls s6-* binaries by absolute path via this +# constant. We don't use `/usr/bin/s6-…` symlinks because the +# s6-overlay-symlinks-noarch tarball only links a subset, and we +# want every s6 invocation to be guaranteed-findable. +_S6_BIN_DIR = "/command" + + +# UID/GID of the in-image ``hermes`` user. Hardcoded to match what +# ``stage2-hook.sh`` enforces (the runtime invariant — see also +# tests/docker/test_uid_remap.py). The container starts s6-supervise +# under root and immediately drops to this UID via ``s6-setuidgid``. +_HERMES_UID = 10000 +_HERMES_GID = 10000 + + +def _seed_supervise_skeleton(svc_dir: Path) -> None: + """Pre-create the ``supervise/`` and top-level ``event/`` skeleton + inside a service directory, owned by the hermes user. + + Why this exists + --------------- + When s6-supervise spawns a service it tries to ``mkdir`` two + directories: ``/event`` and ``/supervise``, both with mode + ``0700``. It also ``mkfifo``s ``/supervise/control`` with mode + ``0600``. Because s6-supervise runs as PID 1's effective UID (root) + these dirs end up root-owned mode 0700, and an unprivileged client + (the ``hermes`` user — UID 10000 — running every Hermes runtime + operation via ``s6-setuidgid``) gets ``EACCES`` on any ``s6-svc``, + ``s6-svstat``, or ``s6-svwait`` invocation against the slot. + + The PR #30136 review surfaced this as a real product gap: the + entire S6ServiceManager lifecycle (``register/start/stop/unregister + _profile_gateway``) was inert in production because every operation + is dispatched as the hermes user. + + Why this works + -------------- + Reading s6's source (src/supervision/s6-supervise.c::trymkdir + + control_init): the ``mkdir`` and ``mkfifo`` calls both treat + ``EEXIST`` as success. If the directory is already present, the + chown/chmod fix-up that would normally make event/ ``03730 + root:root`` is **skipped** entirely — s6-supervise just opens the + pre-existing FIFOs and proceeds. So if we lay the skeleton down + with hermes ownership before triggering ``s6-svscanctl -a``, + s6-supervise inherits our layout and never touches it. + + Layout produced + --------------- + ``svc_dir/`` hermes:hermes, 0755 (parent must already exist) + ``svc_dir/event/`` hermes:hermes, 03730 (setgid + g+rwx + sticky) + ``svc_dir/supervise/`` hermes:hermes, 0755 + ``svc_dir/supervise/event/`` hermes:hermes, 03730 + ``svc_dir/supervise/control`` hermes:hermes, 0660 (FIFO) + + The ``death_tally``, ``lock``, and ``status`` regular files end up + written by s6-supervise itself (as root), but those land mode 0644 — + world-readable — and ``s6-svstat`` only needs read access, so the + hermes user reads them fine. + + If ``svc_dir/log/`` is present (the canonical s6 logger pattern — + one s6-supervise instance per service, plus a second for its + logger), the same skeleton is seeded under ``log/`` as well: + ``log/event/``, ``log/supervise/``, ``log/supervise/event/``, + ``log/supervise/control``. Without this, unregister teardown + would EACCES on the logger's supervise dir even after the parent + slot's supervise/ was hermes-owned. + + Idempotency + ----------- + Safe to call against a directory where the skeleton already exists. + Existing entries are left untouched (the helper doesn't try to + re-chown / re-chmod live FIFOs that s6-supervise may have already + opened). + + Reference + --------- + Discussed at length on the skarnet `skaware` mailing list in 2020 + (``_); see also + just-containers/s6-overlay#130. The pre-creation pattern was + historically called out as forward-compatibility-fragile, but the + EEXIST handling in s6-supervise has been stable since 2015 — it's + the same pattern ``s6-svperms`` and ``fix-attrs.d`` rely on. + """ + import os + + def _mkdir_owned(path: Path, mode: int) -> None: + if path.exists(): + return + path.mkdir(parents=False, exist_ok=False) + path.chmod(mode) + try: + os.chown(path, _HERMES_UID, _HERMES_GID) + except PermissionError: + # Running as the hermes user already — directory is hermes- + # owned by default. The chown is a no-op in that case, so + # swallowing this keeps both root and unprivileged callers + # on one code path. + pass + + # Top-level event/ dir (this is the s6-svlisten1 event-subscription + # dir at the service root, distinct from supervise/event/). + _mkdir_owned(svc_dir / "event", 0o3730) + + # supervise/ dir + its inner event/ dir. + supervise = svc_dir / "supervise" + _mkdir_owned(supervise, 0o755) + _mkdir_owned(supervise / "event", 0o3730) + + # supervise/control FIFO. Same EEXIST-safe pattern: if it's already + # there (s6-supervise has already started against this slot), leave + # it alone. The explicit chmod after mkfifo is required because + # mkfifo honors the process umask, which can strip group-write + # (e.g. the default 0022 on most dev hosts → 0o660 becomes 0o640). + # The container runs with umask 0 inside s6-overlay's stage2, but + # being defensive here keeps the helper consistent under any + # invocation context. + control = supervise / "control" + if not control.exists(): + os.mkfifo(control, 0o660) + control.chmod(0o660) + try: + os.chown(control, _HERMES_UID, _HERMES_GID) + except PermissionError: + pass + + # If a log/ subdir is present (the canonical s6 logger pattern — + # see servicedir(7)), it gets its own s6-supervise instance and + # needs the same skeleton. Without this, unregister teardown + # would EACCES on the logger's root-owned supervise/ dir even + # when the parent slot's supervise/ is hermes-owned. + log_dir = svc_dir / "log" + if log_dir.is_dir(): + _mkdir_owned(log_dir / "event", 0o3730) + log_supervise = log_dir / "supervise" + _mkdir_owned(log_supervise, 0o755) + _mkdir_owned(log_supervise / "event", 0o3730) + log_control = log_supervise / "control" + if not log_control.exists(): + os.mkfifo(log_control, 0o660) + log_control.chmod(0o660) + try: + os.chown(log_control, _HERMES_UID, _HERMES_GID) + except PermissionError: + pass + + +class S6Error(RuntimeError): + """Base error for S6ServiceManager lifecycle failures. + + Concrete subclasses carry the slot name (and, where useful, the + underlying subprocess output) so the CLI can render an actionable + message instead of leaking a raw ``CalledProcessError`` traceback. + """ + + def __init__(self, message: str, *, service: str | None = None) -> None: + super().__init__(message) + self.service = service + + +class GatewayNotRegisteredError(S6Error): + """Raised when a lifecycle method targets a slot that doesn't exist. + + Most commonly: ``hermes -p typo gateway start`` when no profile + ``typo`` exists. Carries the unprefixed profile name (not the + full ``gateway-`` service-dir name) so callers can phrase + a user-facing message like "no such gateway 'typo'". + """ + + def __init__(self, profile: str) -> None: + self.profile = profile + super().__init__( + f"no such gateway {profile!r}: register it with " + f"`hermes profile create {profile}` first, or pass " + "an existing profile name via `-p `", + service=f"gateway-{profile}", + ) + + +class S6CommandError(S6Error): + """Raised when an s6 command fails for a reason other than a + missing slot — e.g. permission denied on the supervise control + FIFO, or s6-svc returning a non-zero exit for an unexpected + reason. Carries the stderr from the failing command so callers + can surface it. + """ + + def __init__( + self, *, service: str, action: str, returncode: int, stderr: str, + ) -> None: + self.action = action + self.returncode = returncode + self.stderr = stderr + message = ( + f"s6-svc {action} on {service!r} failed (rc={returncode})" + ) + if stderr.strip(): + message += f": {stderr.strip()}" + super().__init__(message, service=service) + + +class S6ServiceManager: + """Per-profile gateway supervision via s6-overlay. + + Only handles runtime-registered services under + ``S6_DYNAMIC_SCANDIR``. Static services (main-hermes, dashboard) + are managed by s6-rc at image-build time and are out of scope. + """ + + kind: ServiceManagerKind = "s6" + + def __init__(self, scandir: Path = S6_DYNAMIC_SCANDIR) -> None: + self.scandir = scandir + + # -- internal helpers -------------------------------------------------- + + def _service_dir(self, profile: str) -> Path: + validate_profile_name(profile) + return self.scandir / f"{S6_SERVICE_PREFIX}{profile}" + + def _service_name(self, profile: str) -> str: + return f"{S6_SERVICE_PREFIX}{profile}" + + @staticmethod + def _render_run_script( + profile: str, + extra_env: dict[str, str], + ) -> str: + """Generate the run script for a profile-gateway s6 service. + + The script: + 1. Sources HERMES_HOME (and any extra env) via with-contenv — + so e.g. ``-e HERMES_HOME=/data/hermes`` is honored at run + time, not Python-substituted at registration time (OQ8-C). + 2. Activates the bundled venv. + 3. Drops to the hermes user and exec's + ``hermes -p gateway run`` (or just ``hermes + gateway run`` for the default profile — see below). + + Special case: ``profile == "default"`` emits ``hermes gateway + run`` with **no** ``-p`` flag. This is the sentinel for "the + root HERMES_HOME profile" (the implicit profile that exists at + the top of $HERMES_HOME, not under profiles/). It must be + spelled this way because ``_profile_suffix()`` returns the + empty string for the root profile, and the dispatcher in + ``hermes_cli.gateway`` maps that empty string to the + ``gateway-default`` service slot. Passing ``-p default`` here + would instead look up ``$HERMES_HOME/profiles/default/`` — a + completely different (and almost always nonexistent) profile. + + Port selection: the gateway picks its bind port from the + profile's ``config.yaml`` (``[gateway] port = ...``) — that + is the single source of truth. Previously this method took a + ``port`` parameter that was passed in but never substituted + into the rendered script (it was carried in for "API parity" + with a deterministic SHA-256 allocator in + ``hermes_cli.profiles._allocate_gateway_port``). PR #30136 + review item I5 retired both the allocator and the parameter + because they were dead code through the entire stack. + """ + import shlex + lines = [ + "#!/command/with-contenv sh", + "# shellcheck shell=sh", + "set -e", + "cd /opt/data", + ". /opt/hermes/.venv/bin/activate", + ] + for k, v in sorted(extra_env.items()): + lines.append(f"export {k}={shlex.quote(v)}") + if profile == "default": + lines.append("exec s6-setuidgid hermes hermes gateway run") + else: + lines.append( + f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run" + ) + return "\n".join(lines) + "\n" + + @staticmethod + def _render_log_run(profile: str) -> str: + """Generate the log/run script for a profile-gateway service. + + OQ8-C: persist to ``${HERMES_HOME}/logs/gateways//``. + CRITICAL: the HERMES_HOME path is sourced from the runtime env + via with-contenv — NOT Python-substituted at registration time + — so a container started with ``-e HERMES_HOME=/data/hermes`` + gets its logs under /data/hermes/logs/..., not the build-time + default. + """ + import shlex + prof = shlex.quote(profile) + return ( + f"#!/command/with-contenv sh\n" + f"# shellcheck shell=sh\n" + f': "${{HERMES_HOME:=/opt/data}}"\n' + f'log_dir="$HERMES_HOME/logs/gateways/{prof}"\n' + f'mkdir -p "$log_dir"\n' + f'chown -R hermes:hermes "$log_dir" 2>/dev/null || true\n' + f'exec s6-setuidgid hermes s6-log n10 s1000000 T "$log_dir"\n' + ) + + # -- lifecycle --------------------------------------------------------- + + def _run_svc(self, action_flag: str, action_label: str, name: str) -> None: + """Shared lifecycle dispatch for start / stop / restart. + + Translates the two failure modes operators care about into + named errors: + + * ``GatewayNotRegisteredError`` — the service directory at + ``//`` doesn't exist. ``s6-svc`` would + exit non-zero with a fairly opaque message; we pre-empt + it with a clear "no such gateway 'X'" tied to the profile + name (without the ``gateway-`` prefix). + * ``S6CommandError`` — anything else (EACCES on the + supervise control FIFO, timeout, etc.). Carries the + subprocess return code and stderr so callers can render + them inline. + + ``action_flag`` is the ``s6-svc`` flag (``-u`` / ``-d`` / + ``-t``); ``action_label`` is the human verb (``start`` / + ``stop`` / ``restart``) used in error messages. + """ + import subprocess + + service_dir = self.scandir / name + if not service_dir.is_dir(): + # Strip the gateway- prefix back off so the message + # matches what the user typed on the CLI (``-p ``). + profile = ( + name[len(S6_SERVICE_PREFIX):] + if name.startswith(S6_SERVICE_PREFIX) + else name + ) + raise GatewayNotRegisteredError(profile) + + try: + subprocess.run( + [f"{_S6_BIN_DIR}/s6-svc", action_flag, str(service_dir)], + check=True, capture_output=True, text=True, timeout=5, + ) + except subprocess.CalledProcessError as exc: + raise S6CommandError( + service=name, + action=action_label, + returncode=exc.returncode, + stderr=exc.stderr or "", + ) from exc + + def start(self, name: str) -> None: + """Bring up a registered service (``s6-svc -u``). + + Raises: + GatewayNotRegisteredError: no service directory for ``name``. + S6CommandError: s6-svc exited non-zero for any other reason + (permission denied on the supervise FIFO, timeout, etc.). + """ + self._run_svc("-u", "start", name) + + def stop(self, name: str) -> None: + """Bring down a registered service (``s6-svc -d``). + + Raises: + GatewayNotRegisteredError: no service directory for ``name``. + S6CommandError: s6-svc exited non-zero for any other reason. + """ + self._run_svc("-d", "stop", name) + + def restart(self, name: str) -> None: + """Restart a registered service (``s6-svc -t`` = SIGTERM). + + Raises: + GatewayNotRegisteredError: no service directory for ``name``. + S6CommandError: s6-svc exited non-zero for any other reason. + """ + self._run_svc("-t", "restart", name) + + def is_running(self, name: str) -> bool: + """True iff ``s6-svstat`` reports the service as up.""" + import subprocess + result = subprocess.run( + [f"{_S6_BIN_DIR}/s6-svstat", str(self.scandir / name)], + capture_output=True, text=True, timeout=5, + ) + return result.returncode == 0 and "up " in result.stdout + + # -- runtime registration --------------------------------------------- + + def supports_runtime_registration(self) -> bool: + return True + + def register_profile_gateway( + self, + profile: str, + *, + extra_env: dict[str, str] | None = None, + ) -> None: + """Create the s6 service directory for a profile gateway. + + Triggers ``s6-svscanctl -a`` so s6-svscan picks the new directory + up immediately. The service is created in the *up* state — to + register without auto-starting, follow up with ``stop(profile)`` + (or pass the start flag via the future ``start_now=False`` arg, + which the Phase 4 reconciliation path uses via a ``down`` + marker file written directly). + + Raises: + ValueError: if the profile name is invalid or the service + directory already exists. + RuntimeError: if ``s6-svscanctl`` fails. + """ + import shutil + import subprocess + + svc_dir = self._service_dir(profile) + if svc_dir.exists(): + raise ValueError( + f"profile gateway {profile!r} already registered at {svc_dir}" + ) + + # Build the service directory atomically: write to a sibling + # temp dir, then rename. Avoids s6-svscan observing a half- + # populated directory on a fast rescan. + tmp_dir = svc_dir.with_name(svc_dir.name + ".tmp") + if tmp_dir.exists(): + shutil.rmtree(tmp_dir, ignore_errors=True) + tmp_dir.mkdir(parents=True) + + try: + (tmp_dir / "type").write_text("longrun\n") + + run_script = self._render_run_script(profile, extra_env or {}) + run_path = tmp_dir / "run" + run_path.write_text(run_script) + run_path.chmod(0o755) + + # Persistent log rotation (OQ8-C). + log_subdir = tmp_dir / "log" + log_subdir.mkdir() + log_run = log_subdir / "run" + log_run.write_text(self._render_log_run(profile)) + log_run.chmod(0o755) + + # Pre-create the supervise/ skeleton with hermes ownership + # BEFORE we publish the slot. s6-supervise will EEXIST our + # dirs/FIFOs and inherit the ownership, so the runtime + # s6-svc / s6-svstat / s6-svwait calls (all dispatched as + # the hermes user) won't hit EACCES on root-owned 0700 + # dirs. See ``_seed_supervise_skeleton`` for the full + # rationale. + _seed_supervise_skeleton(tmp_dir) + + tmp_dir.rename(svc_dir) + except Exception: + shutil.rmtree(tmp_dir, ignore_errors=True) + raise + + # Trigger rescan so s6-svscan picks up the new service. + result = subprocess.run( + [f"{_S6_BIN_DIR}/s6-svscanctl", "-a", str(self.scandir)], + capture_output=True, text=True, timeout=5, + ) + if result.returncode != 0: + # Clean up: rescan failed, leave the directory in place would + # be confusing (no supervisor watching it). + shutil.rmtree(svc_dir, ignore_errors=True) + raise RuntimeError( + f"s6-svscanctl failed: {result.stderr or result.stdout}" + ) + + def unregister_profile_gateway(self, profile: str) -> None: + """Stop the profile gateway service and remove its directory. + + Idempotent: absent services are a no-op. Best-effort stop + + wait-for-down before removal so the running gateway process + gets a chance to shut down cleanly before its service dir + disappears. + + Teardown ordering matters: ``s6-svscanctl -an`` is fired + **before** ``rmtree`` so s6-svscan reaps the supervise child + process (releasing its handle on ``supervise/lock`` and the + regular files inside the supervise dir), giving us a clean + directory to remove. Without the reap-first ordering, the + rmtree races s6-supervise on a set of root-owned files inside + the supervise dir and the dir is left half-removed. + """ + import shutil + import subprocess + import time + + svc_dir = self._service_dir(profile) + if not svc_dir.exists(): + return + + # Stop the service (best effort — service may already be down). + subprocess.run( + [f"{_S6_BIN_DIR}/s6-svc", "-d", str(svc_dir)], + capture_output=True, text=True, timeout=5, + check=False, + ) + # Wait for it to actually go down (up to 10s). + subprocess.run( + [f"{_S6_BIN_DIR}/s6-svwait", "-D", "-t", "10000", str(svc_dir)], + capture_output=True, text=True, timeout=15, + check=False, + ) + + # Reap the supervise child FIRST: -n tells s6-svscan to drop + # any supervise processes whose service dir is gone (which + # includes any service dir we're about to remove). This + # releases the file handles s6-supervise holds against the + # supervise/lock + supervise/status + supervise/death_tally + # files inside the slot, so the upcoming rmtree doesn't race. + subprocess.run( + [f"{_S6_BIN_DIR}/s6-svscanctl", "-an", str(self.scandir)], + capture_output=True, text=True, timeout=5, + check=False, + ) + # Give s6-svscan a moment to reap. There's no synchronous + # "scan completed" handshake — the -a/-n trigger just sets a + # flag s6-svscan reads on its next loop iteration. 200ms is + # comfortably above the loop's resolution but well under any + # user-perceived latency. + time.sleep(0.2) + + # Now the supervise dir's files are no longer held open by a + # live s6-supervise, so rmtree can remove them. Files inside + # supervise/ are root-owned (death_tally, lock, status, written + # by s6-supervise itself) — but the parent supervise/ directory + # is hermes-owned (see ``_seed_supervise_skeleton``), and on + # POSIX you only need write+execute on the parent to remove + # contained files regardless of file ownership. + shutil.rmtree(svc_dir, ignore_errors=True) + + def list_profile_gateways(self) -> list[str]: + """Return the profile names of all currently-registered gateway services. + + Filters the scandir to entries that match the ``gateway-`` prefix. + Other services (e.g. ``s6-linux-init-shutdownd``) are ignored. + """ + if not self.scandir.exists(): + return [] + profiles: list[str] = [] + for entry in self.scandir.iterdir(): + if entry.name.startswith("."): + continue + if not entry.is_dir(): + continue + if not entry.name.startswith(S6_SERVICE_PREFIX): + continue + profiles.append(entry.name[len(S6_SERVICE_PREFIX):]) + return profiles diff --git a/scripts/run_tests_parallel.py b/scripts/run_tests_parallel.py index 57178899012..7fe0b57947a 100755 --- a/scripts/run_tests_parallel.py +++ b/scripts/run_tests_parallel.py @@ -52,10 +52,24 @@ from typing import Dict, List, Tuple # Default test discovery roots. _DEFAULT_ROOTS = ["tests"] -# Directories to skip during discovery — the e2e + integration suites -# require real services and are run separately. Match exactly the -# ``--ignore=`` flags the previous CI command used. -_SKIP_PARTS = {"integration", "e2e"} +# Directories to skip during discovery — these suites require real +# external services (a model gateway, a docker daemon with a prebuilt +# image, etc.) and are run in their own dedicated CI jobs: +# +# tests/e2e/ — .github/workflows/tests.yml :: e2e job +# tests/integration/ — historical; legacy --ignore flags +# tests/docker/ — .github/workflows/docker-publish.yml :: +# build-amd64 job (runs against the freshly-loaded +# nousresearch/hermes-agent:test image, via +# ``HERMES_TEST_IMAGE`` so the fixture skips +# rebuild). The full pytest-shard runner can't +# host these because the session-scoped +# ``built_image`` fixture would do a 3-7min +# ``docker build`` inside a 180s per-test +# pytest-timeout cap (set by tests/docker/conftest.py), +# so the build is guaranteed to die in fixture +# setup. The dedicated job sidesteps both costs. +_SKIP_PARTS = {"integration", "e2e", "docker"} # Per-file wall-clock cap. Generous default — pytest-timeout still # enforces per-test caps inside each subprocess; this is just an outer @@ -136,7 +150,10 @@ def _discover_files(roots: List[Path]) -> List[Path]: Exclude any file whose path contains a component in ``_SKIP_PARTS``, UNLESS the user explicitly named it as a root (in which case the - user's intent overrides the skip filter). + user's intent overrides the skip filter). This makes + ``scripts/run_tests.sh tests/docker/`` work locally the same way + ``pytest tests/docker/`` does — the CI-level skip exists to keep + the sharded matrix from blowing up, not to block targeted runs. """ seen: set[Path] = set() out: List[Path] = [] @@ -151,8 +168,17 @@ def _discover_files(roots: List[Path]) -> List[Path]: seen.add(real) out.append(root) continue + # If the explicit root itself sits inside a skipped dir (e.g. + # the user said ``tests/docker``), the user has overridden the + # skip for that subtree. Compute the set of skip-parts the user + # opted into, and only filter files whose path crosses a + # skip-part *outside* that opt-in. + root_skip_overrides = { + part for part in root.parts if part in _SKIP_PARTS + } + effective_skips = _SKIP_PARTS - root_skip_overrides for path in root.rglob("test_*.py"): - if any(part in _SKIP_PARTS for part in path.parts): + if any(part in effective_skips for part in path.parts): continue real = path.resolve() if real in seen: diff --git a/skills/software-development/hermes-s6-container-supervision/SKILL.md b/skills/software-development/hermes-s6-container-supervision/SKILL.md new file mode 100644 index 00000000000..934b26bc181 --- /dev/null +++ b/skills/software-development/hermes-s6-container-supervision/SKILL.md @@ -0,0 +1,176 @@ +--- +name: hermes-s6-container-supervision +description: Modify, debug, or extend the s6-overlay supervision tree inside the Hermes Agent Docker image — adding new services, debugging profile gateways, understanding the Architecture B main-program pattern. +version: 1.0.0 +author: Hermes Agent +license: MIT +metadata: + hermes: + tags: [docker, s6, supervision, gateway, profiles] + related_skills: [hermes-agent, hermes-agent-dev] +--- + +# Hermes s6-overlay Container Supervision + +## When to use this skill + +Load this skill when you're working on: +- Adding or removing a static service in the Hermes Docker image (something that should be supervised at every container start, like the dashboard) +- Diagnosing why a per-profile gateway isn't starting, restarting, or surviving `docker restart` +- Understanding why the container's CMD is `/opt/hermes/docker/main-wrapper.sh` and how leading-dash args reach the user's program +- Modifying `cont-init.d` boot scripts (UID remap, volume seeding, profile reconciliation) +- Changing the rendered run-script for per-profile gateways (Phase 4) + +If you're just running the Hermes Agent and want to use Docker, see `website/docs/user-guide/docker.md` instead. + +## Architecture at a glance + +``` +/init ← PID 1 (s6-overlay v3.2.3.0) +├── cont-init.d ← oneshot setup, runs as root +│ ├── 01-hermes-setup ← docker/stage2-hook.sh +│ │ ├── UID/GID remap +│ │ ├── chown /opt/data +│ │ ├── chown /opt/data/profiles (every boot) +│ │ ├── seed .env / config.yaml / SOUL.md +│ │ └── skills_sync.py +│ └── 02-reconcile-profiles ← hermes_cli.container_boot +│ ├── chown /run/service (hermes-writable for runtime register) +│ └── walk $HERMES_HOME/profiles//gateway_state.json +│ → recreate /run/service/gateway-/ +│ → auto-start only those with prior_state == "running" +│ +├── s6-rc.d (static services, in /etc/s6-overlay/s6-rc.d/) +│ ├── main-hermes/run ← exec sleep infinity (no-op slot) +│ └── dashboard/run ← if HERMES_DASHBOARD=1, runs `hermes dashboard` +│ +├── /run/service (s6-svscan watches; tmpfs) +│ ├── gateway-coder/ ← runtime-registered per-profile +│ │ ├── type ("longrun") +│ │ ├── run ("#!/command/with-contenv sh ... exec s6-setuidgid hermes hermes -p coder gateway run") +│ │ ├── down (marker — present means "registered but don't auto-start") +│ │ └── log/run (s6-log → $HERMES_HOME/logs/gateways/coder/current) +│ └── ... +│ +└── CMD ("main program") ← /opt/hermes/docker/main-wrapper.sh + └── routes user args: bare exec | hermes subcommand | hermes (no args) + — exec'd by /init with stdin/stdout/stderr inherited (TTY for --tui) +``` + +## Key files + +| Path | Role | +|---|---| +| `Dockerfile` | s6-overlay install + cont-init.d wiring + `ENTRYPOINT ["/init", "/opt/hermes/docker/main-wrapper.sh"]` | +| `docker/stage2-hook.sh` | The "old entrypoint logic" — UID remap, chown, seed, skills sync. Runs as cont-init.d/01-hermes-setup. | +| `docker/cont-init.d/02-reconcile-profiles` | Calls `hermes_cli.container_boot` on every boot to restore profile gateway slots from the persistent volume. | +| `docker/main-wrapper.sh` | The container's CMD. Routes user args, drops to hermes via `s6-setuidgid`, exec's the chosen program. | +| `docker/s6-rc.d/main-hermes/run` | No-op `sleep infinity` — slot exists so the s6-rc user bundle is valid; main hermes runs as the CMD, not as a supervised service. | +| `docker/s6-rc.d/dashboard/run` | Conditional service — `exec sleep infinity` unless `HERMES_DASHBOARD` is truthy. | +| `docker/entrypoint.sh` | Back-compat shim that `exec`s the stage2 hook. External scripts that hard-coded the old entrypoint path still work. | +| `hermes_cli/service_manager.py` | `S6ServiceManager`: `register_profile_gateway`, `unregister_profile_gateway`, `start/stop/restart/is_running`, `list_profile_gateways`. | +| `hermes_cli/container_boot.py` | `reconcile_profile_gateways()` — walks persistent profiles, regenerates s6 slots, emits `container-boot.log`. | +| `hermes_cli/gateway.py::_dispatch_via_service_manager_if_s6` | Intercepts `hermes gateway start/stop/restart` and routes to s6 when running in a container. | + +## Why Architecture B (CMD as main program, not s6-supervised) + +The original plan (v1–v3) called for main hermes to run as a supervised s6-rc service. Two real s6-overlay v3 mechanics blocked that: + +1. **cont-init.d scripts receive no CMD args** — so the stage2 hook can't parse `docker run chat -q "hi"` to set `HERMES_ARGS` for a service `run` script to consume. +2. **`/run/s6/basedir/bin/halt` does NOT propagate the exit code** written to `/run/s6-linux-init-container-results/exitcode`. Containers always exit 143 (SIGTERM) regardless. Confirmed by skarnet (s6 author) in [issue #477](https://github.com/just-containers/s6-overlay/issues/477): _"if you want a container shutdown, you need to either have your CMD exit, or, if you have no CMD, write the container exit code you want then call halt"_. + +So we use the s6-overlay-native CMD pattern: `ENTRYPOINT ["/init", "/opt/hermes/docker/main-wrapper.sh"]`. /init prepends the wrapper to user args automatically — so `docker run --version` becomes `/init main-wrapper.sh --version`, and `--version` doesn't get intercepted by /init's POSIX shell. The wrapper drops to hermes via `s6-setuidgid`, then exec's the chosen program. The program's exit code becomes the container exit code, exactly matching the pre-s6 tini contract. + +Trade-off: main hermes is unsupervised under s6. That exactly matches its behavior under tini (the pre-s6 image). Dashboard supervision is the only **new** guarantee — and per-profile gateways under `/run/service/` get full supervision. + +## Quick recipes + +### Verify s6 is PID 1 in a running container + +```sh +docker exec sh -c 'cat /proc/1/comm; readlink /proc/1/exe' +# Expect: s6-svscan or init / /package/admin/s6/.../s6-svscan +``` + +### Inspect a profile gateway service + +```sh +# /command/ isn't on docker-exec PATH — use absolute path +docker exec /command/s6-svstat /run/service/gateway- +# "up (pid …) … seconds" → running +# "down (exitcode N) … seconds, normally up, want up, …" → s6 wants it up but the process keeps exiting (crash loop) +# "down … normally up, ready …" → user stopped it +``` + +### Bring a service up/down manually + +```sh +docker exec /command/s6-svc -u /run/service/gateway- # up +docker exec /command/s6-svc -d /run/service/gateway- # down +docker exec /command/s6-svc -t /run/service/gateway- # SIGTERM (restart) +``` + +### Watch the cont-init reconciler log + +```sh +docker exec tail -n 50 /opt/data/logs/container-boot.log +# 2026-05-21T06:18:05+0000 profile=coder prior_state=running action=started +# 2026-05-21T06:18:05+0000 profile=writer prior_state=stopped action=registered +``` + +### Add a new static service + +1. Create `docker/s6-rc.d//type` with `longrun\n` and `docker/s6-rc.d//run` (use `#!/command/with-contenv sh` + `# shellcheck shell=sh`). +2. Drop to hermes via `s6-setuidgid hermes` at the top of run (unless you specifically need root). +3. Create empty `docker/s6-rc.d//dependencies.d/base` so it waits for the base bundle. +4. Create empty `docker/s6-rc.d/user/contents.d/` so it joins the user bundle. +5. The `COPY docker/s6-rc.d/` in the Dockerfile picks it up automatically — no other changes. + +### Change the per-profile gateway run command + +Edit `S6ServiceManager._render_run_script` in `hermes_cli/service_manager.py`. The function is also called by `hermes_cli/container_boot.py::_register_service` during boot reconciliation, so it's the single source of truth. Update the corresponding assertion in `tests/hermes_cli/test_service_manager.py::test_s6_register_creates_service_dir_and_triggers_scan`. + +### Run the docker test harness + +```sh +docker build -t hermes-agent-harness:latest . +HERMES_TEST_IMAGE=hermes-agent-harness:latest scripts/run_tests.sh tests/docker/ -v +# Expect 19 passed, 0 xfailed against the s6 image +``` + +The harness lives in `tests/docker/` and skips when Docker isn't available. The per-test timeout is bumped to 180s (see `tests/docker/conftest.py`). + +## Common pitfalls + +### "command not found" via `docker exec` + +`/command/` (where s6-overlay puts its binaries) is on PATH only for processes spawned by the supervision tree — services, cont-init.d, main-wrapper.sh. `docker exec s6-svstat …` will fail with "command not found"; always use the absolute path `/command/s6-svstat`. The `hermes` binary works because the Dockerfile adds `/opt/hermes/.venv/bin` to the runtime `ENV PATH`. + +### Profile directory ownership + +The cont-init reconciler runs as hermes (`s6-setuidgid hermes` in `02-reconcile-profiles`). If a profile dir ends up root-owned (e.g. because `docker exec hermes profile create …` ran as root by default), the reconciler can't read SOUL.md and fails with `PermissionError`. Mitigation: `stage2-hook.sh` chowns `$HERMES_HOME/profiles` to hermes on **every** boot, idempotently. Don't remove that block. + +### Files written by `docker exec` are root-owned + +`docker exec` defaults to root. Either pass `--user hermes` or rely on the stage2 chown sweep next reboot. Don't write files under `$HERMES_HOME/profiles//` as root manually — the next reconcile pass will sweep them but in-flight operations may hit perm errors. + +### Service slot exists but s6-svstat says "s6-supervise not running" + +The service directory is on tmpfs and was wiped on container restart. Either the cont-init reconciler hasn't run yet (give it a moment after `docker restart`) or it failed. Check `docker logs | grep '02-reconcile'`. + +### Gateway starts then immediately exits (`down (exitcode 1)` in svstat) + +Most likely the profile has no model or auth configured. The service slot is correct — the gateway itself is unconfigured. Run `hermes -p setup` first. The s6 supervisor will keep restarting it; that's the desired behavior (when you fix the config, the next attempt succeeds and stays up). + +### Reconciler skipped a profile + +The reconciler keys on the **presence of `SOUL.md`** as the "real profile" marker. `hermes profile create` always seeds it. If a profile dir is missing SOUL.md (stray directory, partial restore, backup-in-progress), the reconciler skips it intentionally. Add a `SOUL.md` (even empty) to opt back in. + +### "Help, the container exits 143!" + +Check whether something is invoking `s6-svscanctl -t` or `/run/s6/basedir/bin/halt` — both cause /init to begin stage 3 shutdown but return 143 (SIGTERM) rather than the desired exit code. This was the Phase 2 architecture pivot from A to B. For container shutdown with a real exit code, you must let the CMD (main-wrapper.sh) exit normally; do **not** try to control exit from a finish script. + +## Related skills + +- `hermes-agent-dev`: General hermes-agent codebase navigation +- `hermes-tool-quirks`: Specific Hermes-tool workarounds (sed/grep/etc.) — load when debugging the s6 stack's interaction with hermes built-in tools. diff --git a/tests/docker/__init__.py b/tests/docker/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/docker/conftest.py b/tests/docker/conftest.py new file mode 100644 index 00000000000..4281a292fae --- /dev/null +++ b/tests/docker/conftest.py @@ -0,0 +1,139 @@ +"""Shared fixtures for docker-image integration tests. + +Tests in this directory build the image with the current ``Dockerfile`` +and exercise it via ``docker run``. They skip when Docker is unavailable +(e.g. on developer laptops without a daemon). + +Override the image with ``HERMES_TEST_IMAGE`` env var to point at a pre-built +image (faster local iteration); otherwise the ``built_image`` fixture builds +the repo's Dockerfile once per session. + +Docker tests need longer timeouts than the suite default (30s), so every +test under this directory is granted a 180s default via +``pytest.mark.timeout`` applied at collection time. +""" +from __future__ import annotations + +import os +import shutil +import subprocess +from collections.abc import Iterator + +import pytest + +IMAGE_TAG = os.environ.get("HERMES_TEST_IMAGE", "hermes-agent-harness:latest") + + +def _docker_available() -> bool: + """Return True iff a docker CLI is on PATH and the daemon answers.""" + if shutil.which("docker") is None: + return False + try: + r = subprocess.run( + ["docker", "info"], capture_output=True, timeout=5, + ) + return r.returncode == 0 + except (subprocess.TimeoutExpired, OSError): + return False + + +def pytest_collection_modifyitems(config, items): # noqa: D401 - pytest hook + """Apply docker-suite policy: timeout bump + skip on missing docker.""" + docker_ok = _docker_available() + skip_docker = pytest.mark.skip( + reason="Docker not available or daemon not running", + ) + extend_timeout = pytest.mark.timeout(180) + for item in items: + if "tests/docker/" not in str(item.fspath).replace(os.sep, "/"): + continue + item.add_marker(extend_timeout) + if not docker_ok: + item.add_marker(skip_docker) + + +@pytest.fixture(scope="session") +def built_image() -> str: + """Build the image once per test session. + + Override with ``HERMES_TEST_IMAGE`` env var to point at a pre-built + image (faster local iteration). + """ + if os.environ.get("HERMES_TEST_IMAGE"): + return IMAGE_TAG + repo_root = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", ".."), + ) + result = subprocess.run( + ["docker", "build", "-t", IMAGE_TAG, repo_root], + capture_output=True, text=True, timeout=1200, + ) + assert result.returncode == 0, ( + f"docker build failed:\n{result.stderr[-2000:]}" + ) + return IMAGE_TAG + + +@pytest.fixture +def container_name(request) -> Iterator[str]: + """Generate a unique container name and ensure cleanup on test exit.""" + safe = request.node.name.replace("[", "_").replace("]", "_") + name = f"hermes-test-{safe}" + yield name + subprocess.run( + ["docker", "rm", "-f", name], + capture_output=True, timeout=10, + ) + + +# --------------------------------------------------------------------------- +# docker_exec — default to the unprivileged hermes user +# --------------------------------------------------------------------------- +# +# Background: every Hermes runtime path inside the container drops to UID +# 10000 (the ``hermes`` user) via ``s6-setuidgid hermes``. ``docker exec`` +# without ``-u`` runs as root, which is **not** representative of how +# production code executes. PR #30136 review caught a real regression +# this way — ``Path('/proc/1/exe').resolve()`` works as root and silently +# fails (PermissionError swallowed) for hermes, so a test that ran as root +# couldn't catch a feature that was inert for the actual runtime user. +# +# Tests in this directory MUST exercise the realistic user context. The +# helpers below run every probe under ``-u hermes`` unless a specific +# test explicitly opts into ``user="root"`` (rare — e.g. inspecting +# /proc/1/exe itself, chowning a volume). +# --------------------------------------------------------------------------- + + +def docker_exec( + container: str, + *args: str, + user: str = "hermes", + timeout: int = 30, + extra_docker_args: tuple[str, ...] = (), +) -> subprocess.CompletedProcess[str]: + """Run a command inside ``container`` as ``user`` (default: hermes). + + Returns the CompletedProcess with text=True, capture_output=True. + + Pass ``user="root"`` only when the test specifically needs root + capabilities (e.g. reading /proc/1/exe, manipulating ownership). + Most tests should use the default. + """ + cmd = ["docker", "exec", "-u", user, *extra_docker_args, container, *args] + return subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, + ) + + +def docker_exec_sh( + container: str, + command: str, + *, + user: str = "hermes", + timeout: int = 30, +) -> subprocess.CompletedProcess[str]: + """Run ``sh -c `` inside the container as ``user``.""" + return docker_exec( + container, "sh", "-c", command, user=user, timeout=timeout, + ) diff --git a/tests/docker/test_container_restart.py b/tests/docker/test_container_restart.py new file mode 100644 index 00000000000..c8615898375 --- /dev/null +++ b/tests/docker/test_container_restart.py @@ -0,0 +1,252 @@ +"""Container-restart survives per-profile gateway registrations. + +The s6 dynamic scandir at /run/service/ lives on tmpfs and is wiped +on every container restart. Phase 4 Task 4.0's container_boot module ++ cont-init.d/02-reconcile-profiles regenerate the service slots from +$HERMES_HOME/profiles//gateway_state.json on every boot and +auto-start only those whose last state was `running`. + +These tests stand up a container with a named volume, create profiles +inside it in various gateway states, restart the container, and +assert the reconciler did the right thing. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec` / :func:`docker_exec_sh` in conftest); see +the conftest module docstring. +""" +from __future__ import annotations + +import subprocess +import time + +import pytest + +from tests.docker.conftest import docker_exec, docker_exec_sh + + +def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]: + return subprocess.run( + ["docker", *args], + capture_output=True, text=True, timeout=kw.pop("timeout", 60), + **kw, + ) + + +def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + return docker_exec(container, *args, timeout=timeout) + + +def _sh(container: str, cmd: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + return docker_exec_sh(container, cmd, timeout=timeout) + + +def _wait_for_path( + container: str, + path: str, + *, + kind: str = "f", + deadline_s: float = 30.0, + interval_s: float = 0.25, +) -> bool: + """Poll `test - ` inside container until success or timeout. + + `kind` is the `test` flag: 'f' for file, 'd' for directory, 'e' for + existence. Returns True on success, False on timeout. Strictly + better than a fixed `time.sleep()` because: + + * we don't wait the full budget when the path appears early, and + * the test fails with a precise "waited N seconds" assertion + instead of a confusing one-line failure mid-test when the + sleep was too short. + """ + end = time.monotonic() + deadline_s + while time.monotonic() < end: + r = _sh(container, f"test -{kind} {path}", timeout=5) + if r.returncode == 0: + return True + time.sleep(interval_s) + return False + + +def _wait_for_reconcile_log_mention( + container: str, + profile: str, + *, + deadline_s: float = 30.0, + interval_s: float = 0.25, +) -> str: + """Poll until /opt/data/logs/container-boot.log mentions `profile`. + + Returns the matching log content on success. On timeout, returns + the last observed contents so the assertion can render a + meaningful diagnostic. The container-boot.log is the explicit + signal that the reconciler has finished — much more reliable + than a fixed sleep that hopes 8 seconds is enough. + """ + end = time.monotonic() + deadline_s + last = "" + while time.monotonic() < end: + r = _sh(container, "cat /opt/data/logs/container-boot.log", timeout=5) + if r.returncode == 0: + last = r.stdout + if f"profile={profile}" in last: + return last + time.sleep(interval_s) + return last + + +@pytest.fixture +def restart_container(request, built_image: str): + """A long-running container with a named volume so docker restart + preserves $HERMES_HOME/profiles/.""" + safe = request.node.name.replace("[", "_").replace("]", "_") + name = f"hermes-restart-{safe}" + volume = f"hermes-restart-vol-{safe}" + _docker("rm", "-f", name) + _docker("volume", "rm", "-f", volume) + _docker("volume", "create", volume, timeout=10).check_returncode() + r = _docker( + "run", "-d", "--name", name, + "-v", f"{volume}:/opt/data", + built_image, "sleep", "infinity", + timeout=30, + ) + r.check_returncode() + # Wait for s6 + stage2 + 02-reconcile to publish the boot log so + # the test can rely on the default slot being registered before + # it starts issuing commands. The reconciler always writes one + # 'default' line on every boot (PR #30136 item I1) — that's our + # readiness signal. + deadline = time.monotonic() + 30.0 + while time.monotonic() < deadline: + r = _docker( + "exec", "-u", "hermes", name, "sh", "-c", + "cat /opt/data/logs/container-boot.log 2>/dev/null", + timeout=5, + ) + if r.returncode == 0 and "profile=default" in r.stdout: + break + time.sleep(0.25) + else: + # Defensive: surface a timeout from the fixture itself so the + # test failure points at "container never finished cont-init" + # rather than mid-test where the symptom would be obscure. + raise RuntimeError( + f"container {name} did not finish cont-init within 30s" + ) + yield name + _docker("rm", "-f", name) + _docker("volume", "rm", "-f", volume) + + +def test_running_gateway_survives_container_restart(restart_container: str) -> None: + container = restart_container + + # Create the profile + start its gateway. The Phase 4 hooks + # register the s6 service slot during create and the dispatch + # path brings it up via s6-svc -u. + r = _exec(container, "hermes", "profile", "create", "coder") + assert r.returncode == 0, f"profile create failed: {r.stderr}" + + r = _exec(container, "hermes", "-p", "coder", "gateway", "start", timeout=60) + assert r.returncode == 0, f"gateway start failed: {r.stderr}" + + # Give the service time to actually come up under supervision. + deadline = time.monotonic() + 15.0 + while time.monotonic() < deadline: + r = _sh(container, "/command/s6-svstat /run/service/gateway-coder") + if r.returncode == 0 and "up " in r.stdout: + break + time.sleep(0.5) + assert "up " in r.stdout, f"gateway never came up pre-restart: {r.stdout!r}" + + # Persist state so the reconciler will treat the slot as 'running' + # post-restart. The gateway process itself writes gateway_state.json + # via gateway/status.py — but we don't want to wait for or assert + # against the live process here; just stamp the file directly to + # exercise the reconciler's contract. + write_state = ( + "import json, pathlib; " + "p = pathlib.Path('/opt/data/profiles/coder/gateway_state.json'); " + "p.write_text(json.dumps({'gateway_state': 'running', 'timestamp': 1}))" + ) + _exec(container, "python3", "-c", write_state, timeout=10).check_returncode() + + # Restart. After this, /run/service/ is empty until cont-init.d + # runs the reconciler. We need to wait long enough for the + # reconciler to write coder's entry to the boot log AND for + # s6-svscan to spin up the service supervise tree from the + # restored slot. Polling the boot log gives us the first signal. + _docker("restart", container, timeout=60).check_returncode() + log = _wait_for_reconcile_log_mention(container, "coder", deadline_s=30.0) + assert "profile=coder" in log, ( + f"reconciler never logged coder after restart: {log!r}" + ) + assert "action=started" in log + + # Service slot exists. + assert _wait_for_path( + container, "/run/service/gateway-coder", kind="d", deadline_s=10.0, + ), "slot not recreated after restart" + + # No `down` marker — we asked for auto-start. + r = _sh(container, "test -f /run/service/gateway-coder/down") + assert r.returncode != 0, "down marker present despite prior_state=running" + + +def test_stopped_gateway_stays_stopped_after_restart(restart_container: str) -> None: + container = restart_container + + _exec(container, "hermes", "profile", "create", "writer").check_returncode() + + # Write 'stopped' directly so we don't have to race against the + # gateway's own state writes. + write_state = ( + "import json, pathlib; " + "p = pathlib.Path('/opt/data/profiles/writer/gateway_state.json'); " + "p.write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1}))" + ) + _exec(container, "python3", "-c", write_state, timeout=10).check_returncode() + + _docker("restart", container, timeout=60).check_returncode() + log = _wait_for_reconcile_log_mention(container, "writer", deadline_s=30.0) + assert "profile=writer" in log + + # Slot exists. + assert _wait_for_path( + container, "/run/service/gateway-writer", kind="d", deadline_s=10.0, + ) + + # Down marker present. + r = _sh(container, "test -f /run/service/gateway-writer/down") + assert r.returncode == 0, "down marker missing despite prior_state=stopped" + + +def test_stale_gateway_pid_cleaned_up_on_restart(restart_container: str) -> None: + """A dead container's gateway.pid + processes.json must NOT + survive the restart — a numerically-equal live PID in the new + container is a different process and would confuse the gateway + process-mismatch checks.""" + container = restart_container + + _exec(container, "hermes", "profile", "create", "ghost").check_returncode() + + # Stamp stale runtime files alongside a 'running' state so the + # reconciler walks this profile. + stamp = ( + "import json, pathlib; " + "p = pathlib.Path('/opt/data/profiles/ghost'); " + "(p / 'gateway_state.json').write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1})); " + "(p / 'gateway.pid').write_text(json.dumps({'pid': 99999, 'host': 'old'})); " + "(p / 'processes.json').write_text('[]')" + ) + _exec(container, "python3", "-c", stamp, timeout=10).check_returncode() + + _docker("restart", container, timeout=60).check_returncode() + _wait_for_reconcile_log_mention(container, "ghost", deadline_s=30.0) + + # Stale runtime files swept. + r = _sh(container, "test -f /opt/data/profiles/ghost/gateway.pid") + assert r.returncode != 0, "stale gateway.pid survived restart" + r = _sh(container, "test -f /opt/data/profiles/ghost/processes.json") + assert r.returncode != 0, "stale processes.json survived restart" diff --git a/tests/docker/test_dashboard.py b/tests/docker/test_dashboard.py new file mode 100644 index 00000000000..56d4fa41c8a --- /dev/null +++ b/tests/docker/test_dashboard.py @@ -0,0 +1,203 @@ +"""Harness: dashboard opt-in via HERMES_DASHBOARD. + +Today (tini): dashboard starts once when HERMES_DASHBOARD=1; if it crashes +it stays dead. After Phase 2 (s6): dashboard starts once; if it crashes +it is restarted under supervision. The restart-after-crash test lives in +Phase 2 Task 2.5; this file only locks the opt-in surface (which must +not change between tini and s6). + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec`/:func:`docker_exec_sh` in conftest), matching +the realistic runtime context. See the conftest module docstring. +""" +from __future__ import annotations + +import subprocess +import time + +from tests.docker.conftest import docker_exec, docker_exec_sh + + +def _poll(container: str, probe: str, *, deadline_s: float = 30.0, + interval_s: float = 0.5) -> tuple[bool, str]: + """Repeatedly run ``probe`` inside the container until it exits 0 or + ``deadline_s`` elapses. Returns (success, last stdout).""" + end = time.monotonic() + deadline_s + last = "" + while time.monotonic() < end: + r = docker_exec_sh(container, probe, timeout=10) + last = r.stdout + if r.returncode == 0: + return True, last + time.sleep(interval_s) + return False, last + + +def test_dashboard_not_running_by_default( + built_image: str, container_name: str, +) -> None: + """Without HERMES_DASHBOARD, no dashboard process should be running.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "60"], + check=True, capture_output=True, timeout=30, + ) + # Give the entrypoint enough time to finish bootstrap; if a dashboard + # were going to start it'd be visible by now. + time.sleep(5) + r = docker_exec(container_name, "pgrep", "-f", "hermes dashboard") + # pgrep exits non-zero when no match found + assert r.returncode != 0, ( + "Dashboard should not be running without HERMES_DASHBOARD" + ) + + +def test_dashboard_slot_reports_down_when_disabled( + built_image: str, container_name: str, +) -> None: + """Without HERMES_DASHBOARD, s6-svstat should report the dashboard + slot as DOWN (not up-with-sleep-infinity, which would + false-positive `hermes doctor` and any other health check). + + Locks the PR #30136 review item I3 fix: cont-init.d/03-dashboard-toggle + writes a `down` marker file in the live service-dir when + HERMES_DASHBOARD is unset, so the slot reflects reality. + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "60"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(5) + # /command/ isn't on PATH for docker-exec sessions, so call by + # absolute path. + r = docker_exec( + container_name, "/command/s6-svstat", "/run/service/dashboard", + ) + assert r.returncode == 0, f"s6-svstat failed: {r.stderr!r} / {r.stdout!r}" + assert "down" in r.stdout, ( + f"Dashboard slot should be 'down' without HERMES_DASHBOARD; " + f"svstat reports: {r.stdout!r}" + ) + + +def test_dashboard_slot_reports_up_when_enabled( + built_image: str, container_name: str, +) -> None: + """Symmetry: with HERMES_DASHBOARD=1, s6-svstat reports the slot as up.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_DASHBOARD=1", built_image, "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + # uvicorn takes a moment to bind; poll svstat. + deadline = time.monotonic() + 30.0 + last = "" + while time.monotonic() < deadline: + r = docker_exec( + container_name, "/command/s6-svstat", "/run/service/dashboard", + ) + last = r.stdout + if r.returncode == 0 and "up " in r.stdout: + return # success + time.sleep(0.5) + raise AssertionError( + f"Dashboard slot never reached up state; last svstat: {last!r}" + ) + + +def test_dashboard_opt_in_starts( + built_image: str, container_name: str, +) -> None: + """With HERMES_DASHBOARD=1, a dashboard process should be visible.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_DASHBOARD=1", built_image, "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + # Poll for the dashboard subprocess to appear — the entrypoint + # backgrounds it and bootstrap (skills sync etc.) can take a few + # seconds before the python process actually launches. + ok, _ = _poll( + container_name, "pgrep -f 'hermes dashboard'", deadline_s=30.0, + ) + assert ok, "Dashboard should be running with HERMES_DASHBOARD=1" + + +def test_dashboard_port_override( + built_image: str, container_name: str, +) -> None: + """HERMES_DASHBOARD_PORT changes the dashboard's listen port.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_DASHBOARD=1", "-e", "HERMES_DASHBOARD_PORT=9120", + built_image, "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + # The dashboard process appearing in pgrep doesn't mean it's bound + # to the port yet — uvicorn takes another second or two to come up. + # The image doesn't ship ss/netstat, so probe /proc/net/tcp directly: + # port 9120 = 0x23A0, state 0A = LISTEN. + ok, stdout = _poll( + container_name, + "grep -E ' 0+:23A0 .* 0A ' /proc/net/tcp /proc/net/tcp6 " + "2>/dev/null", + deadline_s=60.0, + ) + assert ok, f"Dashboard not listening on port 9120: stdout={stdout!r}" + + +def test_dashboard_restarts_after_crash( + built_image: str, container_name: str, +) -> None: + """Phase 2 invariant: under s6 supervision, killing the dashboard + process should be recovered automatically. + + Pre-s6 (tini) behavior was "stays dead" — the test wouldn't have + passed against that image. After the s6-overlay migration the + dashboard runs as a longrun s6-rc service and s6-supervise restarts + it after a ~1s backoff (the default). + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_DASHBOARD=1", built_image, "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + # Wait for the first dashboard to come up. + ok, _ = _poll( + container_name, "pgrep -f 'hermes dashboard'", deadline_s=30.0, + ) + assert ok, "Dashboard never started initially" + + # Grab the initial PID. s6 may briefly transition through restart + # state between our poll-success and the follow-up pgrep, so retry + # a couple of times before giving up. + first_pid: str | None = None + for _attempt in range(10): + first_pid_result = docker_exec( + container_name, "pgrep", "-f", "hermes dashboard", + ) + first_pids = first_pid_result.stdout.strip().split() + if first_pids: + first_pid = first_pids[0] + break + time.sleep(0.5) + assert first_pid is not None, "Could not capture initial dashboard PID" + + # Kill the dashboard. The dashboard process runs as hermes, so the + # hermes user can kill it (same UID). + docker_exec(container_name, "kill", "-9", first_pid) + + # s6 backs off ~1s before restart; allow up to 15s for the new + # process to appear with a different PID. + deadline = time.monotonic() + 15.0 + while time.monotonic() < deadline: + r = docker_exec(container_name, "pgrep", "-f", "hermes dashboard") + pids = r.stdout.strip().split() if r.returncode == 0 else [] + if pids and pids[0] != first_pid: + return # success + time.sleep(0.5) + + raise AssertionError( + f"Dashboard not restarted after kill (first_pid={first_pid})" + ) diff --git a/tests/docker/test_main_invocation.py b/tests/docker/test_main_invocation.py new file mode 100644 index 00000000000..884b939153d --- /dev/null +++ b/tests/docker/test_main_invocation.py @@ -0,0 +1,79 @@ +"""Harness: docker run [cmd...] invocation patterns. + +These tests MUST pass on the current tini-based image AND continue to +pass after the Phase 2 s6 migration. Any behavior drift is a regression. + +The harness expects ``built_image`` and ``container_name`` fixtures from +``tests/docker/conftest.py``. When Docker isn't available every test +here is skipped at collection time. +""" +from __future__ import annotations + +import subprocess + + +def test_no_args_starts_hermes(built_image: str) -> None: + """``docker run `` should start hermes cleanly. + + We invoke ``--version`` so the call exits without needing a configured + model. Exit code may be 0 (printed version) or 1 (config bootstrapping + failure on a fresh volume), but never a stack trace. + """ + r = subprocess.run( + ["docker", "run", "--rm", built_image, "--version"], + capture_output=True, text=True, timeout=60, + ) + assert r.returncode in (0, 1), ( + f"Unexpected exit {r.returncode}: stderr={r.stderr!r}" + ) + assert "Traceback" not in r.stderr + + +def test_chat_subcommand_passthrough(built_image: str) -> None: + """``docker run chat --help`` should exec ``hermes chat --help``. + + Uses ``--help`` so the call doesn't need an upstream model configured. + """ + r = subprocess.run( + ["docker", "run", "--rm", built_image, "chat", "--help"], + capture_output=True, text=True, timeout=60, + ) + assert r.returncode == 0 + combined = (r.stdout + r.stderr).lower() + assert "chat" in combined or "usage" in combined + + +def test_bare_executable_passthrough(built_image: str) -> None: + """``docker run sleep 1`` should exec ``sleep`` directly. + + The entrypoint detects that ``sleep`` is on PATH and routes around the + hermes wrapper. Useful for long-lived sandbox mode and for testing. + """ + r = subprocess.run( + ["docker", "run", "--rm", built_image, "sleep", "1"], + capture_output=True, text=True, timeout=30, + ) + assert r.returncode == 0 + + +def test_bash_pattern(built_image: str) -> None: + """``docker run bash -c 'echo ok'`` should exec bash directly.""" + r = subprocess.run( + ["docker", "run", "--rm", built_image, "bash", "-c", "echo ok"], + capture_output=True, text=True, timeout=30, + ) + assert r.returncode == 0 + assert "ok" in r.stdout + + +def test_container_exit_code_matches_inner_exit(built_image: str) -> None: + """The container exit code must match the inner process's exit code. + + Critical for CI: ``docker run hermes batch ...`` returns a + non-zero status when batch fails. Phase 2 (s6) must preserve this. + """ + r = subprocess.run( + ["docker", "run", "--rm", built_image, "sh", "-c", "exit 42"], + capture_output=True, text=True, timeout=30, + ) + assert r.returncode == 42 diff --git a/tests/docker/test_profile_gateway.py b/tests/docker/test_profile_gateway.py new file mode 100644 index 00000000000..5bfc1c46c87 --- /dev/null +++ b/tests/docker/test_profile_gateway.py @@ -0,0 +1,138 @@ +"""Harness: per-profile gateway start/stop inside the container. + +Phase 4 wires `hermes -p gateway start/stop` through the s6 +ServiceManager dispatch path inside the container — so the lifecycle +commands now bring up an s6-supervised gateway rather than refusing +with the pre-Phase-4 informational message. + +These tests were marked ``xfail(strict=True)`` through Phase 0–3 and +flip to plain ``test_…`` once Phase 4 lands (now). + +NB: The harness profile has no model/auth configured. Depending on +how the gateway run script handles missing config, the supervised +process may either spin up successfully (and svstat reports ``up``) +or exit fast and get throttled by s6 (and svstat reports ``down …, +want up``). Both states are valid "user asked for gateway up" results +— what we assert is the *want* intent the lifecycle command set, NOT +the supervised process's health. ``s6-svc -u`` records ``want up`` in +the supervise/status file regardless of the run-script outcome. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec_sh` in conftest); see the conftest module +docstring. +""" +from __future__ import annotations + +import subprocess +import time + +from tests.docker.conftest import docker_exec_sh + +PROFILE = "test-harness-profile" + + +def _sh( + container: str, command: str, timeout: int = 30, +) -> subprocess.CompletedProcess[str]: + return docker_exec_sh(container, command, timeout=timeout) + + +def _svstat(container: str) -> str: + """Returns the raw s6-svstat output for the test profile's slot. + /command/s6-svstat is called by absolute path because /command/ + isn't on PATH for docker-exec sessions.""" + r = _sh(container, f"/command/s6-svstat /run/service/gateway-{PROFILE}") + return r.stdout if r.returncode == 0 else "" + + +def _svstat_wants_up(container: str) -> bool: + """Read the slot's want-state from s6-svstat output. + + s6-svstat formats the output to elide redundancies — when the + service is currently up AND s6 wants it up, the literal token + ``want up`` doesn't appear (it's implicit from the leading ``up``). + When the service is down but s6 wants it back up, ``, want up`` + appears explicitly. So a comprehensive "is the want-intent set to + up" check has to accept both spellings. + """ + state = _svstat(container) + if not state: + return False + head = state.split()[0] if state.split() else "" + if head == "up": + # Currently up implies wanted-up unless ``want down`` is set. + return "want down" not in state + # Currently down — ``want up`` only shows up when explicitly set. + return "want up" in state + + +def test_profile_create_then_gateway_start( + built_image: str, container_name: str, +) -> None: + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(3) + + r = _sh(container_name, f"hermes profile create {PROFILE}") + assert r.returncode == 0, f"profile create failed: {r.stderr}" + + # Profile create's s6-register hook should have produced a service slot. + r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}") + assert r.returncode == 0, "s6 service slot not created on profile create" + + r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60) + assert r.returncode == 0, ( + f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}" + ) + + # After start, s6's intent is "up" — even if the supervised gateway + # process spin-fails (no model/auth in the test profile), the + # supervision-state contract holds. See ``_svstat_wants_up`` for + # why we accept both ``up …`` (currently up) and ``down …, want + # up`` (down but s6 wants up). + time.sleep(2) + assert _svstat_wants_up(container_name), ( + f"slot want-state is not up after gateway start: " + f"{_svstat(container_name)!r}" + ) + + r = _sh(container_name, f"hermes -p {PROFILE} gateway stop", timeout=30) + assert r.returncode == 0 + + time.sleep(2) + assert not _svstat_wants_up(container_name), ( + f"slot want-state still up after gateway stop: " + f"{_svstat(container_name)!r}" + ) + + +def test_profile_delete_stops_gateway( + built_image: str, container_name: str, +) -> None: + """Deleting a profile should stop its gateway and remove the s6 + service slot.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(3) + + _sh(container_name, f"hermes profile create {PROFILE}") + _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60) + time.sleep(3) + + r = _sh( + container_name, + f"hermes profile delete {PROFILE} --yes", + timeout=30, + ) + assert r.returncode == 0, f"profile delete failed: {r.stderr}" + + time.sleep(2) + # Service slot should be gone. + r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}") + assert r.returncode != 0, "s6 service slot still present after profile delete" diff --git a/tests/docker/test_s6_profile_gateway_integration.py b/tests/docker/test_s6_profile_gateway_integration.py new file mode 100644 index 00000000000..22b41ca5ace --- /dev/null +++ b/tests/docker/test_s6_profile_gateway_integration.py @@ -0,0 +1,129 @@ +"""Harness: in-container integration tests for S6ServiceManager. + +The unit tests in tests/hermes_cli/test_service_manager.py exercise the +class against a tmp-path scandir with a stubbed ``subprocess.run``. +These tests run the real class inside a real container against the +real s6-svc / s6-svscanctl binaries, validating end-to-end. + +Phase 3 only registers the service slot — it doesn't depend on the +gateway actually starting (the binary will refuse to start without a +valid profile config). The full register → start → supervised-restart +→ unregister cycle is covered by Phase 4 once profile create/delete +hooks land. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec` in conftest); see the conftest module +docstring. ``/run/service`` is chowned hermes-writable by the +``02-reconcile-profiles`` cont-init.d script, so register/unregister +operations work correctly under UID 10000. +""" +from __future__ import annotations + +import subprocess +import time + +from tests.docker.conftest import docker_exec + + +_REGISTER_SCRIPT = """ +import sys +sys.path.insert(0, "/opt/hermes") +from hermes_cli.service_manager import S6ServiceManager +S6ServiceManager().register_profile_gateway("phase3test") +# Don't worry about whether the gateway actually starts — we only care +# that the supervision slot was created. The gateway run script will +# likely error out (no profile config exists) but that's expected. +print("REGISTERED") +""" + +_UNREGISTER_SCRIPT = """ +import sys +sys.path.insert(0, "/opt/hermes") +from hermes_cli.service_manager import S6ServiceManager +S6ServiceManager().unregister_profile_gateway("phase3test") +print("UNREGISTERED") +""" + + +def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess: + return docker_exec(container, *args, timeout=timeout) + + +def test_s6_register_creates_service_dir_in_live_container( + built_image: str, container_name: str, +) -> None: + """S6ServiceManager.register_profile_gateway must create + ``/run/service/gateway-/`` and trigger s6-svscan rescan + against the real s6 supervision tree.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + # Give the supervision tree a moment to come up. + time.sleep(3) + + r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30) + assert "REGISTERED" in r.stdout, ( + f"register failed: stderr={r.stderr!r} stdout={r.stdout!r}" + ) + + # Service directory exists with the expected structure. + r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test") + assert r.returncode == 0, "service directory not created" + + r = _exec(container_name, "test", "-f", "/run/service/gateway-phase3test/run") + assert r.returncode == 0, "run script not created" + + r = _exec(container_name, "test", "-f", + "/run/service/gateway-phase3test/log/run") + assert r.returncode == 0, "log/run script not created" + + # s6-svscan picked it up — s6-svstat works against the dir. + # `docker exec` doesn't put /command/ on PATH (only the supervision + # tree does), so call s6-svstat by absolute path. + r = _exec(container_name, "/command/s6-svstat", + "/run/service/gateway-phase3test") + assert r.returncode == 0, f"s6-svstat failed: {r.stderr or r.stdout}" + + # list_profile_gateways picks it up. + r = _exec(container_name, "python3", "-c", ( + "from hermes_cli.service_manager import S6ServiceManager;" + "print(S6ServiceManager().list_profile_gateways())" + )) + assert "phase3test" in r.stdout, f"list output: {r.stdout!r}" + + +def test_s6_unregister_removes_service_dir_in_live_container( + built_image: str, container_name: str, +) -> None: + """unregister_profile_gateway must stop the service, remove the + directory, and trigger s6-svscan rescan so the supervise process + is dropped.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "120"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(3) + + # First register so we have something to unregister. + r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30) + assert "REGISTERED" in r.stdout + + # Then unregister. + r = _exec(container_name, "python3", "-c", _UNREGISTER_SCRIPT, timeout=30) + assert "UNREGISTERED" in r.stdout, ( + f"unregister failed: stderr={r.stderr!r} stdout={r.stdout!r}" + ) + + # Directory is gone. + r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test") + assert r.returncode != 0, "service directory still exists after unregister" + + # list_profile_gateways no longer includes it. + r = _exec(container_name, "python3", "-c", ( + "from hermes_cli.service_manager import S6ServiceManager;" + "print(S6ServiceManager().list_profile_gateways())" + )) + assert "phase3test" not in r.stdout diff --git a/tests/docker/test_tui_passthrough.py b/tests/docker/test_tui_passthrough.py new file mode 100644 index 00000000000..6de78216fd5 --- /dev/null +++ b/tests/docker/test_tui_passthrough.py @@ -0,0 +1,51 @@ +"""Harness: interactive TUI TTY passthrough. + +Uses ``script -qc`` on the host to allocate a PTY for the docker client, +which then allocates a container-side PTY via ``-t``. The probe inside +the container is ``tput cols``, which returns a real column count when +stdout is a TTY and either prints ``80`` (the terminfo fallback) or +nothing when it is not. + +These tests MUST pass on the current tini-based image AND continue to +pass after the Phase 2 s6 migration. Any drift is a regression. +""" +from __future__ import annotations + +import shlex +import shutil +import subprocess + +import pytest + +pytestmark = pytest.mark.skipif( + shutil.which("script") is None, + reason="`script` command not available on this host", +) + + +def test_tty_passthrough_to_container(built_image: str) -> None: + """``docker run -t`` must deliver a real TTY to the container process.""" + probe = "if [ -t 1 ]; then tput cols; else echo NO_TTY; fi" + cmd = ( + f"docker run --rm -t -e COLUMNS=123 {built_image} " + f"sh -c {shlex.quote(probe)}" + ) + r = subprocess.run( + ["script", "-qc", cmd, "/dev/null"], + capture_output=True, text=True, timeout=120, + ) + output = r.stdout.strip() + assert "NO_TTY" not in output, f"TTY passthrough failed: {output!r}" + numeric_lines = [s for s in output.split() if s.strip().isdigit()] + assert numeric_lines, f"No numeric width in output: {output!r}" + assert int(numeric_lines[0]) > 0 + + +def test_tui_flag_recognized(built_image: str) -> None: + """``docker run -it --help`` should run without crashing.""" + cmd = f"docker run --rm -t {built_image} --help" + r = subprocess.run( + ["script", "-qc", cmd, "/dev/null"], + capture_output=True, text=True, timeout=60, + ) + assert r.returncode == 0 diff --git a/tests/docker/test_zombie_reaping.py b/tests/docker/test_zombie_reaping.py new file mode 100644 index 00000000000..ff31be8c0d2 --- /dev/null +++ b/tests/docker/test_zombie_reaping.py @@ -0,0 +1,45 @@ +"""Harness: PID 1 must reap orphaned zombie processes. + +tini (current PID 1) reaps zombies via its built-in subreaper behavior. +s6-overlay's ``/init`` (Phase 2 PID 1) does the same. This invariant is +required for long-running containers spawning subprocesses (subagents, +dashboard, dynamic gateways) — otherwise the process table fills with +defunct entries and eventually exhausts the kernel PID space. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec_sh` in conftest); see the conftest module +docstring. +""" +from __future__ import annotations + +import subprocess +import time + +from tests.docker.conftest import docker_exec, docker_exec_sh + + +def test_orphan_zombies_reaped( + built_image: str, container_name: str, +) -> None: + """Spawn an orphan child that exits immediately. PID 1 must reap it.""" + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "sleep", "60"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(2) + + # `( ( sleep 0.1 & ) & ); sleep 1` creates a grandchild detached from + # the original docker exec session — it becomes an orphan reparented + # to PID 1 in the container. When it exits, PID 1 must reap it. + docker_exec_sh( + container_name, "( ( sleep 0.1 & ) & ); sleep 1", timeout=10, + ) + time.sleep(1) + + r = docker_exec(container_name, "ps", "axo", "stat,pid,comm") + zombies = [ + line for line in r.stdout.split("\n") + if line.strip().startswith("Z") + ] + assert not zombies, f"Zombies not reaped by PID 1: {zombies}" diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py new file mode 100644 index 00000000000..58ad016f22e --- /dev/null +++ b/tests/hermes_cli/test_container_boot.py @@ -0,0 +1,578 @@ +"""Tests for hermes_cli.container_boot — the cont-init.d-time +reconciliation that recreates per-profile gateway s6 service slots +from the persistent profiles directory. + +These tests run against a fake $HERMES_HOME under tmp_path; no real +s6 supervision tree is required. The in-container integration test +covering end-to-end "docker restart" survival lives in +tests/docker/test_container_restart.py. +""" +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from hermes_cli.container_boot import ( + ReconcileAction, + reconcile_profile_gateways, +) + + +# --------------------------------------------------------------------------- +# Fixtures + helpers +# --------------------------------------------------------------------------- + + +def _make_profile( + hermes_home: Path, + name: str, + *, + state: str | None, + with_pid: bool = False, + config: bool = True, +) -> Path: + """Create a fake profile directory under hermes_home/profiles//.""" + p = hermes_home / "profiles" / name + p.mkdir(parents=True) + if config: + # SOUL.md is what the reconciler keys on — it's always seeded by + # `hermes profile create`. See container_boot._render_run_script. + (p / "SOUL.md").write_text("# fake profile\n") + if state is not None: + (p / "gateway_state.json").write_text(json.dumps({ + "gateway_state": state, "timestamp": 1234567890, + })) + if with_pid: + (p / "gateway.pid").write_text(json.dumps( + {"pid": 99999, "host": "old-container"}, + )) + (p / "processes.json").write_text("[]") + return p + + +def _seed_default_root( + hermes_home: Path, + *, + state: str | None = None, + with_pid: bool = False, +) -> None: + """Populate gateway_state.json / stale runtime files at the + HERMES_HOME root (the implicit default profile).""" + if state is not None: + (hermes_home / "gateway_state.json").write_text(json.dumps({ + "gateway_state": state, "timestamp": 1234567890, + })) + if with_pid: + (hermes_home / "gateway.pid").write_text(json.dumps( + {"pid": 99999, "host": "old-container"}, + )) + (hermes_home / "processes.json").write_text("[]") + + +def _named_actions(actions: list[ReconcileAction]) -> list[ReconcileAction]: + """Drop the always-present default-profile action so tests that + only care about named profiles can assert against a clean list.""" + return [a for a in actions if a.profile != "default"] + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert _named_actions(actions) == [ReconcileAction( + profile="coder", prior_state="running", action="started", + )] + svc = scandir / "gateway-coder" + assert (svc / "run").exists() + assert (svc / "run").stat().st_mode & 0o111 # executable + assert (svc / "type").read_text().strip() == "longrun" + # Auto-start means no down-marker. + assert not (svc / "down").exists() + + +def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "writer", state="stopped") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert _named_actions(actions) == [ReconcileAction( + profile="writer", prior_state="stopped", action="registered", + )] + # down marker tells s6-svscan to NOT start the service. + assert (scandir / "gateway-writer" / "down").exists() + + +def test_startup_failed_does_not_autostart(tmp_path: Path) -> None: + """Avoid crash-loop on restart when the gateway was failing to boot.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "broken", state="startup_failed") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + named = _named_actions(actions) + assert named[0].action == "registered" + assert (scandir / "gateway-broken" / "down").exists() + + +def test_starting_state_does_not_autostart(tmp_path: Path) -> None: + """`starting` means the gateway died mid-boot last time; treat as + failed, not as a candidate for auto-restart.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "unlucky", state="starting") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + named = _named_actions(actions) + assert named[0].action == "registered" + + +def test_stale_runtime_files_are_removed(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "coder", state="running", with_pid=True) + assert (profile / "gateway.pid").exists() + assert (profile / "processes.json").exists() + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not (profile / "gateway.pid").exists() + assert not (profile / "processes.json").exists() + + +def test_profile_without_state_file_is_registered_but_not_started( + tmp_path: Path, +) -> None: + """A freshly-created profile that's never been started: register + its slot but don't auto-start.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "fresh", state=None) + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert _named_actions(actions) == [ReconcileAction( + profile="fresh", prior_state=None, action="registered", + )] + assert (scandir / "gateway-fresh" / "down").exists() + + +def test_directory_without_marker_file_is_skipped(tmp_path: Path) -> None: + """A stray dir under profiles/ that isn't actually a profile (no + SOUL.md — the marker the reconciler keys on) should be skipped.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + # Create a profile dir but without SOUL.md + (tmp_path / "profiles" / "stray").mkdir(parents=True) + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert _named_actions(actions) == [] + assert not (scandir / "gateway-stray").exists() + + +def test_corrupt_state_file_treated_as_no_prior_state(tmp_path: Path) -> None: + """If gateway_state.json is malformed JSON, don't blow up the whole + reconciliation — register the slot in the down state.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "junk", state="running") + (profile / "gateway_state.json").write_text("{ not valid json") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + named = _named_actions(actions) + assert named[0].action == "registered" # not "started" + assert (scandir / "gateway-junk" / "down").exists() + + +def test_reconcile_log_is_written(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "a", state="running") + _make_profile(tmp_path, "b", state="stopped") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + log = (tmp_path / "logs" / "container-boot.log").read_text() + assert "profile=a" in log + assert "action=started" in log + assert "profile=b" in log + assert "action=registered" in log + + +def test_reconcile_log_rotates_when_size_exceeded( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When container-boot.log exceeds _LOG_ROTATE_BYTES, the existing + file is rotated to .1 before the new entries are appended.""" + from hermes_cli import container_boot + + # Tighten the threshold so we don't have to write 256 KiB. + monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200) + + log_path = tmp_path / "logs" / "container-boot.log" + log_path.parent.mkdir() + log_path.write_text("X" * 300) # already over the threshold + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + rotated = tmp_path / "logs" / "container-boot.log.1" + assert rotated.exists(), "expected previous log to be rotated to .1" + assert rotated.read_text().startswith("X" * 300) + # The new entries land in a fresh container-boot.log (no leftover Xs). + new_contents = log_path.read_text() + assert "X" not in new_contents + assert "profile=coder" in new_contents + + +def test_reconcile_log_does_not_rotate_below_threshold( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A small existing log is appended to in place; no .1 is created.""" + from hermes_cli import container_boot + monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 10_000_000) + + log_path = tmp_path / "logs" / "container-boot.log" + log_path.parent.mkdir() + log_path.write_text("previous entry\n") + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not (tmp_path / "logs" / "container-boot.log.1").exists() + contents = log_path.read_text() + assert contents.startswith("previous entry\n") + assert "profile=coder" in contents + + +def test_reconcile_log_rotation_overwrites_existing_dot1( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Rotating again replaces the prior .1 — we keep at most one + rotated file (soft cap of ~2 × threshold).""" + from hermes_cli import container_boot + monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200) + + log_dir = tmp_path / "logs"; log_dir.mkdir() + (log_dir / "container-boot.log.1").write_text("OLD ROTATION") + (log_dir / "container-boot.log").write_text("Y" * 300) + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # .1 now contains the previous .log (Ys), not OLD ROTATION. + rotated = (log_dir / "container-boot.log.1").read_text() + assert "OLD ROTATION" not in rotated + assert rotated.startswith("Y" * 300) + + +def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "coder", state="running", with_pid=True) + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=True, + ) + + # The action list is still produced... + assert _named_actions(actions) == [ReconcileAction( + profile="coder", prior_state="running", action="started", + )] + # ...but nothing on disk was touched. + assert (profile / "gateway.pid").exists() # not removed under dry_run + assert not (scandir / "gateway-coder").exists() + assert not (tmp_path / "logs" / "container-boot.log").exists() + + +def test_missing_profiles_root_still_registers_default_slot( + tmp_path: Path, +) -> None: + """When $HERMES_HOME/profiles doesn't exist (fresh install), the + reconciliation should still register a gateway-default slot for + the root profile and return without raising. Previously this + returned an empty list; the default slot is now always present + so `hermes gateway start` (no -p) has somewhere to land.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + assert actions == [ReconcileAction( + profile="default", prior_state=None, action="registered", + )] + assert (scandir / "gateway-default").is_dir() + assert (scandir / "gateway-default" / "down").exists() + + +def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None: + """A profile dir whose name doesn't match validate_profile_name's + rules (uppercase, etc.) must surface as a hard error rather than + silently produce an invalid s6 service dir.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "BadName", state="running") + with pytest.raises(ValueError): + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + +def test_register_service_publishes_atomically(tmp_path: Path) -> None: + """The reconciler should build the new service dir in a sibling + tmp directory and rename it into place — never leaving a half- + populated slot visible to a concurrent s6-svscan rescan. + + We verify the invariant indirectly: after a clean reconcile, the + target directory exists with all required files, and no sibling + .tmp leftovers remain. (Atomic publication is the only way to + achieve both with mkdir + write.) + """ + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # No leftover tmp dir. + leftover = list(scandir.glob("*.tmp")) + assert leftover == [], f"leftover tmp directories: {leftover}" + + # Target is fully populated. + svc = scandir / "gateway-coder" + assert (svc / "type").exists() + assert (svc / "run").exists() + assert (svc / "log" / "run").exists() + + +def test_register_service_overwrites_existing_slot(tmp_path: Path) -> None: + """A second reconciliation pass cleanly replaces an existing + slot (the tmp+rename publication overwrites the previous one).""" + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "coder", state="running") + + # First pass. + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + first_run = (scandir / "gateway-coder" / "run").read_text() + + # Mutate the profile state so the run-script changes (extra_env + # rendering would differ if we wired profile config through, but + # for now just exercise the overwrite path). + (profile / "gateway_state.json").write_text( + '{"gateway_state": "stopped"}', + ) + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # Slot still exists, no .tmp remnants. + assert (scandir / "gateway-coder" / "run").read_text() == first_run + assert list(scandir.glob("*.tmp")) == [] + # Down marker now present (state went from running → stopped). + assert (scandir / "gateway-coder" / "down").exists() + + +def test_register_service_cleans_up_stale_tmp_dir(tmp_path: Path) -> None: + """If a previous interrupted run left a .tmp sibling directory, + a fresh reconcile must clean it up rather than failing on mkdir.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + # Simulate a leftover from an interrupted run. + stale_tmp = scandir / "gateway-coder.tmp" + stale_tmp.mkdir() + (stale_tmp / "stale-file").write_text("garbage") + + _make_profile(tmp_path, "coder", state="running") + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not stale_tmp.exists() + assert (scandir / "gateway-coder" / "run").exists() + + +# --------------------------------------------------------------------------- +# Default-profile slot — always registered (PR #30136 review item I1) +# --------------------------------------------------------------------------- + + +def test_default_slot_always_registered_on_empty_home(tmp_path: Path) -> None: + """Bare HERMES_HOME with nothing under it still produces a + gateway-default slot (down state).""" + scandir = tmp_path / "run-service"; scandir.mkdir() + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions == [ReconcileAction( + profile="default", prior_state=None, action="registered", + )] + svc = scandir / "gateway-default" + assert svc.is_dir() + assert (svc / "run").exists() + assert (svc / "down").exists() + + +def test_default_slot_run_script_omits_profile_flag(tmp_path: Path) -> None: + """The default slot's run script must NOT pass `-p default` — + that would resolve to $HERMES_HOME/profiles/default/ instead of + the root profile. It must call `hermes gateway run` directly.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + run = (scandir / "gateway-default" / "run").read_text() + assert "hermes gateway run" in run + assert "-p default" not in run + assert "-p 'default'" not in run + + +def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None: + """gateway_state.json at the HERMES_HOME root with state=running + means the default slot auto-starts on container boot.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="running") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + default_action = next(a for a in actions if a.profile == "default") + assert default_action.prior_state == "running" + assert default_action.action == "started" + assert not (scandir / "gateway-default" / "down").exists() + + +def test_default_slot_does_not_autostart_when_root_state_stopped( + tmp_path: Path, +) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="stopped") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + default_action = next(a for a in actions if a.profile == "default") + assert default_action.action == "registered" + assert (scandir / "gateway-default" / "down").exists() + + +def test_default_slot_does_not_autostart_when_root_state_startup_failed( + tmp_path: Path, +) -> None: + """Crash-loop guard applies to the default slot too.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="startup_failed") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + default_action = next(a for a in actions if a.profile == "default") + assert default_action.action == "registered" + + +def test_default_slot_cleans_up_stale_runtime_files_at_root( + tmp_path: Path, +) -> None: + """gateway.pid and processes.json at the HERMES_HOME root (left + over from the previous container's default gateway) must be + swept the same way as for named profiles.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="running", with_pid=True) + assert (tmp_path / "gateway.pid").exists() + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not (tmp_path / "gateway.pid").exists() + assert not (tmp_path / "processes.json").exists() + + +def test_default_slot_appears_before_named_profiles(tmp_path: Path) -> None: + """The action list is ordered: default first, then named profiles + in directory order. Operators and the boot-log reader rely on + this ordering being stable.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "z-last-alphabetically", state="stopped") + _make_profile(tmp_path, "a-first-alphabetically", state="stopped") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert [a.profile for a in actions] == [ + "default", + "a-first-alphabetically", + "z-last-alphabetically", + ] + + +def test_profiles_default_subdir_is_skipped_with_warning( + tmp_path: Path, + caplog: pytest.LogCaptureFixture, +) -> None: + """A user-created profiles/default/ collides with the reserved + root-profile slot — the named entry is skipped (with a warning) + so we don't double-register gateway-default.""" + import logging + caplog.set_level(logging.WARNING) + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "default", state="running") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # Only the root-profile default slot appears — not the colliding + # named profile. + default_actions = [a for a in actions if a.profile == "default"] + assert len(default_actions) == 1 + # And the warning surfaces so operators know the named profile + # was ignored. + assert any( + "profiles/default/" in record.message for record in caplog.records + ) diff --git a/tests/hermes_cli/test_gateway_s6_dispatch.py b/tests/hermes_cli/test_gateway_s6_dispatch.py new file mode 100644 index 00000000000..ba83c1a1187 --- /dev/null +++ b/tests/hermes_cli/test_gateway_s6_dispatch.py @@ -0,0 +1,335 @@ +"""Tests for the Phase 4 s6 dispatch helper in hermes_cli.gateway. + +`_dispatch_via_service_manager_if_s6` decides whether a +`hermes gateway start/stop/restart` invocation should be routed to +the in-container S6ServiceManager instead of falling through to the +host systemd/launchd/windows code path. +""" +from __future__ import annotations + +from typing import Any + +import pytest + + +class _CallRecorder: + """Minimal stand-in for S6ServiceManager.""" + kind = "s6" + + def __init__(self) -> None: + self.calls: list[tuple[str, str]] = [] + + def start(self, name: str) -> None: + self.calls.append(("start", name)) + + def stop(self, name: str) -> None: + self.calls.append(("stop", name)) + + def restart(self, name: str) -> None: + self.calls.append(("restart", name)) + + +def test_dispatch_returns_false_on_host(monkeypatch: pytest.MonkeyPatch) -> None: + """When the environment isn't s6 (host run), the helper must + return False and not invoke a manager — callers continue with + their existing systemd/launchd/windows path.""" + from hermes_cli import gateway as gw + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "systemd", + ) + # Should not even attempt to construct a manager. + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: pytest.fail("manager should not be constructed on host"), + ) + assert gw._dispatch_via_service_manager_if_s6("start", profile="x") is False + + +def test_dispatch_returns_true_and_calls_start_on_s6( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_via_service_manager_if_s6("start", profile="coder") is True + assert rec.calls == [("start", "gateway-coder")] + + +@pytest.mark.parametrize("action,expected", [ + ("start", "start"), + ("stop", "stop"), + ("restart", "restart"), +]) +def test_dispatch_translates_action_to_manager_method( + monkeypatch: pytest.MonkeyPatch, action: str, expected: str, +) -> None: + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_via_service_manager_if_s6(action, profile="x") is True + assert rec.calls == [(expected, "gateway-x")] + + +def test_dispatch_unknown_action_returns_false( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """An unrecognized action (e.g. 'install') must not silently + succeed — return False so the host code path handles it.""" + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_via_service_manager_if_s6("install", profile="x") is False + assert rec.calls == [] + + +def test_dispatch_defaults_profile_to_default( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When profile is None, the helper resolves it via _profile_arg(). + With no profile context set anywhere, that resolves to "default".""" + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + monkeypatch.setattr( + "hermes_cli.gateway._profile_suffix", lambda: "", + ) + assert gw._dispatch_via_service_manager_if_s6("start") is True + assert rec.calls == [("start", "gateway-default")] + + +# --------------------------------------------------------------------------- +# _dispatch_all_via_service_manager_if_s6 — --all under s6 +# --------------------------------------------------------------------------- + + +class _ListingRecorder(_CallRecorder): + """_CallRecorder that also exposes a profile list.""" + + def __init__(self, profiles: list[str]) -> None: + super().__init__() + self._profiles = profiles + + def list_profile_gateways(self) -> list[str]: + return list(self._profiles) + + +def test_dispatch_all_returns_false_on_host( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli import gateway as gw + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "systemd", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: pytest.fail("manager should not be constructed on host"), + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is False + + +def test_dispatch_all_iterates_every_profile_on_stop( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + from hermes_cli import gateway as gw + rec = _ListingRecorder(["coder", "writer", "assistant"]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is True + assert rec.calls == [ + ("stop", "gateway-coder"), + ("stop", "gateway-writer"), + ("stop", "gateway-assistant"), + ] + out = capsys.readouterr().out + assert "Stopped 3 profile gateway(s)" in out + + +def test_dispatch_all_iterates_every_profile_on_restart( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + from hermes_cli import gateway as gw + rec = _ListingRecorder(["coder", "writer"]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("restart") is True + assert rec.calls == [ + ("restart", "gateway-coder"), + ("restart", "gateway-writer"), + ] + out = capsys.readouterr().out + assert "Restarted 2 profile gateway(s)" in out + + +def test_dispatch_all_handles_partial_failure( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """A failure on one profile must not skip the others; the helper + reports each failure and the success count.""" + from hermes_cli import gateway as gw + + class _FailOnWriter(_ListingRecorder): + def stop(self, name: str) -> None: + if name == "gateway-writer": + raise RuntimeError("supervise FIFO permission denied") + super().stop(name) + + rec = _FailOnWriter(["coder", "writer", "assistant"]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is True + # The two successful ones were called; writer raised before recording. + assert ("stop", "gateway-coder") in rec.calls + assert ("stop", "gateway-assistant") in rec.calls + assert ("stop", "gateway-writer") not in rec.calls + out = capsys.readouterr().out + assert "Stopped 2 profile gateway(s)" in out + assert "Could not stop gateway-writer" in out + assert "supervise FIFO permission denied" in out + + +def test_dispatch_all_empty_list_reports_and_returns_true( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """With no profile gateways registered the helper still claims the + dispatch (returns True) and prints a friendly message — the host + fallback would just pkill nothing, which isn't useful inside a + container.""" + from hermes_cli import gateway as gw + rec = _ListingRecorder([]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is True + assert rec.calls == [] + assert "No profile gateways" in capsys.readouterr().out + + +def test_dispatch_all_unknown_action_returns_false( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """`start --all` is not a supported CLI surface; the helper must + fall through to the host code path rather than no-op.""" + from hermes_cli import gateway as gw + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: pytest.fail( + "manager should not be constructed for unsupported --all action", + ), + ) + assert gw._dispatch_all_via_service_manager_if_s6("start") is False + + +# --------------------------------------------------------------------------- +# Friendly error rendering — GatewayNotRegisteredError / S6CommandError +# (PR #30136 review item I2) +# --------------------------------------------------------------------------- + + +def test_dispatch_renders_gateway_not_registered_friendly( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """`hermes -p typo gateway start` should print a clear message and + exit 1 — not dump a traceback at the user.""" + from hermes_cli import gateway as gw + from hermes_cli.service_manager import GatewayNotRegisteredError + + class _RaisesMissing: + kind = "s6" + + def start(self, name: str) -> None: + raise GatewayNotRegisteredError("typo") + + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: _RaisesMissing(), + ) + + with pytest.raises(SystemExit) as excinfo: + gw._dispatch_via_service_manager_if_s6("start", profile="typo") + assert excinfo.value.code == 1 + out = capsys.readouterr().out + assert "no such gateway 'typo'" in out + assert "hermes profile create typo" in out + # And critically: no traceback prefix. + assert "Traceback" not in out + + +def test_dispatch_renders_s6_command_error_friendly( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """An s6-svc failure (e.g. EACCES on the supervise FIFO) should + surface the stderr inline, not as an opaque traceback.""" + from hermes_cli import gateway as gw + from hermes_cli.service_manager import S6CommandError + + class _RaisesS6Error: + kind = "s6" + + def start(self, name: str) -> None: + raise S6CommandError( + service=name, + action="start", + returncode=111, + stderr="s6-svc: fatal: Permission denied", + ) + + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: _RaisesS6Error(), + ) + + with pytest.raises(SystemExit) as excinfo: + gw._dispatch_via_service_manager_if_s6("start", profile="coder") + assert excinfo.value.code == 1 + out = capsys.readouterr().out + assert "rc=111" in out + assert "Permission denied" in out + assert "Traceback" not in out diff --git a/tests/hermes_cli/test_profiles_s6_hooks.py b/tests/hermes_cli/test_profiles_s6_hooks.py new file mode 100644 index 00000000000..db50debdcba --- /dev/null +++ b/tests/hermes_cli/test_profiles_s6_hooks.py @@ -0,0 +1,210 @@ +"""Tests for the Phase 4 s6 hooks in hermes_cli.profiles. + +Specifically: _maybe_register_gateway_service, +_maybe_unregister_gateway_service. The integration with +create_profile and delete_profile is covered indirectly by the +existing TestCreateProfile and TestDeleteProfile classes in +tests/hermes_cli/test_profiles.py; here we only exercise the new +helper surface that doesn't touch the filesystem. +""" +from __future__ import annotations + +from typing import Any + +import pytest + +from hermes_cli.profiles import ( + _maybe_register_gateway_service, + _maybe_unregister_gateway_service, +) + + +# --------------------------------------------------------------------------- +# _maybe_register_gateway_service / _maybe_unregister_gateway_service +# --------------------------------------------------------------------------- + + +class _HostManager: + """Mimics a host backend that doesn't support runtime registration.""" + kind = "systemd" + + def supports_runtime_registration(self) -> bool: + return False + + def register_profile_gateway(self, *args: Any, **kwargs: Any) -> None: + raise AssertionError("host backend register_profile_gateway should not be called") + + def unregister_profile_gateway(self, *args: Any, **kwargs: Any) -> None: + raise AssertionError("host backend unregister_profile_gateway should not be called") + + +class _S6Manager: + """Mimics S6ServiceManager just enough for the hooks.""" + kind = "s6" + + def __init__(self) -> None: + self.registered: list[str] = [] + self.unregistered: list[str] = [] + self.raise_on_register: Exception | None = None + self.raise_on_unregister: Exception | None = None + + def supports_runtime_registration(self) -> bool: + return True + + def register_profile_gateway( + self, profile: str, *, + extra_env: dict[str, str] | None = None, + ) -> None: + if self.raise_on_register is not None: + raise self.raise_on_register + self.registered.append(profile) + + def unregister_profile_gateway(self, profile: str) -> None: + if self.raise_on_unregister is not None: + raise self.raise_on_unregister + self.unregistered.append(profile) + + +def _patch_detect_s6(monkeypatch: pytest.MonkeyPatch) -> None: + """Pretend we're inside an s6 container so the host short-circuit + in :func:`_maybe_register_gateway_service` / + :func:`_maybe_unregister_gateway_service` doesn't fire. + + Without this, ``detect_service_manager()`` runs its real + implementation (host Linux/macOS in CI), returns ``"systemd"`` or + ``"launchd"``, and the hooks return early before reaching the + patched ``get_service_manager``. Each s6-call-through test + explicitly opts into this so the host-no-op tests can still + exercise the early-return path. + """ + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", + lambda: "s6", + ) + + +def test_register_noop_on_host(monkeypatch: pytest.MonkeyPatch) -> None: + # NOTE: deliberately DO NOT patch detect_service_manager — we want + # the real host detection to kick in and short-circuit before + # get_service_manager is ever called. The lambda below is a + # defense-in-depth assertion that get_service_manager is never + # reached on host. + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: _HostManager(), + ) + # Should NOT raise the AssertionError from _HostManager.register + _maybe_register_gateway_service("hostprof") + + +def test_register_calls_through_on_s6(monkeypatch: pytest.MonkeyPatch) -> None: + _patch_detect_s6(monkeypatch) + mgr = _S6Manager() + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_register_gateway_service("coder") + assert mgr.registered == ["coder"] + + +def test_register_swallows_duplicate_value_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A pre-existing s6 registration (from container-boot reconcile) + is a benign condition — register must not propagate ValueError.""" + _patch_detect_s6(monkeypatch) + mgr = _S6Manager() + mgr.raise_on_register = ValueError("already registered") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + # Should NOT raise + _maybe_register_gateway_service("coder") + + +def test_register_swallows_arbitrary_error( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], +) -> None: + """Even an unexpected exception from the manager must not bring + down `hermes profile create` — print and continue.""" + _patch_detect_s6(monkeypatch) + mgr = _S6Manager() + mgr.raise_on_register = RuntimeError("svscanctl exploded") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_register_gateway_service("coder") + captured = capsys.readouterr() + assert "Could not register" in captured.out + + +def test_register_swallows_no_backend_runtime_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When `get_service_manager()` raises RuntimeError (no backend + detected), the hook must silently no-op.""" + _patch_detect_s6(monkeypatch) + def _no_backend() -> None: + raise RuntimeError("no supported service manager detected") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", _no_backend, + ) + # Should NOT raise + _maybe_register_gateway_service("anywhere") + + +def test_register_silent_when_detect_throws( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], +) -> None: + """If detect_service_manager itself raises (e.g. a partial s6 + install on a host machine), the hook must stay silent — no + confusing s6 warning printed to a user who has never touched a + container.""" + def _broken_detect() -> str: + raise RuntimeError("detection blew up") + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", _broken_detect, + ) + # If get_service_manager is reached, the test will assert via + # _HostManager.register. It must NOT be reached. + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: _HostManager(), + ) + _maybe_register_gateway_service("anywhere") + captured = capsys.readouterr() + assert "Could not register" not in captured.out + assert captured.out == "" + + +def test_unregister_noop_on_host(monkeypatch: pytest.MonkeyPatch) -> None: + # Same as test_register_noop_on_host: rely on real host detection. + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: _HostManager(), + ) + _maybe_unregister_gateway_service("hostprof") + + +def test_unregister_calls_through_on_s6(monkeypatch: pytest.MonkeyPatch) -> None: + _patch_detect_s6(monkeypatch) + mgr = _S6Manager() + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_unregister_gateway_service("coder") + assert mgr.unregistered == ["coder"] + + +def test_unregister_swallows_errors( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], +) -> None: + _patch_detect_s6(monkeypatch) + mgr = _S6Manager() + mgr.raise_on_unregister = RuntimeError("svc gone weird") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_unregister_gateway_service("coder") + captured = capsys.readouterr() + assert "Could not unregister" in captured.out diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py new file mode 100644 index 00000000000..cd5761bb049 --- /dev/null +++ b/tests/hermes_cli/test_service_manager.py @@ -0,0 +1,793 @@ +"""Tests for hermes_cli.service_manager — the abstract ServiceManager +protocol, the detect_service_manager() entry point, and the host-side +adapter wrappers (Systemd / Launchd / Windows). + +The s6 backend is added in Phase 3; its tests live alongside the +implementation in this same file once that phase ships. +""" +from __future__ import annotations + +import pytest + +from hermes_cli.service_manager import ( + LaunchdServiceManager, + S6ServiceManager, + ServiceManager, + ServiceManagerKind, + SystemdServiceManager, + WindowsServiceManager, + detect_service_manager, + get_service_manager, + validate_profile_name, +) + + +# --------------------------------------------------------------------------- +# validate_profile_name +# --------------------------------------------------------------------------- + + +def test_validate_profile_name_accepts_valid_names() -> None: + # Smoke: known-good names should not raise. + validate_profile_name("coder") + validate_profile_name("my-profile") + validate_profile_name("assistant_v2") + validate_profile_name("a") + validate_profile_name("0") + validate_profile_name("0abc") + + +@pytest.mark.parametrize( + "bad", + [ + "", # empty + "Coder", # uppercase + "foo/bar", # path traversal + "../escape", # path traversal + "-leading-dash", # leading dash (s6 reads as a flag) + "_leading_underscore", # leading underscore + "name with spaces", # whitespace + "name.with.dots", # punctuation + "a" * 252, # too long + ], +) +def test_validate_profile_name_rejects_invalid(bad: str) -> None: + with pytest.raises(ValueError): + validate_profile_name(bad) + + +# --------------------------------------------------------------------------- +# detect_service_manager +# --------------------------------------------------------------------------- + + +def test_detect_service_manager_returns_known_value() -> None: + """Without mocking, the function must still return one of the + advertised literals — anything else means a new platform branch + was added without updating ServiceManagerKind.""" + result = detect_service_manager() + assert result in ("systemd", "launchd", "windows", "s6", "none") + + +# --------------------------------------------------------------------------- +# _s6_running — must work for unprivileged users, not just root +# --------------------------------------------------------------------------- + + +def _patch_s6_paths( + monkeypatch: pytest.MonkeyPatch, + *, + comm: str | OSError | None, + basedir_is_dir: bool, +) -> None: + """Stub /proc/1/comm and /run/s6/basedir for _s6_running tests.""" + from pathlib import Path as _Path + + real_read_text = _Path.read_text + real_is_dir = _Path.is_dir + + def fake_read_text(self, *args, **kwargs): # type: ignore[override] + if str(self) == "/proc/1/comm": + if isinstance(comm, OSError): + raise comm + if comm is None: + raise FileNotFoundError(2, "No such file or directory") + return comm + "\n" + return real_read_text(self, *args, **kwargs) + + def fake_is_dir(self): # type: ignore[override] + if str(self) == "/run/s6/basedir": + return basedir_is_dir + return real_is_dir(self) + + monkeypatch.setattr(_Path, "read_text", fake_read_text) + monkeypatch.setattr(_Path, "is_dir", fake_is_dir) + + +def test_s6_running_true_when_comm_and_basedir_match( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli.service_manager import _s6_running + + _patch_s6_paths(monkeypatch, comm="s6-svscan", basedir_is_dir=True) + assert _s6_running() is True + + +def test_s6_running_false_when_comm_is_wrong( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli.service_manager import _s6_running + + # systemd as PID 1, basedir present from some stray s6 install + _patch_s6_paths(monkeypatch, comm="systemd", basedir_is_dir=True) + assert _s6_running() is False + + +def test_s6_running_false_when_basedir_missing( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli.service_manager import _s6_running + + # The comm matches but the basedir is missing — e.g. an unrelated + # process happens to be named "s6-svscan" + _patch_s6_paths(monkeypatch, comm="s6-svscan", basedir_is_dir=False) + assert _s6_running() is False + + +def test_s6_running_false_when_comm_unreadable( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Regression: /proc/1/exe was unreadable to UID 10000 and + resolve() silently returned the unresolved path, making detection + always-False inside the container under the hermes user. The new + probe must FAIL CLOSED — not raise — when /proc/1/comm can't be + read. + """ + from hermes_cli.service_manager import _s6_running + + _patch_s6_paths( + monkeypatch, + comm=PermissionError(13, "Permission denied"), + basedir_is_dir=True, + ) + assert _s6_running() is False + + +def test_s6_running_handles_missing_proc( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """On macOS / Windows / WSL-without-procfs, /proc/1/comm doesn't + exist. Must return False, not raise.""" + from hermes_cli.service_manager import _s6_running + + _patch_s6_paths(monkeypatch, comm=None, basedir_is_dir=False) + assert _s6_running() is False + + +# --------------------------------------------------------------------------- +# Backend wrappers — kind + registration unsupported on hosts +# --------------------------------------------------------------------------- + + +def test_systemd_manager_kind_and_registration_unsupported() -> None: + mgr = SystemdServiceManager() + assert mgr.kind == "systemd" + assert mgr.supports_runtime_registration() is False + with pytest.raises(NotImplementedError): + mgr.register_profile_gateway("foo") + with pytest.raises(NotImplementedError): + mgr.unregister_profile_gateway("foo") + assert mgr.list_profile_gateways() == [] + # Protocol conformance — runtime_checkable lets us assert this. + assert isinstance(mgr, ServiceManager) + + +def test_launchd_manager_kind_and_registration_unsupported() -> None: + mgr = LaunchdServiceManager() + assert mgr.kind == "launchd" + assert mgr.supports_runtime_registration() is False + with pytest.raises(NotImplementedError): + mgr.register_profile_gateway("foo") + assert mgr.list_profile_gateways() == [] + assert isinstance(mgr, ServiceManager) + + +def test_windows_manager_kind_and_registration_unsupported() -> None: + mgr = WindowsServiceManager() + assert mgr.kind == "windows" + assert mgr.supports_runtime_registration() is False + with pytest.raises(NotImplementedError): + mgr.register_profile_gateway("foo") + assert isinstance(mgr, ServiceManager) + + +# --------------------------------------------------------------------------- +# Lifecycle delegation — wrappers must call through to module-level fns +# --------------------------------------------------------------------------- + + +def test_systemd_manager_lifecycle_delegates(monkeypatch: pytest.MonkeyPatch) -> None: + called: list[str] = [] + monkeypatch.setattr( + "hermes_cli.gateway.systemd_start", lambda: called.append("start"), + ) + monkeypatch.setattr( + "hermes_cli.gateway.systemd_stop", lambda: called.append("stop"), + ) + monkeypatch.setattr( + "hermes_cli.gateway.systemd_restart", lambda: called.append("restart"), + ) + monkeypatch.setattr( + "hermes_cli.gateway._probe_systemd_service_running", + lambda *a, **kw: (False, True), + ) + mgr = SystemdServiceManager() + mgr.start("ignored") + mgr.stop("ignored") + mgr.restart("ignored") + assert called == ["start", "stop", "restart"] + assert mgr.is_running("ignored") is True + + +def test_launchd_manager_lifecycle_delegates(monkeypatch: pytest.MonkeyPatch) -> None: + called: list[str] = [] + monkeypatch.setattr( + "hermes_cli.gateway.launchd_start", lambda: called.append("start"), + ) + monkeypatch.setattr( + "hermes_cli.gateway.launchd_stop", lambda: called.append("stop"), + ) + monkeypatch.setattr( + "hermes_cli.gateway.launchd_restart", lambda: called.append("restart"), + ) + monkeypatch.setattr( + "hermes_cli.gateway._probe_launchd_service_running", lambda: False, + ) + mgr = LaunchdServiceManager() + mgr.start("ignored") + mgr.stop("ignored") + mgr.restart("ignored") + assert called == ["start", "stop", "restart"] + assert mgr.is_running("ignored") is False + + +def test_windows_manager_lifecycle_delegates(monkeypatch: pytest.MonkeyPatch) -> None: + called: list[str] = [] + # Force-import the submodule so monkeypatch's attribute lookup + # against the `hermes_cli` package succeeds — gateway_windows is + # imported lazily inside the wrapper and may not yet be loaded. + import hermes_cli.gateway_windows # noqa: F401 + + class _FakeWindowsModule: + @staticmethod + def start() -> None: called.append("start") + @staticmethod + def stop() -> None: called.append("stop") + @staticmethod + def restart() -> None: called.append("restart") + @staticmethod + def is_installed() -> bool: return True + + monkeypatch.setattr("hermes_cli.gateway_windows", _FakeWindowsModule) + monkeypatch.setattr( + "hermes_cli.gateway.find_gateway_pids", + lambda **kw: [12345], + ) + mgr = WindowsServiceManager() + mgr.start("ignored") + mgr.stop("ignored") + mgr.restart("ignored") + assert called == ["start", "stop", "restart"] + assert mgr.is_running("ignored") is True + + +def test_windows_manager_is_running_false_when_not_installed( + monkeypatch: pytest.MonkeyPatch, +) -> None: + import hermes_cli.gateway_windows # noqa: F401 + + class _FakeWindowsModule: + @staticmethod + def is_installed() -> bool: return False + + monkeypatch.setattr("hermes_cli.gateway_windows", _FakeWindowsModule) + monkeypatch.setattr( + "hermes_cli.gateway.find_gateway_pids", + lambda **kw: [12345], # PIDs would otherwise vote "running" + ) + assert WindowsServiceManager().is_running("ignored") is False + + +def test_windows_manager_install_forwards_kwargs(monkeypatch: pytest.MonkeyPatch) -> None: + captured: dict[str, object] = {} + import hermes_cli.gateway_windows # noqa: F401 + + class _FakeWindowsModule: + @staticmethod + def install(*, force, start_now, start_on_login, elevated_handoff) -> None: + captured["force"] = force + captured["start_now"] = start_now + captured["start_on_login"] = start_on_login + captured["elevated_handoff"] = elevated_handoff + + monkeypatch.setattr("hermes_cli.gateway_windows", _FakeWindowsModule) + WindowsServiceManager().install( + force=True, start_now=True, start_on_login=False, elevated_handoff=True, + ) + assert captured == { + "force": True, + "start_now": True, + "start_on_login": False, + "elevated_handoff": True, + } + + +# --------------------------------------------------------------------------- +# get_service_manager factory +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "kind,cls", + [ + ("systemd", SystemdServiceManager), + ("launchd", LaunchdServiceManager), + ("windows", WindowsServiceManager), + ], +) +def test_get_service_manager_returns_correct_backend( + monkeypatch: pytest.MonkeyPatch, + kind: ServiceManagerKind, + cls: type, +) -> None: + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: kind, + ) + assert isinstance(get_service_manager(), cls) + + +def test_get_service_manager_raises_when_unsupported( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "none", + ) + with pytest.raises(RuntimeError, match="no supported service manager"): + get_service_manager() + + +def test_get_service_manager_returns_s6_instance( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """The s6 backend ships in Phase 3 — the factory must return an + S6ServiceManager when running inside a container.""" + from hermes_cli.service_manager import S6ServiceManager + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + assert isinstance(get_service_manager(), S6ServiceManager) + + +# --------------------------------------------------------------------------- +# S6ServiceManager — unit tests against a tmp-path scandir (no real s6) +# --------------------------------------------------------------------------- + + +@pytest.fixture +def s6_scandir(tmp_path): + """Empty scandir for the S6ServiceManager tests.""" + d = tmp_path / "service" + d.mkdir() + return d + + +@pytest.fixture +def fake_subprocess_run(monkeypatch: pytest.MonkeyPatch): + """Capture subprocess.run calls + always return success. Lets the + S6ServiceManager tests run on hosts that don't have s6-svc / + s6-svscanctl installed. + + Records are normalized: leading ``/command/`` is stripped from + cmd[0] so assertions can match on the bare s6-svc / s6-svstat / + s6-svscanctl name regardless of whether the manager calls them + via absolute path or bare name.""" + calls: list[list[str]] = [] + + def _fake(cmd, **kw): + import subprocess as _sp + seq = list(cmd) if isinstance(cmd, (list, tuple)) else [str(cmd)] + if seq and seq[0].startswith("/command/"): + seq[0] = seq[0][len("/command/"):] + calls.append(seq) + return _sp.CompletedProcess(cmd, 0, "", "") + + monkeypatch.setattr("subprocess.run", _fake) + return calls + + +def test_s6_manager_kind_and_supports_registration() -> None: + from hermes_cli.service_manager import S6ServiceManager + mgr = S6ServiceManager() + assert mgr.kind == "s6" + assert mgr.supports_runtime_registration() is True + + +# --------------------------------------------------------------------------- +# _seed_supervise_skeleton — unit tests +# --------------------------------------------------------------------------- +# +# The skeleton helper pre-creates the dirs and FIFOs that s6-supervise +# would otherwise create as root mode 0700, locking out the +# unprivileged hermes user from every lifecycle op. These tests run +# against tmp_path and assert the produced layout — the live-container +# verification (against real s6-svc / s6-svstat) lives in +# tests/docker/test_s6_profile_gateway_integration.py. + + +def test_seed_supervise_skeleton_creates_expected_layout(tmp_path) -> None: + """Verifies the dirs + FIFO + modes the helper lays down.""" + import stat + + from hermes_cli.service_manager import _seed_supervise_skeleton + + svc_dir = tmp_path / "gateway-foo" + svc_dir.mkdir() + + _seed_supervise_skeleton(svc_dir) + + # Top-level event/ — s6-svlisten1 event subscription dir. + event = svc_dir / "event" + assert event.is_dir(), "missing top-level event/" + assert stat.S_IMODE(event.stat().st_mode) == 0o3730, ( + f"event/ mode = {oct(event.stat().st_mode)}, want 03730" + ) + + # supervise/ dir. + supervise = svc_dir / "supervise" + assert supervise.is_dir(), "missing supervise/" + assert stat.S_IMODE(supervise.stat().st_mode) == 0o755 + + # supervise/event/. + supervise_event = supervise / "event" + assert supervise_event.is_dir(), "missing supervise/event/" + assert stat.S_IMODE(supervise_event.stat().st_mode) == 0o3730 + + # supervise/control FIFO. + control = supervise / "control" + assert control.exists(), "missing supervise/control FIFO" + assert stat.S_ISFIFO(control.stat().st_mode), ( + "supervise/control must be a FIFO" + ) + assert stat.S_IMODE(control.stat().st_mode) == 0o660 + + +def test_seed_supervise_skeleton_handles_log_subservice(tmp_path) -> None: + """When a log/ subdir exists, its supervise tree also gets seeded. + + Without this, ``unregister_profile_gateway``'s rmtree would EACCES + on the logger's root-owned supervise dir even after the parent + slot's supervise/ was hermes-owned. + """ + import stat + + from hermes_cli.service_manager import _seed_supervise_skeleton + + svc_dir = tmp_path / "gateway-foo" + svc_dir.mkdir() + (svc_dir / "log").mkdir() # logger subdir present + + _seed_supervise_skeleton(svc_dir) + + # Logger's own supervise tree is seeded the same way. + log_event = svc_dir / "log" / "event" + log_supervise = svc_dir / "log" / "supervise" + log_supervise_event = log_supervise / "event" + log_control = log_supervise / "control" + + assert log_event.is_dir() + assert stat.S_IMODE(log_event.stat().st_mode) == 0o3730 + assert log_supervise.is_dir() + assert log_supervise_event.is_dir() + assert log_control.exists() and stat.S_ISFIFO(log_control.stat().st_mode) + + +def test_seed_supervise_skeleton_skips_when_no_log_subservice(tmp_path) -> None: + """If log/ isn't present, no logger skeleton is created.""" + from hermes_cli.service_manager import _seed_supervise_skeleton + + svc_dir = tmp_path / "gateway-foo" + svc_dir.mkdir() + + _seed_supervise_skeleton(svc_dir) + + assert not (svc_dir / "log").exists(), ( + "helper must not synthesize a log/ subdir on its own" + ) + + +def test_seed_supervise_skeleton_is_idempotent(tmp_path) -> None: + """Calling the helper twice on the same dir is a no-op the second time. + + Important because s6-supervise may have already opened the FIFO + when a re-register / reconcile happens; double-creation would + error out. The helper short-circuits on existence. + """ + from hermes_cli.service_manager import _seed_supervise_skeleton + + svc_dir = tmp_path / "gateway-foo" + svc_dir.mkdir() + + _seed_supervise_skeleton(svc_dir) + _seed_supervise_skeleton(svc_dir) # must not raise + + +def test_s6_register_creates_service_dir_and_triggers_scan( + s6_scandir, fake_subprocess_run, +) -> None: + from hermes_cli.service_manager import S6ServiceManager + mgr = S6ServiceManager(scandir=s6_scandir) + mgr.register_profile_gateway("coder") + + svc_dir = s6_scandir / "gateway-coder" + assert svc_dir.is_dir() + assert (svc_dir / "type").read_text().strip() == "longrun" + + run_path = svc_dir / "run" + assert run_path.is_file() + assert run_path.stat().st_mode & 0o111 # executable + run_text = run_path.read_text() + assert "hermes -p coder gateway run" in run_text + assert "s6-setuidgid hermes" in run_text + + log_run = svc_dir / "log" / "run" + assert log_run.is_file() + log_text = log_run.read_text() + # CRITICAL: HERMES_HOME must be a runtime env-var expansion, NOT + # a Python-substituted absolute path. Negative-assert the wrong + # form so future regressions are caught. + assert "$HERMES_HOME" in log_text + assert "logs/gateways/coder" in log_text + assert "/opt/data/logs/gateways/coder" not in log_text, ( + "log_dir was hard-coded; must use ${HERMES_HOME} at run time" + ) + + # s6-svscanctl -a was invoked against the scandir + assert any( + cmd[0] == "s6-svscanctl" and "-a" in cmd + and str(s6_scandir) in cmd + for cmd in fake_subprocess_run + ), f"s6-svscanctl -a not invoked; saw: {fake_subprocess_run}" + + +def test_s6_register_extra_env_is_quoted(s6_scandir, fake_subprocess_run) -> None: + from hermes_cli.service_manager import S6ServiceManager + mgr = S6ServiceManager(scandir=s6_scandir) + mgr.register_profile_gateway( + "x", extra_env={"FOO": "bar baz", "QUOTED": "a'b"}, + ) + run_text = (s6_scandir / "gateway-x" / "run").read_text() + # shlex.quote should have wrapped both values + assert "export FOO='bar baz'" in run_text + assert "export QUOTED='a'\"'\"'b'" in run_text + + +def test_s6_register_rejects_invalid_profile_name(s6_scandir) -> None: + from hermes_cli.service_manager import S6ServiceManager + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(ValueError): + mgr.register_profile_gateway("Bad/Name") + + +def test_s6_register_rejects_duplicate(s6_scandir, fake_subprocess_run) -> None: + from hermes_cli.service_manager import S6ServiceManager + mgr = S6ServiceManager(scandir=s6_scandir) + (s6_scandir / "gateway-coder").mkdir(parents=True) + with pytest.raises(ValueError, match="already registered"): + mgr.register_profile_gateway("coder") + + +def test_s6_register_rolls_back_on_svscanctl_failure( + s6_scandir, monkeypatch: pytest.MonkeyPatch, +) -> None: + """If s6-svscanctl fails the service dir must be cleaned up so the + next register call doesn't see a stale duplicate.""" + import subprocess as _sp + from hermes_cli.service_manager import S6ServiceManager + + def _fail_scanctl(cmd, **kw): + # Manager calls s6-svscanctl by absolute path; match on basename. + if cmd[0].endswith("/s6-svscanctl"): + return _sp.CompletedProcess(cmd, 1, "", "rescan failed") + return _sp.CompletedProcess(cmd, 0, "", "") + monkeypatch.setattr("subprocess.run", _fail_scanctl) + + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(RuntimeError, match="s6-svscanctl failed"): + mgr.register_profile_gateway("coder") + assert not (s6_scandir / "gateway-coder").exists() + + +def test_s6_unregister_removes_service_dir( + s6_scandir, fake_subprocess_run, +) -> None: + from hermes_cli.service_manager import S6ServiceManager + svc_dir = s6_scandir / "gateway-coder" + svc_dir.mkdir(parents=True) + (svc_dir / "type").write_text("longrun\n") + + mgr = S6ServiceManager(scandir=s6_scandir) + mgr.unregister_profile_gateway("coder") + + # s6-svc -d was issued + assert any( + cmd[0] == "s6-svc" and "-d" in cmd + for cmd in fake_subprocess_run + ) + # Service dir was removed + assert not svc_dir.exists() + # Rescan was triggered + assert any(cmd[0] == "s6-svscanctl" for cmd in fake_subprocess_run) + + +def test_s6_unregister_absent_profile_is_noop(s6_scandir) -> None: + from hermes_cli.service_manager import S6ServiceManager + # Should NOT raise even though "ghost" doesn't exist + S6ServiceManager(scandir=s6_scandir).unregister_profile_gateway("ghost") + + +def test_s6_list_profile_gateways(s6_scandir) -> None: + from hermes_cli.service_manager import S6ServiceManager + # Three gateway profiles + one unrelated service + one hidden dir + (s6_scandir / "gateway-coder").mkdir() + (s6_scandir / "gateway-assistant").mkdir() + (s6_scandir / "gateway-writer").mkdir() + (s6_scandir / "s6-linux-init-shutdownd").mkdir() # filtered out + (s6_scandir / ".lock").mkdir() # filtered out (hidden) + + profiles = sorted(S6ServiceManager(scandir=s6_scandir).list_profile_gateways()) + assert profiles == ["assistant", "coder", "writer"] + + +def test_s6_list_profile_gateways_empty_when_scandir_missing(tmp_path) -> None: + from hermes_cli.service_manager import S6ServiceManager + missing = tmp_path / "does-not-exist" + assert S6ServiceManager(scandir=missing).list_profile_gateways() == [] + + +def test_s6_lifecycle_dispatches_to_s6_svc( + s6_scandir, fake_subprocess_run, +) -> None: + from hermes_cli.service_manager import S6ServiceManager + mgr = S6ServiceManager(scandir=s6_scandir) + # _run_svc now verifies the slot exists before invoking s6-svc, so + # we have to pre-seed the dir. In real use the slot is created by + # register_profile_gateway or the cont-init.d reconciler. + (s6_scandir / "gateway-coder").mkdir() + mgr.start("gateway-coder") + mgr.stop("gateway-coder") + mgr.restart("gateway-coder") + + flags = [c[1] for c in fake_subprocess_run if c[0] == "s6-svc"] + assert flags == ["-u", "-d", "-t"] + + +# --------------------------------------------------------------------------- +# Lifecycle errors — friendly messages, not raw CalledProcessError +# --------------------------------------------------------------------------- + + +def test_lifecycle_raises_gateway_not_registered_for_missing_slot( + s6_scandir, fake_subprocess_run, +) -> None: + """When the service slot doesn't exist, the lifecycle methods + must raise GatewayNotRegisteredError BEFORE invoking s6-svc, so + the user sees a clear 'no such gateway' message instead of an + opaque CalledProcessError stacktrace.""" + from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6ServiceManager, + ) + + mgr = S6ServiceManager(scandir=s6_scandir) + # No gateway-typo/ directory exists — slot is missing. + with pytest.raises(GatewayNotRegisteredError) as excinfo: + mgr.start("gateway-typo") + assert excinfo.value.profile == "typo" + assert excinfo.value.service == "gateway-typo" + msg = str(excinfo.value) + assert "'typo'" in msg + assert "hermes profile create typo" in msg + # And critically: s6-svc was NOT invoked. + assert not any(c[0] == "s6-svc" for c in fake_subprocess_run) + + +@pytest.mark.parametrize("action,method_name", [ + ("start", "start"), + ("stop", "stop"), + ("restart", "restart"), +]) +def test_all_lifecycle_methods_check_for_missing_slot( + s6_scandir, + fake_subprocess_run, + action: str, + method_name: str, +) -> None: + """start/stop/restart all check for missing slots the same way.""" + from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6ServiceManager, + ) + + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(GatewayNotRegisteredError): + getattr(mgr, method_name)("gateway-absent") + + +def test_gateway_not_registered_unprefixed_service_name(s6_scandir) -> None: + """If the caller passes a name without the 'gateway-' prefix (the + Protocol allows arbitrary service names), the error still carries + that name verbatim as the 'profile' so error messages don't + accidentally strip user-provided text.""" + from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6ServiceManager, + ) + + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(GatewayNotRegisteredError) as excinfo: + mgr.start("not-prefixed") + assert excinfo.value.profile == "not-prefixed" + + +def test_lifecycle_raises_s6_command_error_on_subprocess_failure( + s6_scandir, monkeypatch: pytest.MonkeyPatch, +) -> None: + """When s6-svc itself fails (non-zero exit) — e.g. EACCES on the + supervise control FIFO — the lifecycle methods translate the + CalledProcessError into a named S6CommandError carrying the + return code and stderr.""" + import subprocess as _sp + from hermes_cli.service_manager import S6CommandError, S6ServiceManager + + # Pre-create the slot so we reach the s6-svc call. + (s6_scandir / "gateway-coder").mkdir() + + def _fail(cmd, **kw): + raise _sp.CalledProcessError( + returncode=111, + cmd=cmd, + stderr="s6-svc: fatal: unable to control supervise/control: " + "Permission denied\n", + ) + monkeypatch.setattr("subprocess.run", _fail) + + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(S6CommandError) as excinfo: + mgr.start("gateway-coder") + assert excinfo.value.service == "gateway-coder" + assert excinfo.value.action == "start" + assert excinfo.value.returncode == 111 + assert "Permission denied" in excinfo.value.stderr + assert "Permission denied" in str(excinfo.value) + assert "rc=111" in str(excinfo.value) + + +def test_s6_is_running_parses_svstat( + s6_scandir, monkeypatch: pytest.MonkeyPatch, +) -> None: + import subprocess as _sp + from hermes_cli.service_manager import S6ServiceManager + + def _svstat(cmd, **kw): + if cmd[0].endswith("/s6-svstat"): + return _sp.CompletedProcess(cmd, 0, "up (pid 42) 17 seconds\n", "") + return _sp.CompletedProcess(cmd, 0, "", "") + monkeypatch.setattr("subprocess.run", _svstat) + assert S6ServiceManager(scandir=s6_scandir).is_running("gateway-coder") is True + + def _svstat_down(cmd, **kw): + if cmd[0].endswith("/s6-svstat"): + return _sp.CompletedProcess(cmd, 0, "down 5 seconds\n", "") + return _sp.CompletedProcess(cmd, 0, "", "") + monkeypatch.setattr("subprocess.run", _svstat_down) + assert S6ServiceManager(scandir=s6_scandir).is_running("gateway-coder") is False diff --git a/tests/tools/test_docker_environment.py b/tests/tools/test_docker_environment.py index cd3b7aae6f6..439d59bd76c 100644 --- a/tests/tools/test_docker_environment.py +++ b/tests/tools/test_docker_environment.py @@ -385,18 +385,19 @@ def test_normalize_env_dict_rejects_complex_values(): assert result == {"GOOD": "string"} -def test_security_args_include_setuid_setgid_for_gosu_drop(monkeypatch): +def test_security_args_include_setuid_setgid_for_privdrop(monkeypatch): """The default (run_as_host_user=False) invocation must include SETUID and - SETGID caps so the image entrypoint can drop from root to the non-root - `hermes` user via gosu. + SETGID caps so the image's init can drop from root to a non-root user + (e.g. via ``s6-setuidgid`` in the bundled Hermes image, or ``gosu``/``su`` + in user-provided images). - Without these caps gosu exits with - ``error: failed switching to 'hermes': operation not permitted`` - and the container exits immediately (exit 1) before running any work. + Without these caps the privilege-drop helper fails with + ``operation not permitted`` and the container exits immediately (exit 1) + before running any work. - `no-new-privileges` is kept, so gosu still cannot escalate back to root - after the drop — the drop is a one-way transition performed before the - `no_new_privs` bit is enforced on the exec boundary. + ``no-new-privileges`` is kept, so the dropped process still cannot + escalate back to root after the drop — the drop is a one-way transition + performed before the ``no_new_privs`` bit is enforced on the exec boundary. """ monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") calls = _mock_subprocess_run(monkeypatch) @@ -412,8 +413,8 @@ def test_security_args_include_setuid_setgid_for_gosu_drop(monkeypatch): for i, flag in enumerate(run_args[:-1]) if flag == "--cap-add" } - assert "SETUID" in added, "SETUID cap missing — gosu drop in entrypoint will fail" - assert "SETGID" in added, "SETGID cap missing — gosu drop in entrypoint will fail" + assert "SETUID" in added, "SETUID cap missing — image privilege-drop will fail" + assert "SETGID" in added, "SETGID cap missing — image privilege-drop will fail" # ── run_as_host_user tests ──────────────────────────────────────── @@ -441,8 +442,9 @@ def test_run_as_host_user_passes_uid_gid(monkeypatch): def test_run_as_host_user_drops_setuid_setgid_caps(monkeypatch): - """When --user is passed, the container never needs gosu, so SETUID/SETGID - caps are omitted for a tighter security posture.""" + """When --user is passed, the container already starts unprivileged and + never needs a privilege drop, so SETUID/SETGID caps are omitted for a + tighter security posture.""" monkeypatch.setattr(docker_env, "find_docker", lambda: "/usr/bin/docker") monkeypatch.setattr(docker_env.os, "getuid", lambda: 1000, raising=False) monkeypatch.setattr(docker_env.os, "getgid", lambda: 1000, raising=False) @@ -459,10 +461,10 @@ def test_run_as_host_user_drops_setuid_setgid_caps(monkeypatch): if flag == "--cap-add" } assert "SETUID" not in added, ( - "SETUID cap should be dropped when running as host user — no gosu drop is needed" + "SETUID cap should be dropped when running as host user — no privilege drop is needed" ) assert "SETGID" not in added, ( - "SETGID cap should be dropped when running as host user — no gosu drop is needed" + "SETGID cap should be dropped when running as host user — no privilege drop is needed" ) # Core non-privilege-drop caps must still be there (pip/npm/apt need them). assert "DAC_OVERRIDE" in added diff --git a/tests/tools/test_dockerfile_pid1_reaping.py b/tests/tools/test_dockerfile_pid1_reaping.py index 70d95807aa7..88382534fba 100644 --- a/tests/tools/test_dockerfile_pid1_reaping.py +++ b/tests/tools/test_dockerfile_pid1_reaping.py @@ -5,11 +5,17 @@ they deliberately avoid snapshotting specific package versions, line numbers, or exact flag choices. What they DO assert is that the Dockerfile maintains the properties required for correct production behaviour: -- A PID-1 init (tini) is installed and wraps the entrypoint, so that orphaned +- A PID-1 init is installed and wraps the entrypoint, so that orphaned subprocesses (MCP stdio servers, git, bun, browser daemons) get reaped instead of accumulating as zombies (#15012). - Signal forwarding runs through the init so ``docker stop`` triggers hermes's own graceful-shutdown path. + +The init can be any reaper-capable PID-1: the historical lineage was +``tini``; the current image uses s6-overlay's ``/init`` (which execs +``s6-svscan`` as PID 1, with the same SIGCHLD-reaping property). The +checks below accept either family — the contract is behavioural, not +nominal. """ from __future__ import annotations @@ -24,6 +30,21 @@ DOCKERFILE = REPO_ROOT / "Dockerfile" DOCKERIGNORE = REPO_ROOT / ".dockerignore" +# Init-process families this repo accepts as PID 1. ``tini`` / +# ``dumb-init`` / ``catatonit`` are classic minimal reapers; s6-overlay +# ships ``/init`` which execs ``s6-svscan`` as PID 1 (same reaper +# contract, plus supervision of declared services). Either family +# satisfies the zombie-reaping invariant — see issue #15012. +_KNOWN_INIT_TOKENS: tuple[str, ...] = ( + "tini", + "dumb-init", + "catatonit", + "s6-overlay", + "s6-svscan", + "/init", +) + + @pytest.fixture(scope="module") def dockerfile_text() -> str: if not DOCKERFILE.exists(): @@ -57,8 +78,17 @@ def _run_steps(dockerfile_text: str) -> list[str]: ] +def _instruction_text(dockerfile_text: str) -> str: + """Join every non-comment Dockerfile instruction into one searchable + string. Crucially excludes comments — otherwise the historical + explanation of "we used to use tini" would silently satisfy a + substring check long after tini was removed from the build. + """ + return "\n".join(_dockerfile_instructions(dockerfile_text)) + + def test_dockerfile_installs_an_init_for_zombie_reaping(dockerfile_text): - """Some init (tini, dumb-init, catatonit) must be installed. + """Some init (tini, dumb-init, catatonit, s6-overlay) must be installed. Without a PID-1 init that handles SIGCHLD, hermes accumulates zombie processes from MCP stdio subprocesses, git operations, browser @@ -67,12 +97,17 @@ def test_dockerfile_installs_an_init_for_zombie_reaping(dockerfile_text): """ # Accept any of the common reapers. The contract is behavioural: # something must be installed that reaps orphans. - known_inits = ("tini", "dumb-init", "catatonit") - installed = any(name in dockerfile_text for name in known_inits) + # + # Scan instructions only (no comments) so a stale historical mention + # in a comment can't masquerade as a current install. Without this, + # removing tini from the actual build but leaving the word in a + # comment would silently keep the test green. + instructions = _instruction_text(dockerfile_text) + installed = any(name in instructions for name in _KNOWN_INIT_TOKENS) assert installed, ( - "No PID-1 init detected in Dockerfile (looked for: " - f"{', '.join(known_inits)}). Without an init process to reap " - "orphaned subprocesses, hermes accumulates zombies in Docker " + "No PID-1 init detected in Dockerfile instructions (looked for: " + f"{', '.join(_KNOWN_INIT_TOKENS)}). Without an init process to " + "reap orphaned subprocesses, hermes accumulates zombies in Docker " "deployments. See issue #15012." ) @@ -80,8 +115,8 @@ def test_dockerfile_installs_an_init_for_zombie_reaping(dockerfile_text): def test_dockerfile_entrypoint_routes_through_the_init(dockerfile_text): """The ENTRYPOINT must invoke the init, not the entrypoint script directly. - Installing tini is only half the fix — the container must actually run - with tini as PID 1. If the ENTRYPOINT executes the shell script + Installing the init is only half the fix — the container must actually + run with it as PID 1. If the ENTRYPOINT executes the shell script directly, the shell becomes PID 1 and will ``exec`` into hermes, which then runs as PID 1 without any zombie reaping. """ @@ -96,12 +131,12 @@ def test_dockerfile_entrypoint_routes_through_the_init(dockerfile_text): assert entrypoint_line is not None, "Dockerfile is missing an ENTRYPOINT directive" - known_inits = ("tini", "dumb-init", "catatonit") - routes_through_init = any(name in entrypoint_line for name in known_inits) + routes_through_init = any(name in entrypoint_line for name in _KNOWN_INIT_TOKENS) assert routes_through_init, ( - f"ENTRYPOINT does not route through an init: {entrypoint_line!r}. " - "If tini is only installed but not wired into ENTRYPOINT, hermes " - "still runs as PID 1 and zombies will accumulate (#15012)." + f"ENTRYPOINT does not route through a PID-1 init: {entrypoint_line!r}. " + f"Expected one of {_KNOWN_INIT_TOKENS}. If the init is installed but " + "not wired into ENTRYPOINT, hermes still runs as PID 1 and zombies " + "will accumulate (#15012)." ) diff --git a/tools/environments/docker.py b/tools/environments/docker.py index 1cd72ce8552..ed53cd07c41 100644 --- a/tools/environments/docker.py +++ b/tools/environments/docker.py @@ -148,12 +148,14 @@ def find_docker() -> Optional[str]: # We drop all capabilities then add back the minimum needed: # DAC_OVERRIDE - root can write to bind-mounted dirs owned by host user # CHOWN/FOWNER - package managers (pip, npm, apt) need to set file ownership -# SETUID/SETGID - the image entrypoint drops from root to the 'hermes' -# user via `gosu`, which requires these caps. Combined with -# `no-new-privileges`, gosu still cannot escalate back to root after -# the drop, so the security posture is preserved. Omitted entirely -# when the container starts as a non-root user via --user, since -# no gosu drop is needed in that mode. +# SETUID/SETGID - the image's init drops from root to the 'hermes' +# user (via `s6-setuidgid` in the bundled image, or whatever +# privilege-drop helper a user image uses), which requires these +# caps. Combined with `no-new-privileges`, the dropped process +# still cannot escalate back to root, so the security posture is +# preserved. Omitted entirely when the container starts as a +# non-root user via --user, since no privilege drop is needed +# in that mode. # Block privilege escalation and limit PIDs. # /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds). _BASE_SECURITY_ARGS = [ @@ -168,10 +170,11 @@ _BASE_SECURITY_ARGS = [ "--tmpfs", "/run:rw,noexec,nosuid,size=64m", ] -# Extra caps needed when the container starts as root and an entrypoint -# must drop privileges via gosu/su. Skipped when --user is passed because -# the container already starts unprivileged and never needs to switch. -_GOSU_CAP_ARGS = [ +# Extra caps needed when the container starts as root and an init/entrypoint +# must drop privileges (via `s6-setuidgid`, `gosu`, `su`, or similar). +# Skipped when --user is passed because the container already starts +# unprivileged and never needs to switch. +_PRIVDROP_CAP_ARGS = [ "--cap-add", "SETUID", "--cap-add", "SETGID", ] @@ -181,7 +184,7 @@ def _build_security_args(run_as_host_user: bool) -> list[str]: """Return the security/cap/tmpfs args tailored to the privilege mode.""" if run_as_host_user: return list(_BASE_SECURITY_ARGS) - return list(_BASE_SECURITY_ARGS) + list(_GOSU_CAP_ARGS) + return list(_BASE_SECURITY_ARGS) + list(_PRIVDROP_CAP_ARGS) def _resolve_host_user_spec() -> Optional[str]: @@ -473,7 +476,7 @@ class DockerEnvironment(BaseEnvironment): "image default user." ) # Fall back to the full cap set — without --user, an image's - # entrypoint may still need gosu/su to drop privileges. + # init may still need s6-setuidgid/gosu/su to drop privileges. security_args = _build_security_args(run_as_host_user and bool(user_args)) logger.info(f"Docker volume_args: {volume_args}") diff --git a/website/docs/user-guide/docker.md b/website/docs/user-guide/docker.md index 0cecc96c08e..334ce794340 100644 --- a/website/docs/user-guide/docker.md +++ b/website/docs/user-guide/docker.md @@ -84,7 +84,15 @@ The entrypoint starts `hermes dashboard` in the background (running as the non-r By default, the dashboard stays on loopback to avoid exposing the unauthenticated web surface over the network. To publish it intentionally, set `HERMES_DASHBOARD_HOST=0.0.0.0` and configure your own trusted network boundary/reverse proxy. In that case you must explicitly add `--insecure` behavior by passing host/flags in your command path (the entrypoint no longer auto-enables insecure mode). :::note -The dashboard side-process is **not supervised** — if it crashes, it stays down until the container restarts. Running it as a separate container is not supported: the dashboard's gateway-liveness detection requires a shared PID namespace with the gateway process. +The dashboard runs as a supervised s6 service inside the container. If +the dashboard process crashes, s6-overlay restarts it automatically +after a short backoff — you'll see a new PID without needing to +restart the container. Logs and crash output are visible via +`docker logs ` (s6 forwards service stdout/stderr there). + +Running the dashboard as a separate container is not supported: its +gateway-liveness detection requires a shared PID namespace with the +gateway process. ::: ## Running interactively (CLI chat) @@ -260,24 +268,51 @@ The official image is based on `debian:13.4` and includes: - Python 3 with all Hermes dependencies (`uv pip install -e ".[all]"`) - Node.js + npm (for browser automation and WhatsApp bridge) - Playwright with Chromium (`npx playwright install --with-deps chromium --only-shell`) -- ripgrep, ffmpeg, git, and tini as system utilities +- ripgrep, ffmpeg, git, and `xz-utils` as system utilities - **`docker-cli`** — so agents running inside the container can drive the host's Docker daemon (bind-mount `/var/run/docker.sock` to opt in) for `docker build`, `docker run`, container inspection, etc. - **`openssh-client`** — enables the [SSH terminal backend](/docs/user-guide/configuration#ssh-backend) from inside the container. The SSH backend shells out to the system `ssh` binary; without this, it failed silently in containerized installs. - The WhatsApp bridge (`scripts/whatsapp-bridge/`) +- **[`s6-overlay`](https://github.com/just-containers/s6-overlay) v3** as PID 1 (replaces the older `tini`) — supervises the dashboard and per-profile gateways with auto-restart on crash, reaps zombie subprocesses, and forwards signals. -The entrypoint script (`docker/entrypoint.sh`) bootstraps the data volume on first run: -- Creates the directory structure (`sessions/`, `memories/`, `skills/`, etc.) -- Copies `.env.example` → `.env` if no `.env` exists -- Copies default `config.yaml` if missing -- Copies default `SOUL.md` if missing -- Syncs bundled skills using a manifest-based approach (preserves user edits) -- Optionally launches `hermes dashboard` as a background side-process when `HERMES_DASHBOARD=1` (see [Running the dashboard](#running-the-dashboard)) -- Then runs `hermes` with whatever arguments you pass +The container's `ENTRYPOINT` is s6-overlay's `/init`. On boot it: +1. Runs `/etc/cont-init.d/01-hermes-setup` (= `docker/stage2-hook.sh`) as root: optional UID/GID remap, fixes volume ownership, seeds `.env` / `config.yaml` / `SOUL.md` on first boot, syncs bundled skills. +2. Runs `/etc/cont-init.d/02-reconcile-profiles` (= `hermes_cli.container_boot`): walks `$HERMES_HOME/profiles//`, recreates the per-profile gateway s6 service slot under `/run/service/gateway-/`, and auto-starts only those whose last recorded state was `running` (see [Per-profile gateway supervision](#per-profile-gateway-supervision)). +3. Starts the static `main-hermes` and `dashboard` s6-rc services. +4. Exec's the container's CMD as the main program (`/opt/hermes/docker/main-wrapper.sh`), which routes the arguments the user passed to `docker run`: + - no args → `hermes` (the default) + - first arg is an executable on PATH (e.g. `sleep`, `bash`) → exec it directly + - anything else → `hermes ` (subcommand passthrough) + The container exits when this main program exits, with its exit code. -:::warning -Do not override the image entrypoint unless you keep `/opt/hermes/docker/entrypoint.sh` in the command chain. The entrypoint drops root privileges to the `hermes` user before gateway state files are created. Starting `hermes gateway run` as root inside the official image is refused by default because it can leave root-owned files in `/opt/data` and break later dashboard or gateway starts. Set `HERMES_ALLOW_ROOT_GATEWAY=1` only when you intentionally accept that risk. +:::warning Breaking change vs. pre-s6 images +The container ENTRYPOINT is now `/init` (s6-overlay), not `/usr/bin/tini`. All five documented `docker run` invocation patterns (no args, `chat -q "…"`, `sleep infinity`, `bash`, `--tui`) behave identically to the tini-based image. If you have a downstream wrapper that depended on tini-specific signal behavior or hard-coded `/usr/bin/tini --` invocation, pin to the previous image tag. ::: +:::warning Privilege model +Do not override the image entrypoint unless you keep `/init` (or, equivalently, the legacy `docker/entrypoint.sh` shim that forwards to the stage2 hook) in the command chain. s6-overlay's `/init` runs as root so it can chown the volume on first boot, then drops to the `hermes` user via `s6-setuidgid` for every supervised service AND for the main program. Starting `hermes gateway run` as root inside the official image is refused by default because it can leave root-owned files in `/opt/data` and break later dashboard or gateway starts. Set `HERMES_ALLOW_ROOT_GATEWAY=1` only when you intentionally accept that risk. +::: + +### Per-profile gateway supervision + +Inside the container, each profile created with `hermes profile create ` automatically gets an s6-supervised gateway service registered at `/run/service/gateway-/`. The lifecycle commands you'd run on the host work the same way: + +```sh +hermes profile create coder # registers gateway-coder s6 slot +hermes -p coder gateway start # s6-svc -u → supervised gateway +hermes -p coder gateway stop # s6-svc -d → service down +hermes -p coder gateway restart # s6-svc -t → SIGTERM the supervisor +hermes profile delete coder # tears down the s6 slot +``` + +**Supervision benefits over the pre-s6 image:** + +- Gateway crashes are auto-restarted by `s6-supervise` after a ~1s backoff. +- Dashboard crashes are auto-restarted (set `HERMES_DASHBOARD=1` to start it). +- `docker restart` preserves running gateways: the cont-init reconciler reads `$HERMES_HOME/profiles//gateway_state.json` and brings the slot back up if the last recorded state was `running`. Stopped gateways stay stopped. +- Per-profile gateway logs persist under `$HERMES_HOME/logs/gateways//current` (rotated by `s6-log`), and the reconciler's actions are appended to `$HERMES_HOME/logs/container-boot.log` per boot. + +`hermes status` inside the container reports `Manager: s6 (container supervisor)`. Use `/command/s6-svstat /run/service/gateway-` for the raw supervisor view (note `/command/` is on PATH for supervision-tree processes only; pass the absolute path when calling from `docker exec`). + ## Upgrading Pull the latest image and recreate the container. Your data directory is untouched. @@ -528,7 +563,7 @@ Check logs: `docker logs hermes`. Common causes: ### "Permission denied" errors -The container's entrypoint drops privileges to the non-root `hermes` user (UID 10000) via `gosu`. If your host `~/.hermes/` is owned by a different UID, set `HERMES_UID`/`HERMES_GID` to match your host user, or ensure the data directory is writable: +The container's stage2 hook drops privileges to the non-root `hermes` user (UID 10000) via `s6-setuidgid` inside each supervised service. If your host `~/.hermes/` is owned by a different UID, set `HERMES_UID`/`HERMES_GID` to match your host user, or ensure the data directory is writable: ```sh chmod -R 755 ~/.hermes diff --git a/website/docs/user-guide/profiles.md b/website/docs/user-guide/profiles.md index 73ea0a8cadd..dfbd1d95e5f 100644 --- a/website/docs/user-guide/profiles.md +++ b/website/docs/user-guide/profiles.md @@ -172,6 +172,10 @@ assistant gateway install # creates hermes-gateway-assistant service Each profile gets its own service name. They run independently. +:::note Inside the official Docker image +Per-profile gateways are supervised by [s6-overlay](https://github.com/just-containers/s6-overlay) (PID 1 in the container), so `hermes profile create ` automatically registers an s6 service slot at `/run/service/gateway-/`. `hermes -p gateway start/stop/restart` dispatches to `s6-svc` instead of spawning a bare process — crashes are auto-restarted and `docker restart` preserves the previously-running set of gateways. See [Per-profile gateway supervision](/docs/user-guide/docker#per-profile-gateway-supervision) for details. +::: + ## Configuring profiles Each profile has its own: