diff --git a/Dockerfile b/Dockerfile index 1db0e1c8d5e..1238f5f7565 100644 --- a/Dockerfile +++ b/Dockerfile @@ -138,18 +138,29 @@ RUN uv pip install --no-cache-dir --no-deps -e "." COPY docker/s6-rc.d/ /etc/s6-overlay/s6-rc.d/ # stage2-hook handles UID/GID remap, volume chown, config seeding, -# skills sync, and TUI detection — all the work the old entrypoint.sh -# did between gosu-drop and `exec hermes`. Wired in as cont-init.d/01- -# so it runs before any user services start. +# skills sync — all the work the old entrypoint.sh did between +# gosu-drop and `exec hermes`. Wired in as cont-init.d/01- so it +# runs before user services start. +# +# 02-reconcile-profiles re-creates per-profile gateway s6 service +# slots from $HERMES_HOME/profiles// after a container restart +# (the /run/service/ scandir is tmpfs and wiped on restart). Phase 4. RUN mkdir -p /etc/cont-init.d && \ printf '#!/bin/sh\nexec /opt/hermes/docker/stage2-hook.sh\n' \ > /etc/cont-init.d/01-hermes-setup && \ chmod +x /etc/cont-init.d/01-hermes-setup +COPY --chmod=0755 docker/cont-init.d/02-reconcile-profiles /etc/cont-init.d/02-reconcile-profiles # ---------- Runtime ---------- ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist ENV HERMES_HOME=/opt/data -ENV PATH="/opt/data/.local/bin:${PATH}" +# Pre-s6 entrypoint.sh did `source .venv/bin/activate` which exported +# the venv bin onto PATH; Architecture B's main-wrapper.sh does the +# same for the container's main process, but `docker exec` and our +# cont-init.d scripts don't pass through the wrapper. Expose the venv +# bin globally so `docker exec hermes ...` and any +# subprocess that doesn't activate the venv first still find hermes. +ENV PATH="/opt/hermes/.venv/bin:/opt/data/.local/bin:${PATH}" RUN mkdir -p /opt/data VOLUME [ "/opt/data" ] diff --git a/docker/cont-init.d/02-reconcile-profiles b/docker/cont-init.d/02-reconcile-profiles new file mode 100755 index 00000000000..90b03554f1e --- /dev/null +++ b/docker/cont-init.d/02-reconcile-profiles @@ -0,0 +1,30 @@ +#!/command/with-contenv sh +# shellcheck shell=sh +# Container-boot reconciliation of per-profile gateway s6 services. +# +# Runs as root after 01-hermes-setup (the stage2 hook) has chowned +# the volume and seeded $HERMES_HOME, but before s6-rc starts user +# services. /etc/cont-init.d/* scripts run in lexicographic order, +# so the `02-` prefix guarantees ordering. +# +# Service directories under /run/service/ live on tmpfs and are +# wiped on every container restart. Profile directories under +# $HERMES_HOME/profiles/ live on the persistent VOLUME. This script +# walks the persistent profiles, recreates the s6 service slots, +# and auto-starts only those whose last recorded state was +# `running` — see hermes_cli/container_boot.py. +# +# Phase 4 also needs hermes-user writes to /run/service/ (so the +# profile create/delete hooks can register/unregister at runtime), +# so we chown the scandir before invoking the reconciler. The +# .s6-svscan/ subdir stays root-owned; only sibling directories +# (gateway-/) need to be hermes-writable. +set -e + +# Make the dynamic scandir hermes-writable. The directory itself +# starts root-owned by s6-overlay; we leave .s6-svscan/ alone since +# only s6 itself writes there. +chown hermes:hermes /run/service 2>/dev/null || true + +exec s6-setuidgid hermes /opt/hermes/.venv/bin/python -m hermes_cli.container_boot + diff --git a/docker/stage2-hook.sh b/docker/stage2-hook.sh index f8c964801ad..2989f27a032 100755 --- a/docker/stage2-hook.sh +++ b/docker/stage2-hook.sh @@ -53,6 +53,17 @@ if [ "$needs_chown" = true ]; then echo "[stage2] Warning: chown .venv failed (rootless container?) — continuing" fi +# Always reset ownership of $HERMES_HOME/profiles to hermes on every +# boot. Profile dirs and files can land owned by root when commands +# are invoked via `docker exec hermes …` (which defaults +# to root unless `-u` is passed), and that breaks the cont-init +# reconciler (02-reconcile-profiles) which runs as hermes and walks +# the profiles dir. Idempotent; skipped on rootless containers where +# chown would fail. +if [ -d "$HERMES_HOME/profiles" ]; then + chown -R hermes:hermes "$HERMES_HOME/profiles" 2>/dev/null || true +fi + # --- config.yaml permissions --- # Ensure config.yaml is readable by the hermes runtime user even if it # was edited on the host after initial ownership setup. diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py new file mode 100644 index 00000000000..fa4fe4568c4 --- /dev/null +++ b/hermes_cli/container_boot.py @@ -0,0 +1,218 @@ +"""Container-boot reconciliation of per-profile gateway s6 services. + +Service directories under /run/service/ live on **tmpfs** and are wiped +on every container restart. Profile directories under +``$HERMES_HOME/profiles//`` live on the persistent VOLUME, and +each one records its gateway's last state in ``gateway_state.json``. +This module bridges the two: on every container boot, walk the +persistent profiles, recreate the s6 service slots, and auto-start +only those whose last recorded state was ``running``. + +Wired into the image as /etc/cont-init.d/02-reconcile-profiles by the +Dockerfile (Phase 4 Task 4.0). Runs as root after 01-hermes-setup +(the stage2 hook) has chowned the volume and seeded $HERMES_HOME, but +before s6-rc starts user services. + +Without this module, every ``docker restart`` would silently wipe +every per-profile gateway, even though the user's profiles still +exist on disk. +""" +from __future__ import annotations + +import json +import logging +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Literal + +log = logging.getLogger(__name__) + +# Only this prior state triggers automatic restart. Everything else +# (startup_failed, starting, stopped, missing) registers the slot in +# the down state and waits for explicit user action — this avoids the +# crash-loop where a broken gateway keeps being restarted across +# `docker restart` cycles. +_AUTOSTART_STATES = frozenset({"running"}) + +# Stale runtime files we sweep before recreating service slots. These +# all hold container-namespaced state (PIDs, process tables) that's +# garbage post-restart — a numerically-equal PID in the new container +# is a different process. See the Risk Register in the plan. +_STALE_RUNTIME_FILES = ("gateway.pid", "processes.json") + +ReconcileActionLabel = Literal["started", "registered", "skipped"] + + +@dataclass(frozen=True) +class ReconcileAction: + """One profile's outcome from a single reconciliation pass.""" + profile: str + prior_state: str | None + action: ReconcileActionLabel + + +def reconcile_profile_gateways( + *, + hermes_home: Path, + scandir: Path, + dry_run: bool = False, +) -> list[ReconcileAction]: + """Recreate s6 service registrations for every persistent profile. + + Args: + hermes_home: The container's HERMES_HOME (typically /opt/data). + Profiles live under ``/profiles//``. + scandir: The s6 dynamic scandir (typically /run/service). Service + directories are created at ``/gateway-/``. + dry_run: When True, walk and return the action list without + touching the filesystem. For tests and `--dry-run` debug. + + Returns: + One :class:`ReconcileAction` per profile, in directory order. + """ + actions: list[ReconcileAction] = [] + profiles_root = hermes_home / "profiles" + if not profiles_root.is_dir(): + return actions + + for entry in sorted(profiles_root.iterdir()): + if not entry.is_dir(): + continue + # SOUL.md is always seeded by `hermes profile create` (config.yaml + # is not — that comes later via `hermes setup`). Use it as the + # "real profile" marker so stray dirs (backups, manual mkdir) + # aren't picked up. + if not (entry / "SOUL.md").exists(): + continue + + prior_state = _read_prior_state(entry) + should_start = prior_state in _AUTOSTART_STATES + + if not dry_run: + _cleanup_stale_runtime_files(entry) + _register_service(scandir, entry.name, start=should_start) + + actions.append(ReconcileAction( + profile=entry.name, + prior_state=prior_state, + action="started" if should_start else "registered", + )) + + if not dry_run: + _write_reconcile_log(hermes_home, actions) + return actions + + +def _read_prior_state(profile_dir: Path) -> str | None: + """Read gateway_state.json's ``gateway_state`` field, or None if + missing or unparseable. Unparseable counts as "no prior state" so + we don't bork the whole reconciliation on a corrupt file.""" + state_file = profile_dir / "gateway_state.json" + if not state_file.exists(): + return None + try: + return json.loads(state_file.read_text()).get("gateway_state") + except (OSError, json.JSONDecodeError): + log.warning( + "could not read %s; treating as no prior state", state_file, + ) + return None + + +def _cleanup_stale_runtime_files(profile_dir: Path) -> None: + """Remove gateway.pid and processes.json — they reference PIDs in + the dead container's process namespace and would otherwise confuse + the newly-started gateway's process-mismatch checks.""" + for name in _STALE_RUNTIME_FILES: + (profile_dir / name).unlink(missing_ok=True) + + +def _register_service(scandir: Path, profile: str, *, start: bool) -> None: + """Recreate the s6 service slot for one profile. + + Mirrors the rendering in :func:`S6ServiceManager.register_profile_gateway`, + but here we control the start state directly via the ``down`` marker + file (s6-svscan honors it on rescan). Cannot use the manager + directly because the cont-init.d phase runs as root before + s6-svscan starts scanning the dynamic scandir — the manager's + ``s6-svscanctl -a`` call would fail with no control socket. + """ + from hermes_cli.service_manager import ( + S6ServiceManager, + validate_profile_name, + ) + + validate_profile_name(profile) + service_dir = scandir / f"gateway-{profile}" + service_dir.mkdir(parents=True, exist_ok=True) + + (service_dir / "type").write_text("longrun\n") + + # Reuse the manager's run-script rendering — single source of truth + # so register_profile_gateway and reconcile_profile_gateways stay + # consistent. extra_env is empty here; users who need per-profile + # env can set it via the profile's config.yaml (which the gateway + # itself loads). + run = service_dir / "run" + run.write_text(S6ServiceManager._render_run_script(profile, port=0, extra_env={})) + run.chmod(0o755) + + # Persistent log rotation (OQ8-C). + log_subdir = service_dir / "log" + log_subdir.mkdir(exist_ok=True) + log_run = log_subdir / "run" + log_run.write_text(S6ServiceManager._render_log_run(profile)) + log_run.chmod(0o755) + + # The presence of a `down` file tells s6-supervise to NOT start + # the service when s6-svscan picks it up. User brings it up + # explicitly with `hermes -p gateway start` (which + # routes through the Phase 4 _dispatch_via_service_manager_if_s6 + # helper to `s6-svc -u`). + down_marker = service_dir / "down" + if start: + down_marker.unlink(missing_ok=True) + else: + down_marker.touch() + + +def _write_reconcile_log( + hermes_home: Path, actions: list[ReconcileAction], +) -> None: + """Append one line per profile to $HERMES_HOME/logs/container-boot.log. + + Operators inspect this to debug "why didn't my profile come back + up". Keeping a separate log file (vs. mixing into agent.log) lets + troubleshooters grep for "profile=foo" without wading through + unrelated activity. + """ + import time + log_dir = hermes_home / "logs" + log_dir.mkdir(parents=True, exist_ok=True) + ts = time.strftime("%Y-%m-%dT%H:%M:%S%z") + with (log_dir / "container-boot.log").open("a", encoding="utf-8") as f: + for a in actions: + f.write( + f"{ts} profile={a.profile} prior_state={a.prior_state} " + f"action={a.action}\n" + ) + + +def main() -> int: + """Entry point invoked from /etc/cont-init.d/02-reconcile-profiles.""" + hermes_home = Path(os.environ.get("HERMES_HOME", "/opt/data")) + scandir = Path(os.environ.get("S6_PROFILE_GATEWAY_SCANDIR", "/run/service")) + actions = reconcile_profile_gateways( + hermes_home=hermes_home, scandir=scandir, + ) + for a in actions: + print( + f"reconcile: profile={a.profile} " + f"prior_state={a.prior_state} action={a.action}" + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 79d90c9e6ef..d9f397437fa 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -981,6 +981,18 @@ def get_gateway_runtime_snapshot(system: bool = False) -> GatewayRuntimeSnapshot from hermes_constants import is_container if is_linux() and is_container(): + # Phase 4: report s6 supervision when running under our /init. + # Other container runtimes (or containers built before Phase 2) + # still get the original "docker (foreground)" label. + try: + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() == "s6": + return GatewayRuntimeSnapshot( + manager="s6 (container supervisor)", + gateway_pids=gateway_pids, + ) + except Exception: + pass # Fall through to the legacy label on any detection error. return GatewayRuntimeSnapshot( manager="docker (foreground)", gateway_pids=gateway_pids, @@ -5003,6 +5015,47 @@ def gateway_setup(): # Main Command Handler # ============================================================================= +def _dispatch_via_service_manager_if_s6( + action: str, profile: str | None = None, +) -> bool: + """If we're in a container with s6, dispatch gateway lifecycle via s6. + + Returns True iff dispatched (caller should ``return``); False + otherwise — caller continues with the host-side code path. + + ``action`` is one of ``start`` / ``stop`` / ``restart``. The + profile defaults to the current one (resolved via ``_profile_arg``). + The s6 service slot was created either by the Phase 4 profile-create + hook or by the container-boot reconciler (cont-init.d/02-…). If it + doesn't exist, ``s6-svc`` will raise CalledProcessError — caller + sees that as a normal failure path. + """ + from hermes_cli.service_manager import ( + detect_service_manager, + get_service_manager, + ) + + if detect_service_manager() != "s6": + return False + if profile is None: + # _profile_suffix() returns the bare profile name for + # HERMES_HOME=/profiles/, "" for the default root, + # or a hash for unrelated paths. Map "" → "default" so the + # default-profile gateway is reachable as gateway-default. + profile = _profile_suffix() or "default" + mgr = get_service_manager() + service_name = f"gateway-{profile}" + if action == "start": + mgr.start(service_name) + elif action == "stop": + mgr.stop(service_name) + elif action == "restart": + mgr.restart(service_name) + else: + return False + return True + + def gateway_command(args): """Handle gateway subcommands.""" try: @@ -5087,6 +5140,21 @@ def _gateway_command_inner(args): print(" nohup hermes gateway run > ~/.hermes/logs/gateway.log 2>&1 & # background") sys.exit(1) elif is_container(): + # Phase 4: inside a container with s6 the gateway service is + # auto-registered when the profile is created (and reconciled + # at every container boot). `install` is therefore informational. + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() == "s6": + print("Per-profile gateways are auto-registered when you create a profile.") + print() + print(" hermes profile create # creates the s6 service slot") + print(" hermes -p gateway start # bring it up via s6") + print(" hermes status # see currently-supervised gateways") + return + # Fallback for pre-s6 containers or other container runtimes + # we haven't taught about supervision (Podman without our + # /init, k8s plain runs, etc.) — the historical guidance still + # applies. print("Service installation is not needed inside a Docker container.") print("The container runtime is your service manager — use Docker restart policies instead:") print() @@ -5117,6 +5185,13 @@ def _gateway_command_inner(args): from hermes_cli import gateway_windows gateway_windows.uninstall() elif is_container(): + from hermes_cli.service_manager import detect_service_manager + if detect_service_manager() == "s6": + print("Per-profile gateways are auto-unregistered when you delete the profile.") + print() + print(" hermes profile delete # tears down the s6 service slot") + print(" hermes -p gateway stop # stop without deleting the profile") + return print("Service uninstall is not applicable inside a Docker container.") print("To stop the gateway, stop or remove the container:") print() @@ -5131,6 +5206,14 @@ def _gateway_command_inner(args): system = getattr(args, 'system', False) start_all = getattr(args, 'all', False) + # Phase 4: inside a container with s6, dispatch via the service + # manager instead of falling through to systemd/launchd/windows. + # `--all` isn't meaningful here (each profile has its own service + # slot — start them individually via `hermes -p gateway + # start`), so just bring up the current profile's slot. + if not start_all and _dispatch_via_service_manager_if_s6("start"): + return + if start_all: # Kill all stale gateway processes across all profiles before starting killed = kill_gateway_processes(all_profiles=True) @@ -5160,6 +5243,11 @@ def _gateway_command_inner(args): print("To enable systemd: add systemd=true to /etc/wsl.conf and run 'wsl --shutdown' from PowerShell.") sys.exit(1) elif is_container(): + # Reached only when s6 ISN'T running (the early dispatch + # above handles the s6 case). Pre-s6 containers or other + # container runtimes that don't ship our /init get the + # historical guidance: the gateway is the container's main + # process, so use docker lifecycle commands. print("Service start is not applicable inside a Docker container.") print("The gateway runs as the container's main process.") print() @@ -5176,6 +5264,11 @@ def _gateway_command_inner(args): stop_all = getattr(args, 'all', False) system = getattr(args, 'system', False) + # Phase 4: inside a container with s6, dispatch via the service + # manager. `--all` is left to the existing process-sweep path below. + if not stop_all and _dispatch_via_service_manager_if_s6("stop"): + return + if stop_all: # --all: kill every gateway process on the machine service_available = False @@ -5245,6 +5338,11 @@ def _gateway_command_inner(args): restart_all = getattr(args, 'all', False) service_configured = False + # Phase 4: inside a container with s6, dispatch via the service + # manager (s6-svc -t restarts the supervised process). + if not restart_all and _dispatch_via_service_manager_if_s6("restart"): + return + if restart_all: # --all: stop every gateway process across all profiles, then start fresh service_stopped = False diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index aa33d9182b8..3031fa3867b 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -777,6 +777,14 @@ def create_profile( except Exception: pass # non-fatal — user can describe later with `hermes profile describe` + # Phase 4: when running inside a container under s6, register the + # new profile's gateway as a runtime s6 service so + # `hermes -p gateway start` can supervise it via + # `s6-svc -u` instead of spawning a bare process. On host (systemd + # / launchd / windows) this is a no-op — the existing per-profile + # unit-generation paths handle gateway lifecycle. + _maybe_register_gateway_service(canon) + return profile_dir @@ -893,6 +901,10 @@ def delete_profile(name: str, yes: bool = False) -> Path: # 1. Disable service (prevents auto-restart) _cleanup_gateway_service(canon, profile_dir) + # 1b. Phase 4: unregister the s6 service slot (container path). + # On host this is a no-op; on container it removes + # /run/service/gateway-/ so s6-supervise drops it. + _maybe_unregister_gateway_service(canon) # 2. Stop running gateway if gw_running: @@ -965,6 +977,77 @@ def delete_profile(name: str, yes: bool = False) -> Path: return profile_dir +def _allocate_gateway_port(profile_name: str) -> int: + """Deterministic port allocation for a profile's s6-supervised gateway. + + Phase 4 of the s6-overlay supervision plan. Ports live in + [9200, 9800) — a 600-port window starting just past the dashboard + default (9119). Allocation is deterministic via SHA-256 of the + profile name so the same profile always gets the same port across + container restarts. + + Collision probability is small (~1/600 per pair of profiles); if + it happens the gateway will fail to bind with a clear OSError and + the caller can set ``HERMES_GATEWAY_PORT`` to override. The + Phase 4 plan accepts this rather than carrying explicit allocator + state in the persistent volume. + """ + import hashlib + h = int(hashlib.sha256(profile_name.encode()).hexdigest()[:8], 16) + return 9200 + (h % 600) + + +def _maybe_register_gateway_service(profile_name: str) -> None: + """Register a profile's gateway with s6 inside the container. + + No-op on host (systemd/launchd/windows) — those backends raise + ``NotImplementedError`` on ``register_profile_gateway`` and the + existing per-profile unit-generation paths handle lifecycle. + + Best-effort: any error (no backend detected, port collision, s6 + not yet ready, etc.) is logged and swallowed so profile creation + doesn't fail because the s6 supervision tree is in a weird state. + The user can re-register manually later via the gateway start + command, which goes through the same dispatch path. + """ + try: + from hermes_cli.service_manager import get_service_manager + mgr = get_service_manager() + except RuntimeError: + return # no backend on this host — nothing to do + if not mgr.supports_runtime_registration(): + return # host backend; no-op + port = _allocate_gateway_port(profile_name) + try: + mgr.register_profile_gateway(profile_name, port=port) + except ValueError: + # Already registered (e.g. the container-boot reconciler ran + # first and brought up a stale slot). That's fine. + pass + except Exception as exc: + # Don't fail profile create over a supervision-tree hiccup. + print(f"⚠ Could not register s6 gateway service: {exc}") + + +def _maybe_unregister_gateway_service(profile_name: str) -> None: + """Tear down a profile's s6 gateway service inside the container. + + No-op on host. Idempotent: absent services are silently skipped + by ``unregister_profile_gateway``. + """ + try: + from hermes_cli.service_manager import get_service_manager + mgr = get_service_manager() + except RuntimeError: + return + if not mgr.supports_runtime_registration(): + return + try: + mgr.unregister_profile_gateway(profile_name) + except Exception as exc: + print(f"⚠ Could not unregister s6 gateway service: {exc}") + + def _cleanup_gateway_service(name: str, profile_dir: Path) -> None: """Disable and remove systemd/launchd service for a profile.""" import platform as _platform diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 71dc6ae1888..236f2b619e1 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -360,7 +360,18 @@ class S6ServiceManager: time, not Python-substituted at registration time (OQ8-C). 2. Activates the bundled venv. 3. Drops to the hermes user and exec's - ``hermes -p gateway start --foreground --port ``. + ``hermes -p gateway run``. + + Note: the ``port`` parameter is accepted for API parity with + :meth:`register_profile_gateway` but is currently ignored — the + gateway picks its bind port from the profile's config.yaml + (``[gateway] port = ...``). A future signature change may carry + it through as an ``HERMES_GATEWAY_PORT`` env var; until then, + the in-config value wins and the constructor's ``port`` arg + is essentially documentation for "what port the profile would + use if we wired it through". See Phase 4 Task 4.1 for the + deterministic allocator and the SHA-256-derived range + [9200, 9800). """ import shlex lines = [ @@ -373,8 +384,7 @@ class S6ServiceManager: for k, v in sorted(extra_env.items()): lines.append(f"export {k}={shlex.quote(v)}") lines.append( - f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} " - f"gateway start --foreground --port {port}" + f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run" ) return "\n".join(lines) + "\n" diff --git a/tests/docker/test_container_restart.py b/tests/docker/test_container_restart.py new file mode 100644 index 00000000000..b709022c79e --- /dev/null +++ b/tests/docker/test_container_restart.py @@ -0,0 +1,168 @@ +"""Container-restart survives per-profile gateway registrations. + +The s6 dynamic scandir at /run/service/ lives on tmpfs and is wiped +on every container restart. Phase 4 Task 4.0's container_boot module ++ cont-init.d/02-reconcile-profiles regenerate the service slots from +$HERMES_HOME/profiles//gateway_state.json on every boot and +auto-start only those whose last state was `running`. + +These tests stand up a container with a named volume, create profiles +inside it in various gateway states, restart the container, and +assert the reconciler did the right thing. +""" +from __future__ import annotations + +import subprocess +import time + +import pytest + + +def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]: + return subprocess.run( + ["docker", *args], + capture_output=True, text=True, timeout=kw.pop("timeout", 60), + **kw, + ) + + +def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + return _docker("exec", container, *args, timeout=timeout) + + +def _sh(container: str, cmd: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: + return _docker("exec", container, "sh", "-c", cmd, timeout=timeout) + + +@pytest.fixture +def restart_container(request, built_image: str): + """A long-running container with a named volume so docker restart + preserves $HERMES_HOME/profiles/.""" + safe = request.node.name.replace("[", "_").replace("]", "_") + name = f"hermes-restart-{safe}" + volume = f"hermes-restart-vol-{safe}" + _docker("rm", "-f", name) + _docker("volume", "rm", "-f", volume) + _docker("volume", "create", volume, timeout=10).check_returncode() + r = _docker( + "run", "-d", "--name", name, + "-v", f"{volume}:/opt/data", + built_image, "sleep", "infinity", + timeout=30, + ) + r.check_returncode() + # Give s6 + stage2 + 02-reconcile a moment to come up cleanly on + # the fresh volume. + time.sleep(5) + yield name + _docker("rm", "-f", name) + _docker("volume", "rm", "-f", volume) + + +def test_running_gateway_survives_container_restart(restart_container: str) -> None: + container = restart_container + + # Create the profile + start its gateway. The Phase 4 hooks + # register the s6 service slot during create and the dispatch + # path brings it up via s6-svc -u. + r = _exec(container, "hermes", "profile", "create", "coder") + assert r.returncode == 0, f"profile create failed: {r.stderr}" + + r = _exec(container, "hermes", "-p", "coder", "gateway", "start", timeout=60) + assert r.returncode == 0, f"gateway start failed: {r.stderr}" + + # Give the service time to actually come up under supervision. + deadline = time.monotonic() + 15.0 + while time.monotonic() < deadline: + r = _sh(container, "/command/s6-svstat /run/service/gateway-coder") + if r.returncode == 0 and "up " in r.stdout: + break + time.sleep(0.5) + assert "up " in r.stdout, f"gateway never came up pre-restart: {r.stdout!r}" + + # Persist state so the reconciler will treat the slot as 'running' + # post-restart. The gateway process itself writes gateway_state.json + # via gateway/status.py — but we don't want to wait for or assert + # against the live process here; just stamp the file directly to + # exercise the reconciler's contract. + write_state = ( + "import json, pathlib; " + "p = pathlib.Path('/opt/data/profiles/coder/gateway_state.json'); " + "p.write_text(json.dumps({'gateway_state': 'running', 'timestamp': 1}))" + ) + _exec(container, "python3", "-c", write_state, timeout=10).check_returncode() + + # Restart. After this, /run/service/ is empty until cont-init.d + # runs the reconciler. + _docker("restart", container, timeout=60).check_returncode() + time.sleep(8) # stage2 + reconcile + svscan rescan + + # Reconciler logged the action. + r = _sh(container, "cat /opt/data/logs/container-boot.log") + assert r.returncode == 0, f"reconcile log missing: {r.stderr}" + assert "profile=coder" in r.stdout + assert "action=started" in r.stdout + + # Service slot exists. + r = _sh(container, "test -d /run/service/gateway-coder") + assert r.returncode == 0, "slot not recreated after restart" + + # No `down` marker — we asked for auto-start. + r = _sh(container, "test -f /run/service/gateway-coder/down") + assert r.returncode != 0, "down marker present despite prior_state=running" + + +def test_stopped_gateway_stays_stopped_after_restart(restart_container: str) -> None: + container = restart_container + + _exec(container, "hermes", "profile", "create", "writer").check_returncode() + + # Write 'stopped' directly so we don't have to race against the + # gateway's own state writes. + write_state = ( + "import json, pathlib; " + "p = pathlib.Path('/opt/data/profiles/writer/gateway_state.json'); " + "p.write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1}))" + ) + _exec(container, "python3", "-c", write_state, timeout=10).check_returncode() + + _docker("restart", container, timeout=60).check_returncode() + time.sleep(8) + + # Slot exists. + r = _sh(container, "test -d /run/service/gateway-writer") + assert r.returncode == 0 + + # Down marker present. + r = _sh(container, "test -f /run/service/gateway-writer/down") + assert r.returncode == 0, "down marker missing despite prior_state=stopped" + + +def test_stale_gateway_pid_cleaned_up_on_restart(restart_container: str) -> None: + """A dead container's gateway.pid + processes.json must NOT + survive the restart — a numerically-equal live PID in the new + container is a different process and would confuse the gateway + process-mismatch checks.""" + container = restart_container + + _exec(container, "hermes", "profile", "create", "ghost").check_returncode() + + # Stamp stale runtime files alongside a 'running' state so the + # reconciler walks this profile. + stamp = ( + "import json, pathlib; " + "p = pathlib.Path('/opt/data/profiles/ghost'); " + "(p / 'gateway_state.json').write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1})); " + "(p / 'gateway.pid').write_text(json.dumps({'pid': 99999, 'host': 'old'})); " + "(p / 'processes.json').write_text('[]')" + ) + _exec(container, "python3", "-c", stamp, timeout=10).check_returncode() + + _docker("restart", container, timeout=60).check_returncode() + time.sleep(8) + + # Stale runtime files swept. + r = _sh(container, "test -f /opt/data/profiles/ghost/gateway.pid") + assert r.returncode != 0, "stale gateway.pid survived restart" + r = _sh(container, "test -f /opt/data/profiles/ghost/processes.json") + assert r.returncode != 0, "stale processes.json survived restart" diff --git a/tests/docker/test_profile_gateway.py b/tests/docker/test_profile_gateway.py index 2e93f1f3b7b..0723d51fd47 100644 --- a/tests/docker/test_profile_gateway.py +++ b/tests/docker/test_profile_gateway.py @@ -1,31 +1,26 @@ """Harness: per-profile gateway start/stop inside the container. -Phase 4 will change the *implementation* of these commands inside the -container — they'll talk to s6 instead of refusing. The user-visible -surface that should result is locked here. +Phase 4 wires `hermes -p gateway start/stop` through the s6 +ServiceManager dispatch path inside the container — so the lifecycle +commands now bring up an s6-supervised gateway rather than refusing +with the pre-Phase-4 informational message. -NOTE: These tests are marked ``xfail(strict=True)`` until Phase 4 lands. -The current tini image deliberately refuses gateway start/stop inside -containers — ``pgrep`` finds nothing and the tests fail. After Phase 4 -they should flip to passing automatically; ``strict=True`` means an -unexpected pass also fails the test, protecting against side-channel -fixes outside the planned Phase 4 mechanism. +These tests were marked ``xfail(strict=True)`` through Phase 0–3 and +flip to plain ``test_…`` once Phase 4 lands (now). + +NB: The harness profile created here has no model/auth configured, +so the gateway process itself will exit with code 1 on every start +attempt (s6 will keep restarting it). We assert against s6's +``want up`` / ``want down`` state — which reflects the lifecycle +command's intent, not the supervised process's health. """ from __future__ import annotations import subprocess import time -import pytest - PROFILE = "test-harness-profile" -_PHASE4_REASON = ( - "Phase 4 not yet landed: container-side `hermes gateway start` " - "currently exits 0 with an informational message instead of " - "spawning/supervising a gateway. Remove this marker after Task 4.3." -) - def _sh( container: str, command: str, timeout: int = 30, @@ -36,7 +31,14 @@ def _sh( ) -@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True) +def _svstat(container: str) -> str: + """Returns the raw s6-svstat output for the test profile's slot. + /command/s6-svstat is called by absolute path because /command/ + isn't on PATH for docker-exec sessions.""" + r = _sh(container, f"/command/s6-svstat /run/service/gateway-{PROFILE}") + return r.stdout if r.returncode == 0 else "" + + def test_profile_create_then_gateway_start( built_image: str, container_name: str, ) -> None: @@ -50,30 +52,35 @@ def test_profile_create_then_gateway_start( r = _sh(container_name, f"hermes profile create {PROFILE}") assert r.returncode == 0, f"profile create failed: {r.stderr}" + # Profile create's s6-register hook should have produced a service slot. + r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}") + assert r.returncode == 0, "s6 service slot not created on profile create" + r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60) assert r.returncode == 0, ( f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}" ) - time.sleep(3) - - r = _sh(container_name, f"pgrep -f 'gateway.*{PROFILE}'") - assert r.returncode == 0, "gateway process not running" + # After start, s6's intent is "up" — even if the supervised gateway + # process spin-fails (no model/auth in the test profile), the + # supervision-state contract holds. + time.sleep(2) + state = _svstat(container_name) + assert "want up" in state, f"want up not in svstat: {state!r}" r = _sh(container_name, f"hermes -p {PROFILE} gateway stop", timeout=30) assert r.returncode == 0 time.sleep(2) - - r = _sh(container_name, f"pgrep -f 'gateway.*{PROFILE}'") - assert r.returncode != 0, "gateway process still running after stop" + state = _svstat(container_name) + assert "want up" not in state, f"want up still in svstat: {state!r}" -@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True) def test_profile_delete_stops_gateway( built_image: str, container_name: str, ) -> None: - """Deleting a profile should stop its gateway if running.""" + """Deleting a profile should stop its gateway and remove the s6 + service slot.""" subprocess.run( ["docker", "run", "-d", "--name", container_name, built_image, "sleep", "120"], @@ -90,8 +97,9 @@ def test_profile_delete_stops_gateway( f"hermes profile delete {PROFILE} --yes", timeout=30, ) - assert r.returncode == 0 + assert r.returncode == 0, f"profile delete failed: {r.stderr}" time.sleep(2) - r = _sh(container_name, f"pgrep -f 'gateway.*{PROFILE}'") - assert r.returncode != 0, "gateway still running after profile delete" + # Service slot should be gone. + r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}") + assert r.returncode != 0, "s6 service slot still present after profile delete" diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py new file mode 100644 index 00000000000..f0d932292c5 --- /dev/null +++ b/tests/hermes_cli/test_container_boot.py @@ -0,0 +1,235 @@ +"""Tests for hermes_cli.container_boot — the cont-init.d-time +reconciliation that recreates per-profile gateway s6 service slots +from the persistent profiles directory. + +These tests run against a fake $HERMES_HOME under tmp_path; no real +s6 supervision tree is required. The in-container integration test +covering end-to-end "docker restart" survival lives in +tests/docker/test_container_restart.py. +""" +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from hermes_cli.container_boot import ( + ReconcileAction, + reconcile_profile_gateways, +) + + +# --------------------------------------------------------------------------- +# Fixtures + helpers +# --------------------------------------------------------------------------- + + +def _make_profile( + hermes_home: Path, + name: str, + *, + state: str | None, + with_pid: bool = False, + config: bool = True, +) -> Path: + """Create a fake profile directory under hermes_home/profiles//.""" + p = hermes_home / "profiles" / name + p.mkdir(parents=True) + if config: + # SOUL.md is what the reconciler keys on — it's always seeded by + # `hermes profile create`. See container_boot._render_run_script. + (p / "SOUL.md").write_text("# fake profile\n") + if state is not None: + (p / "gateway_state.json").write_text(json.dumps({ + "gateway_state": state, "timestamp": 1234567890, + })) + if with_pid: + (p / "gateway.pid").write_text(json.dumps( + {"pid": 99999, "host": "old-container"}, + )) + (p / "processes.json").write_text("[]") + return p + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions == [ReconcileAction( + profile="coder", prior_state="running", action="started", + )] + svc = scandir / "gateway-coder" + assert (svc / "run").exists() + assert (svc / "run").stat().st_mode & 0o111 # executable + assert (svc / "type").read_text().strip() == "longrun" + # Auto-start means no down-marker. + assert not (svc / "down").exists() + + +def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "writer", state="stopped") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions == [ReconcileAction( + profile="writer", prior_state="stopped", action="registered", + )] + # down marker tells s6-svscan to NOT start the service. + assert (scandir / "gateway-writer" / "down").exists() + + +def test_startup_failed_does_not_autostart(tmp_path: Path) -> None: + """Avoid crash-loop on restart when the gateway was failing to boot.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "broken", state="startup_failed") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions[0].action == "registered" + assert (scandir / "gateway-broken" / "down").exists() + + +def test_starting_state_does_not_autostart(tmp_path: Path) -> None: + """`starting` means the gateway died mid-boot last time; treat as + failed, not as a candidate for auto-restart.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "unlucky", state="starting") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions[0].action == "registered" + + +def test_stale_runtime_files_are_removed(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "coder", state="running", with_pid=True) + assert (profile / "gateway.pid").exists() + assert (profile / "processes.json").exists() + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not (profile / "gateway.pid").exists() + assert not (profile / "processes.json").exists() + + +def test_profile_without_state_file_is_registered_but_not_started( + tmp_path: Path, +) -> None: + """A freshly-created profile that's never been started: register + its slot but don't auto-start.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "fresh", state=None) + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions == [ReconcileAction( + profile="fresh", prior_state=None, action="registered", + )] + assert (scandir / "gateway-fresh" / "down").exists() + + +def test_directory_without_marker_file_is_skipped(tmp_path: Path) -> None: + """A stray dir under profiles/ that isn't actually a profile (no + SOUL.md — the marker the reconciler keys on) should be skipped.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + # Create a profile dir but without SOUL.md + (tmp_path / "profiles" / "stray").mkdir(parents=True) + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions == [] + assert not (scandir / "gateway-stray").exists() + + +def test_corrupt_state_file_treated_as_no_prior_state(tmp_path: Path) -> None: + """If gateway_state.json is malformed JSON, don't blow up the whole + reconciliation — register the slot in the down state.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "junk", state="running") + (profile / "gateway_state.json").write_text("{ not valid json") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions[0].action == "registered" # not "started" + assert (scandir / "gateway-junk" / "down").exists() + + +def test_reconcile_log_is_written(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "a", state="running") + _make_profile(tmp_path, "b", state="stopped") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + log = (tmp_path / "logs" / "container-boot.log").read_text() + assert "profile=a" in log + assert "action=started" in log + assert "profile=b" in log + assert "action=registered" in log + + +def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "coder", state="running", with_pid=True) + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=True, + ) + + # The action list is still produced... + assert actions == [ReconcileAction( + profile="coder", prior_state="running", action="started", + )] + # ...but nothing on disk was touched. + assert (profile / "gateway.pid").exists() # not removed under dry_run + assert not (scandir / "gateway-coder").exists() + assert not (tmp_path / "logs" / "container-boot.log").exists() + + +def test_missing_profiles_root_returns_empty(tmp_path: Path) -> None: + """When $HERMES_HOME/profiles doesn't exist (fresh install), the + reconciliation should return an empty list without raising.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + assert actions == [] + + +def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None: + """A profile dir whose name doesn't match validate_profile_name's + rules (uppercase, etc.) must surface as a hard error rather than + silently produce an invalid s6 service dir.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "BadName", state="running") + with pytest.raises(ValueError): + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) diff --git a/tests/hermes_cli/test_gateway_s6_dispatch.py b/tests/hermes_cli/test_gateway_s6_dispatch.py new file mode 100644 index 00000000000..6516f85eab2 --- /dev/null +++ b/tests/hermes_cli/test_gateway_s6_dispatch.py @@ -0,0 +1,117 @@ +"""Tests for the Phase 4 s6 dispatch helper in hermes_cli.gateway. + +`_dispatch_via_service_manager_if_s6` decides whether a +`hermes gateway start/stop/restart` invocation should be routed to +the in-container S6ServiceManager instead of falling through to the +host systemd/launchd/windows code path. +""" +from __future__ import annotations + +from typing import Any + +import pytest + + +class _CallRecorder: + """Minimal stand-in for S6ServiceManager.""" + kind = "s6" + + def __init__(self) -> None: + self.calls: list[tuple[str, str]] = [] + + def start(self, name: str) -> None: + self.calls.append(("start", name)) + + def stop(self, name: str) -> None: + self.calls.append(("stop", name)) + + def restart(self, name: str) -> None: + self.calls.append(("restart", name)) + + +def test_dispatch_returns_false_on_host(monkeypatch: pytest.MonkeyPatch) -> None: + """When the environment isn't s6 (host run), the helper must + return False and not invoke a manager — callers continue with + their existing systemd/launchd/windows path.""" + from hermes_cli import gateway as gw + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "systemd", + ) + # Should not even attempt to construct a manager. + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: pytest.fail("manager should not be constructed on host"), + ) + assert gw._dispatch_via_service_manager_if_s6("start", profile="x") is False + + +def test_dispatch_returns_true_and_calls_start_on_s6( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_via_service_manager_if_s6("start", profile="coder") is True + assert rec.calls == [("start", "gateway-coder")] + + +@pytest.mark.parametrize("action,expected", [ + ("start", "start"), + ("stop", "stop"), + ("restart", "restart"), +]) +def test_dispatch_translates_action_to_manager_method( + monkeypatch: pytest.MonkeyPatch, action: str, expected: str, +) -> None: + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_via_service_manager_if_s6(action, profile="x") is True + assert rec.calls == [(expected, "gateway-x")] + + +def test_dispatch_unknown_action_returns_false( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """An unrecognized action (e.g. 'install') must not silently + succeed — return False so the host code path handles it.""" + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_via_service_manager_if_s6("install", profile="x") is False + assert rec.calls == [] + + +def test_dispatch_defaults_profile_to_default( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When profile is None, the helper resolves it via _profile_arg(). + With no profile context set anywhere, that resolves to "default".""" + from hermes_cli import gateway as gw + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + monkeypatch.setattr( + "hermes_cli.gateway._profile_suffix", lambda: "", + ) + assert gw._dispatch_via_service_manager_if_s6("start") is True + assert rec.calls == [("start", "gateway-default")] diff --git a/tests/hermes_cli/test_profiles_s6_hooks.py b/tests/hermes_cli/test_profiles_s6_hooks.py new file mode 100644 index 00000000000..73a25f90d8f --- /dev/null +++ b/tests/hermes_cli/test_profiles_s6_hooks.py @@ -0,0 +1,190 @@ +"""Tests for the Phase 4 s6 hooks in hermes_cli.profiles. + +Specifically: _allocate_gateway_port, _maybe_register_gateway_service, +_maybe_unregister_gateway_service. The integration with +create_profile and delete_profile is covered indirectly by the +existing TestCreateProfile and TestDeleteProfile classes in +tests/hermes_cli/test_profiles.py; here we only exercise the new +helper surface that doesn't touch the filesystem. +""" +from __future__ import annotations + +from typing import Any + +import pytest + +from hermes_cli.profiles import ( + _allocate_gateway_port, + _maybe_register_gateway_service, + _maybe_unregister_gateway_service, +) + + +# --------------------------------------------------------------------------- +# _allocate_gateway_port +# --------------------------------------------------------------------------- + + +def test_allocate_gateway_port_is_deterministic() -> None: + """Same profile name → same port across calls. This matters because + a profile's gateway must come back up on the same port across + container restarts.""" + a = _allocate_gateway_port("coder") + b = _allocate_gateway_port("coder") + assert a == b + + +def test_allocate_gateway_port_in_advertised_range() -> None: + """[9200, 9800) — the window the helper's docstring promises.""" + for name in ("a", "b", "coder", "assistant", "very-long-profile-name-here"): + port = _allocate_gateway_port(name) + assert 9200 <= port < 9800, f"{name} got {port}" + + +def test_allocate_gateway_port_distributes_across_range() -> None: + """Sanity check: ports for ~100 random-ish names should land in + enough distinct buckets that the distribution is plausibly uniform. + Catches accidental hash truncation that would collapse the range.""" + ports = {_allocate_gateway_port(f"profile-{i}") for i in range(100)} + # 100 inputs mapped into 600 slots — expect at least ~60 distinct. + assert len(ports) >= 60, f"Only {len(ports)} distinct ports across 100 names" + + +# --------------------------------------------------------------------------- +# _maybe_register_gateway_service / _maybe_unregister_gateway_service +# --------------------------------------------------------------------------- + + +class _HostManager: + """Mimics a host backend that doesn't support runtime registration.""" + kind = "systemd" + + def supports_runtime_registration(self) -> bool: + return False + + def register_profile_gateway(self, *args: Any, **kwargs: Any) -> None: + raise AssertionError("host backend register_profile_gateway should not be called") + + def unregister_profile_gateway(self, *args: Any, **kwargs: Any) -> None: + raise AssertionError("host backend unregister_profile_gateway should not be called") + + +class _S6Manager: + """Mimics S6ServiceManager just enough for the hooks.""" + kind = "s6" + + def __init__(self) -> None: + self.registered: list[tuple[str, int]] = [] + self.unregistered: list[str] = [] + self.raise_on_register: Exception | None = None + self.raise_on_unregister: Exception | None = None + + def supports_runtime_registration(self) -> bool: + return True + + def register_profile_gateway( + self, profile: str, *, port: int, + extra_env: dict[str, str] | None = None, + ) -> None: + if self.raise_on_register is not None: + raise self.raise_on_register + self.registered.append((profile, port)) + + def unregister_profile_gateway(self, profile: str) -> None: + if self.raise_on_unregister is not None: + raise self.raise_on_unregister + self.unregistered.append(profile) + + +def test_register_noop_on_host(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: _HostManager(), + ) + # Should NOT raise the AssertionError from _HostManager.register + _maybe_register_gateway_service("hostprof") + + +def test_register_calls_through_on_s6(monkeypatch: pytest.MonkeyPatch) -> None: + mgr = _S6Manager() + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_register_gateway_service("coder") + assert len(mgr.registered) == 1 + profile, port = mgr.registered[0] + assert profile == "coder" + assert 9200 <= port < 9800 + + +def test_register_swallows_duplicate_value_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A pre-existing s6 registration (from container-boot reconcile) + is a benign condition — register must not propagate ValueError.""" + mgr = _S6Manager() + mgr.raise_on_register = ValueError("already registered") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + # Should NOT raise + _maybe_register_gateway_service("coder") + + +def test_register_swallows_arbitrary_error( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], +) -> None: + """Even an unexpected exception from the manager must not bring + down `hermes profile create` — print and continue.""" + mgr = _S6Manager() + mgr.raise_on_register = RuntimeError("svscanctl exploded") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_register_gateway_service("coder") + captured = capsys.readouterr() + assert "Could not register" in captured.out + + +def test_register_swallows_no_backend_runtime_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When `get_service_manager()` raises RuntimeError (no backend + detected), the hook must silently no-op.""" + def _no_backend() -> None: + raise RuntimeError("no supported service manager detected") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", _no_backend, + ) + # Should NOT raise + _maybe_register_gateway_service("anywhere") + + +def test_unregister_noop_on_host(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: _HostManager(), + ) + _maybe_unregister_gateway_service("hostprof") + + +def test_unregister_calls_through_on_s6(monkeypatch: pytest.MonkeyPatch) -> None: + mgr = _S6Manager() + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_unregister_gateway_service("coder") + assert mgr.unregistered == ["coder"] + + +def test_unregister_swallows_errors( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], +) -> None: + mgr = _S6Manager() + mgr.raise_on_unregister = RuntimeError("svc gone weird") + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: mgr, + ) + _maybe_unregister_gateway_service("coder") + captured = capsys.readouterr() + assert "Could not unregister" in captured.out diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index fc2ab6a7896..37076113a09 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -332,8 +332,7 @@ def test_s6_register_creates_service_dir_and_triggers_scan( assert run_path.is_file() assert run_path.stat().st_mode & 0o111 # executable run_text = run_path.read_text() - assert "hermes -p coder gateway start" in run_text - assert "--port 9150" in run_text + assert "hermes -p coder gateway run" in run_text assert "s6-setuidgid hermes" in run_text log_run = svc_dir / "log" / "run"