"""Abstract service manager interface. Wraps the existing systemd (Linux host), launchd (macOS host), Windows Scheduled Task (native Windows host), and s6 (container) backends behind a common Protocol. Only the s6 backend supports runtime registration (for per-profile gateways) — host backends raise NotImplementedError from those methods, and callers MUST check supports_runtime_registration() before invoking them. Host-side call sites (setup wizard, uninstall, status) continue to use the existing module-level functions in hermes_cli.gateway and hermes_cli.gateway_windows directly. This protocol is a thin facade used by new code that needs to be backend-agnostic — specifically the profile create/delete hooks (Phase 4) and the s6 dispatch path in ``hermes gateway start/stop/restart`` when running inside a container. """ from __future__ import annotations import re from pathlib import Path from typing import Literal, Protocol, runtime_checkable ServiceManagerKind = Literal["systemd", "launchd", "windows", "s6", "none"] # Profile name → service directory mapping. Profile names must be safe # as filesystem directory names because the s6 backend creates a service # directory at ``/gateway-/``. We reject anything that # could traverse paths, span filesystems, or break s6's own naming rules. _VALID_PROFILE_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$") _MAX_PROFILE_LEN = 251 # s6-svscan default name_max def validate_profile_name(name: str) -> None: """Raise ValueError if ``name`` is not usable as a profile name. Profile names are used as s6 service directory names, so they must match a conservative subset of filesystem-safe characters. Reject empty strings, uppercase, paths-traversal sequences, and anything longer than s6's default ``name_max``. """ if not name: raise ValueError("profile name must not be empty") if len(name) > _MAX_PROFILE_LEN: raise ValueError( f"profile name too long ({len(name)} > {_MAX_PROFILE_LEN})" ) if not _VALID_PROFILE_RE.match(name): raise ValueError( f"profile name must match [a-z0-9][a-z0-9_-]*, got {name!r}" ) @runtime_checkable class ServiceManager(Protocol): """Abstract interface for init-system-specific service operations. Lifecycle methods (start / stop / restart / is_running) are implemented by every backend. Runtime registration (register_profile_gateway / unregister_profile_gateway / list_profile_gateways) is implemented only by the s6 backend — callers MUST check ``supports_runtime_registration()`` before invoking the registration methods. """ kind: ServiceManagerKind # Lifecycle of a pre-declared service. def start(self, name: str) -> None: ... def stop(self, name: str) -> None: ... def restart(self, name: str) -> None: ... def is_running(self, name: str) -> bool: ... # Runtime registration (s6 only). def supports_runtime_registration(self) -> bool: ... def register_profile_gateway( self, profile: str, *, extra_env: dict[str, str] | None = None, ) -> None: ... def unregister_profile_gateway(self, profile: str) -> None: ... def list_profile_gateways(self) -> list[str]: ... def detect_service_manager() -> ServiceManagerKind: """Detect which service manager is available in this environment. Returns: "s6" — inside a container when /init is s6-svscan (Phase 2+) "windows" — native Windows host "launchd" — macOS host "systemd" — Linux host with a working user/system bus "none" — anything else (Termux, sandbox shells, etc.) This function does NOT replace ``supports_systemd_services()`` — host call sites continue to use that. It exists for new backend- agnostic code (profile create/delete hooks, the s6 dispatch path in ``hermes gateway start/stop/restart``). """ # Imports deferred so importing this module doesn't drag in the # whole gateway dependency graph for callers that only need the # Protocol type or validate_profile_name(). from hermes_constants import is_container from hermes_cli.gateway import ( is_macos, is_windows, supports_systemd_services, ) if is_container() and _s6_running(): return "s6" if is_windows(): return "windows" if is_macos(): return "launchd" if supports_systemd_services(): return "systemd" return "none" def _s6_running() -> bool: """True when s6-svscan is running as PID 1 in this container. Detection has to work for **both** root and the unprivileged hermes user (UID 10000). The obvious probe — ``Path('/proc/1/exe').resolve()`` — only works as root: for any other UID, the symlink at ``/proc/1/exe`` is unreadable and ``resolve()`` silently returns the path unchanged, so the resolved name is the literal ``"exe"`` and detection always fails. Since every Hermes runtime call inside the container drops to hermes via ``s6-setuidgid``, that silent failure made the entire service-manager runtime-registration path inert in production (PR #30136 review). Probe instead via: * ``/proc/1/comm`` — world-readable, contains the process comm (``s6-svscan`` when s6-overlay is PID 1). * ``/run/s6/basedir`` — s6-overlay-specific directory created by stage1. World-readable. More specific than ``/run/s6`` (which other tools occasionally create). Both signals are required; either alone could false-positive (e.g. a container with the s6 binaries installed but a different init, or an unrelated process named ``s6-svscan``). """ try: comm = Path("/proc/1/comm").read_text(encoding="utf-8").strip() except OSError: return False if comm != "s6-svscan": return False return Path("/run/s6/basedir").is_dir() # --------------------------------------------------------------------------- # Backend wrappers # # These adapters are thin facades over the existing module-level functions # in ``hermes_cli.gateway`` (systemd/launchd) and ``hermes_cli.gateway_windows`` # (Windows Scheduled Tasks). The protocol's ``name`` parameter is currently # unused for host backends — they operate on whichever profile is currently # active (set via the ``hermes -p `` flag before the call). This # matches existing host-side semantics; the parameter shape is designed # for s6 where each profile maps to a distinct service directory. # --------------------------------------------------------------------------- class _RegistrationUnsupportedMixin: """Mixin for host backends that don't support runtime registration.""" def supports_runtime_registration(self) -> bool: return False def register_profile_gateway( self, profile: str, *, extra_env: dict[str, str] | None = None, ) -> None: raise NotImplementedError( f"{type(self).__name__} does not support runtime profile " "gateway registration (container-only feature)" ) def unregister_profile_gateway(self, profile: str) -> None: raise NotImplementedError( f"{type(self).__name__} does not support runtime profile " "gateway unregistration (container-only feature)" ) def list_profile_gateways(self) -> list[str]: return [] class SystemdServiceManager(_RegistrationUnsupportedMixin): """Thin wrapper around the ``systemd_*`` functions in hermes_cli.gateway. Existing host call sites continue to use those functions directly; this wrapper exists for new code that needs to be backend-agnostic (the Phase 4 profile create/delete hooks). """ kind: ServiceManagerKind = "systemd" def start(self, name: str) -> None: from hermes_cli.gateway import systemd_start systemd_start() def stop(self, name: str) -> None: from hermes_cli.gateway import systemd_stop systemd_stop() def restart(self, name: str) -> None: from hermes_cli.gateway import systemd_restart systemd_restart() def is_running(self, name: str) -> bool: from hermes_cli.gateway import _probe_systemd_service_running _, running = _probe_systemd_service_running() return running class LaunchdServiceManager(_RegistrationUnsupportedMixin): """Thin wrapper around the ``launchd_*`` functions in hermes_cli.gateway.""" kind: ServiceManagerKind = "launchd" def start(self, name: str) -> None: from hermes_cli.gateway import launchd_start launchd_start() def stop(self, name: str) -> None: from hermes_cli.gateway import launchd_stop launchd_stop() def restart(self, name: str) -> None: from hermes_cli.gateway import launchd_restart launchd_restart() def is_running(self, name: str) -> bool: from hermes_cli.gateway import _probe_launchd_service_running return _probe_launchd_service_running() class WindowsServiceManager(_RegistrationUnsupportedMixin): """Thin wrapper around ``hermes_cli.gateway_windows`` (Scheduled Task / Startup-folder fallback). The native Windows backend uses a Scheduled Task rather than a true init-system service, but for protocol purposes the lifecycle is the same: start / stop / restart / is_running. ``install`` accepts a handful of Windows-specific kwargs (start_now, start_on_login, elevated_handoff) that are passed straight through — non-Windows callers should never invoke ``install`` on this wrapper. """ kind: ServiceManagerKind = "windows" def install( self, *, force: bool = False, start_now: bool | None = None, start_on_login: bool | None = None, elevated_handoff: bool = False, ) -> None: from hermes_cli import gateway_windows gateway_windows.install( force=force, start_now=start_now, start_on_login=start_on_login, elevated_handoff=elevated_handoff, ) def start(self, name: str) -> None: from hermes_cli import gateway_windows gateway_windows.start() def stop(self, name: str) -> None: from hermes_cli import gateway_windows gateway_windows.stop() def restart(self, name: str) -> None: from hermes_cli import gateway_windows gateway_windows.restart() def is_running(self, name: str) -> bool: from hermes_cli import gateway_windows from hermes_cli.gateway import find_gateway_pids if not gateway_windows.is_installed(): return False return bool(find_gateway_pids()) def get_service_manager() -> ServiceManager: """Return the ServiceManager instance for the current environment. Raises: RuntimeError: when no supported backend is available. """ kind = detect_service_manager() if kind == "systemd": return SystemdServiceManager() if kind == "launchd": return LaunchdServiceManager() if kind == "windows": return WindowsServiceManager() if kind == "s6": return S6ServiceManager() raise RuntimeError("no supported service manager detected") # --------------------------------------------------------------------------- # S6ServiceManager (container-only) # # Per-profile gateways are registered dynamically when `hermes profile create` # runs inside the container (Phase 4). Static services (main-hermes, dashboard) # live in /etc/s6-overlay/s6-rc.d/ and are NOT managed by this class — they're # part of the image, not runtime-created. # --------------------------------------------------------------------------- # s6-overlay's dynamic scandir for runtime-registered services. Lives on # tmpfs and is the directory s6-svscan watches. Writes here trigger # automatic supervision on the next rescan. S6_DYNAMIC_SCANDIR = Path("/run/service") S6_SERVICE_PREFIX = "gateway-" # s6-overlay installs its binaries under /command/ and only adds that # directory to PATH for processes started under the supervision tree # (services started by s6-svscan, cont-init.d scripts, etc.). Code # that runs via `docker exec` or any other out-of-tree entry point — # notably our Phase 4 profile create/delete hooks — inherits the # container's base PATH which does NOT include /command/. # # Rather than asking every caller to fix up its environment, the # S6ServiceManager calls s6-* binaries by absolute path via this # constant. We don't use `/usr/bin/s6-…` symlinks because the # s6-overlay-symlinks-noarch tarball only links a subset, and we # want every s6 invocation to be guaranteed-findable. _S6_BIN_DIR = "/command" # UID/GID of the in-image ``hermes`` user. Hardcoded to match what # ``stage2-hook.sh`` enforces (the runtime invariant — see also # tests/docker/test_uid_remap.py). The container starts s6-supervise # under root and immediately drops to this UID via ``s6-setuidgid``. _HERMES_UID = 10000 _HERMES_GID = 10000 def _seed_supervise_skeleton(svc_dir: Path) -> None: """Pre-create the ``supervise/`` and top-level ``event/`` skeleton inside a service directory, owned by the hermes user. Why this exists --------------- When s6-supervise spawns a service it tries to ``mkdir`` two directories: ``/event`` and ``/supervise``, both with mode ``0700``. It also ``mkfifo``s ``/supervise/control`` with mode ``0600``. Because s6-supervise runs as PID 1's effective UID (root) these dirs end up root-owned mode 0700, and an unprivileged client (the ``hermes`` user — UID 10000 — running every Hermes runtime operation via ``s6-setuidgid``) gets ``EACCES`` on any ``s6-svc``, ``s6-svstat``, or ``s6-svwait`` invocation against the slot. The PR #30136 review surfaced this as a real product gap: the entire S6ServiceManager lifecycle (``register/start/stop/unregister _profile_gateway``) was inert in production because every operation is dispatched as the hermes user. Why this works -------------- Reading s6's source (src/supervision/s6-supervise.c::trymkdir + control_init): the ``mkdir`` and ``mkfifo`` calls both treat ``EEXIST`` as success. If the directory is already present, the chown/chmod fix-up that would normally make event/ ``03730 root:root`` is **skipped** entirely — s6-supervise just opens the pre-existing FIFOs and proceeds. So if we lay the skeleton down with hermes ownership before triggering ``s6-svscanctl -a``, s6-supervise inherits our layout and never touches it. Layout produced --------------- ``svc_dir/`` hermes:hermes, 0755 (parent must already exist) ``svc_dir/event/`` hermes:hermes, 03730 (setgid + g+rwx + sticky) ``svc_dir/supervise/`` hermes:hermes, 0755 ``svc_dir/supervise/event/`` hermes:hermes, 03730 ``svc_dir/supervise/control`` hermes:hermes, 0660 (FIFO) The ``death_tally``, ``lock``, and ``status`` regular files end up written by s6-supervise itself (as root), but those land mode 0644 — world-readable — and ``s6-svstat`` only needs read access, so the hermes user reads them fine. If ``svc_dir/log/`` is present (the canonical s6 logger pattern — one s6-supervise instance per service, plus a second for its logger), the same skeleton is seeded under ``log/`` as well: ``log/event/``, ``log/supervise/``, ``log/supervise/event/``, ``log/supervise/control``. Without this, unregister teardown would EACCES on the logger's supervise dir even after the parent slot's supervise/ was hermes-owned. Idempotency ----------- Safe to call against a directory where the skeleton already exists. Existing entries are left untouched (the helper doesn't try to re-chown / re-chmod live FIFOs that s6-supervise may have already opened). Reference --------- Discussed at length on the skarnet `skaware` mailing list in 2020 (``_); see also just-containers/s6-overlay#130. The pre-creation pattern was historically called out as forward-compatibility-fragile, but the EEXIST handling in s6-supervise has been stable since 2015 — it's the same pattern ``s6-svperms`` and ``fix-attrs.d`` rely on. """ import os def _mkdir_owned(path: Path, mode: int) -> None: if path.exists(): return path.mkdir(parents=False, exist_ok=False) path.chmod(mode) try: os.chown(path, _HERMES_UID, _HERMES_GID) except PermissionError: # Running as the hermes user already — directory is hermes- # owned by default. The chown is a no-op in that case, so # swallowing this keeps both root and unprivileged callers # on one code path. pass # Top-level event/ dir (this is the s6-svlisten1 event-subscription # dir at the service root, distinct from supervise/event/). _mkdir_owned(svc_dir / "event", 0o3730) # supervise/ dir + its inner event/ dir. supervise = svc_dir / "supervise" _mkdir_owned(supervise, 0o755) _mkdir_owned(supervise / "event", 0o3730) # supervise/control FIFO. Same EEXIST-safe pattern: if it's already # there (s6-supervise has already started against this slot), leave # it alone. The explicit chmod after mkfifo is required because # mkfifo honors the process umask, which can strip group-write # (e.g. the default 0022 on most dev hosts → 0o660 becomes 0o640). # The container runs with umask 0 inside s6-overlay's stage2, but # being defensive here keeps the helper consistent under any # invocation context. control = supervise / "control" if not control.exists(): os.mkfifo(control, 0o660) control.chmod(0o660) try: os.chown(control, _HERMES_UID, _HERMES_GID) except PermissionError: pass # If a log/ subdir is present (the canonical s6 logger pattern — # see servicedir(7)), it gets its own s6-supervise instance and # needs the same skeleton. Without this, unregister teardown # would EACCES on the logger's root-owned supervise/ dir even # when the parent slot's supervise/ is hermes-owned. log_dir = svc_dir / "log" if log_dir.is_dir(): _mkdir_owned(log_dir / "event", 0o3730) log_supervise = log_dir / "supervise" _mkdir_owned(log_supervise, 0o755) _mkdir_owned(log_supervise / "event", 0o3730) log_control = log_supervise / "control" if not log_control.exists(): os.mkfifo(log_control, 0o660) log_control.chmod(0o660) try: os.chown(log_control, _HERMES_UID, _HERMES_GID) except PermissionError: pass class S6Error(RuntimeError): """Base error for S6ServiceManager lifecycle failures. Concrete subclasses carry the slot name (and, where useful, the underlying subprocess output) so the CLI can render an actionable message instead of leaking a raw ``CalledProcessError`` traceback. """ def __init__(self, message: str, *, service: str | None = None) -> None: super().__init__(message) self.service = service class GatewayNotRegisteredError(S6Error): """Raised when a lifecycle method targets a slot that doesn't exist. Most commonly: ``hermes -p typo gateway start`` when no profile ``typo`` exists. Carries the unprefixed profile name (not the full ``gateway-`` service-dir name) so callers can phrase a user-facing message like "no such gateway 'typo'". """ def __init__(self, profile: str) -> None: self.profile = profile super().__init__( f"no such gateway {profile!r}: register it with " f"`hermes profile create {profile}` first, or pass " "an existing profile name via `-p `", service=f"gateway-{profile}", ) class S6CommandError(S6Error): """Raised when an s6 command fails for a reason other than a missing slot — e.g. permission denied on the supervise control FIFO, or s6-svc returning a non-zero exit for an unexpected reason. Carries the stderr from the failing command so callers can surface it. """ def __init__( self, *, service: str, action: str, returncode: int, stderr: str, ) -> None: self.action = action self.returncode = returncode self.stderr = stderr message = ( f"s6-svc {action} on {service!r} failed (rc={returncode})" ) if stderr.strip(): message += f": {stderr.strip()}" super().__init__(message, service=service) class S6ServiceManager: """Per-profile gateway supervision via s6-overlay. Only handles runtime-registered services under ``S6_DYNAMIC_SCANDIR``. Static services (main-hermes, dashboard) are managed by s6-rc at image-build time and are out of scope. """ kind: ServiceManagerKind = "s6" def __init__(self, scandir: Path = S6_DYNAMIC_SCANDIR) -> None: self.scandir = scandir # -- internal helpers -------------------------------------------------- def _service_dir(self, profile: str) -> Path: validate_profile_name(profile) return self.scandir / f"{S6_SERVICE_PREFIX}{profile}" def _service_name(self, profile: str) -> str: return f"{S6_SERVICE_PREFIX}{profile}" @staticmethod def _render_run_script( profile: str, extra_env: dict[str, str], ) -> str: """Generate the run script for a profile-gateway s6 service. The script: 1. Sources HERMES_HOME (and any extra env) via with-contenv — so e.g. ``-e HERMES_HOME=/data/hermes`` is honored at run time, not Python-substituted at registration time (OQ8-C). 2. Resets ``HOME`` to ``/opt/data`` before the privilege drop so with-contenv's root HOME does not leak into the unprivileged gateway process. 3. Activates the bundled venv. 4. Drops to the hermes user and exec's ``hermes -p gateway run`` (or just ``hermes gateway run`` for the default profile — see below). Special case: ``profile == "default"`` emits ``hermes gateway run`` with **no** ``-p`` flag. This is the sentinel for "the root HERMES_HOME profile" (the implicit profile that exists at the top of $HERMES_HOME, not under profiles/). It must be spelled this way because ``_profile_suffix()`` returns the empty string for the root profile, and the dispatcher in ``hermes_cli.gateway`` maps that empty string to the ``gateway-default`` service slot. Passing ``-p default`` here would instead look up ``$HERMES_HOME/profiles/default/`` — a completely different (and almost always nonexistent) profile. Port selection: the gateway picks its bind port from the profile's ``config.yaml`` (``[gateway] port = ...``) — that is the single source of truth. Previously this method took a ``port`` parameter that was passed in but never substituted into the rendered script (it was carried in for "API parity" with a deterministic SHA-256 allocator in ``hermes_cli.profiles._allocate_gateway_port``). PR #30136 review item I5 retired both the allocator and the parameter because they were dead code through the entire stack. """ import shlex lines = [ "#!/command/with-contenv sh", "# shellcheck shell=sh", "set -e", "export HOME=/opt/data", "cd /opt/data", ". /opt/hermes/.venv/bin/activate", ] for k, v in sorted(extra_env.items()): lines.append(f"export {k}={shlex.quote(v)}") # Sentinel for the supervised-child path. Prevents recursive # redirect when the supervised gateway re-enters # `_gateway_command_inner` with subcmd == "run" — without it the # supervisor would dispatch `gateway start` which would re-exec # `gateway run --replace` which would re-dispatch `gateway # start`, etc. See `_gateway_command_inner` for the matching # guard. lines.append("export HERMES_S6_SUPERVISED_CHILD=1") if profile == "default": lines.append("exec s6-setuidgid hermes hermes gateway run") else: lines.append( f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run" ) return "\n".join(lines) + "\n" @staticmethod def _render_log_run(profile: str) -> str: """Generate the log/run script for a profile-gateway service. OQ8-C: persist to ``${HERMES_HOME}/logs/gateways//``. CRITICAL: the HERMES_HOME path is sourced from the runtime env via with-contenv — NOT Python-substituted at registration time — so a container started with ``-e HERMES_HOME=/data/hermes`` gets its logs under /data/hermes/logs/..., not the build-time default. Output routing — the script is two action directives, applied per line, in order: 1. ``1`` (forward to stdout) — propagates the line up the s6-supervise pipeline to /init's stdout, which is the container's stdout, which is ``docker logs``. Without this, supervised stdout would be terminated inside s6-log and never reach the container's log stream; users would have to ``docker exec`` and ``tail`` the file just to see startup banners. (Python's ``logging`` module defaults to stderr, which s6-supervise leaves unfiltered — so warnings/errors already reach docker logs. This change is specifically about the rich-console banner output and other plain stdout writes.) 2. ``T `` — also write a timestamped copy to the rotated log directory (``current`` + archived ``@*.s`` files). This is what ``hermes logs`` reads and what persists across container restarts via the volume mount. ``T`` is non-sticky: it only prefixes lines for the next action directive. We deliberately put ``T`` between ``1`` and the log dir (not before ``1``) so: * ``docker logs`` shows raw lines — Python's logging formatter has its own timestamps, and ``docker logs --timestamps`` adds a third layer when desired. No double-stamping in the most common reading path. * The persisted file gets s6-log's own ISO 8601 timestamp so even output that lacked a Python-logger timestamp (rich banners, third-party libs' raw prints) is correlatable in ``current``. """ import shlex prof = shlex.quote(profile) return ( f"#!/command/with-contenv sh\n" f"# shellcheck shell=sh\n" f': "${{HERMES_HOME:=/opt/data}}"\n' f'log_dir="$HERMES_HOME/logs/gateways/{prof}"\n' f'mkdir -p "$log_dir"\n' f'chown -R hermes:hermes "$log_dir" 2>/dev/null || true\n' f'exec s6-setuidgid hermes s6-log 1 n10 s1000000 T "$log_dir"\n' ) # -- lifecycle --------------------------------------------------------- def _run_svc(self, action_flag: str, action_label: str, name: str) -> None: """Shared lifecycle dispatch for start / stop / restart. Translates the two failure modes operators care about into named errors: * ``GatewayNotRegisteredError`` — the service directory at ``//`` doesn't exist. ``s6-svc`` would exit non-zero with a fairly opaque message; we pre-empt it with a clear "no such gateway 'X'" tied to the profile name (without the ``gateway-`` prefix). * ``S6CommandError`` — anything else (EACCES on the supervise control FIFO, timeout, etc.). Carries the subprocess return code and stderr so callers can render them inline. ``action_flag`` is the ``s6-svc`` flag (``-u`` / ``-d`` / ``-t``); ``action_label`` is the human verb (``start`` / ``stop`` / ``restart``) used in error messages. """ import subprocess service_dir = self.scandir / name if not service_dir.is_dir(): # Strip the gateway- prefix back off so the message # matches what the user typed on the CLI (``-p ``). profile = ( name[len(S6_SERVICE_PREFIX):] if name.startswith(S6_SERVICE_PREFIX) else name ) raise GatewayNotRegisteredError(profile) try: subprocess.run( [f"{_S6_BIN_DIR}/s6-svc", action_flag, str(service_dir)], check=True, capture_output=True, text=True, timeout=5, ) except subprocess.CalledProcessError as exc: raise S6CommandError( service=name, action=action_label, returncode=exc.returncode, stderr=exc.stderr or "", ) from exc def start(self, name: str) -> None: """Bring up a registered service (``s6-svc -u``). Raises: GatewayNotRegisteredError: no service directory for ``name``. S6CommandError: s6-svc exited non-zero for any other reason (permission denied on the supervise FIFO, timeout, etc.). """ self._run_svc("-u", "start", name) def stop(self, name: str) -> None: """Bring down a registered service (``s6-svc -d``). Raises: GatewayNotRegisteredError: no service directory for ``name``. S6CommandError: s6-svc exited non-zero for any other reason. """ self._run_svc("-d", "stop", name) def restart(self, name: str) -> None: """Restart a registered service (``s6-svc -t`` = SIGTERM). Raises: GatewayNotRegisteredError: no service directory for ``name``. S6CommandError: s6-svc exited non-zero for any other reason. """ self._run_svc("-t", "restart", name) def is_running(self, name: str) -> bool: """True iff ``s6-svstat`` reports the service as up.""" import subprocess result = subprocess.run( [f"{_S6_BIN_DIR}/s6-svstat", str(self.scandir / name)], capture_output=True, text=True, timeout=5, ) return result.returncode == 0 and "up " in result.stdout # -- runtime registration --------------------------------------------- def supports_runtime_registration(self) -> bool: return True def register_profile_gateway( self, profile: str, *, extra_env: dict[str, str] | None = None, ) -> None: """Create the s6 service directory for a profile gateway. Triggers ``s6-svscanctl -a`` so s6-svscan picks the new directory up immediately. The service is created in the *up* state — to register without auto-starting, follow up with ``stop(profile)`` (or pass the start flag via the future ``start_now=False`` arg, which the Phase 4 reconciliation path uses via a ``down`` marker file written directly). Raises: ValueError: if the profile name is invalid or the service directory already exists. RuntimeError: if ``s6-svscanctl`` fails. """ import shutil import subprocess svc_dir = self._service_dir(profile) if svc_dir.exists(): raise ValueError( f"profile gateway {profile!r} already registered at {svc_dir}" ) # Build the service directory atomically: write to a sibling # temp dir, then rename. Avoids s6-svscan observing a half- # populated directory on a fast rescan. tmp_dir = svc_dir.with_name(svc_dir.name + ".tmp") if tmp_dir.exists(): shutil.rmtree(tmp_dir, ignore_errors=True) tmp_dir.mkdir(parents=True) try: (tmp_dir / "type").write_text("longrun\n") run_script = self._render_run_script(profile, extra_env or {}) run_path = tmp_dir / "run" run_path.write_text(run_script) run_path.chmod(0o755) # Persistent log rotation (OQ8-C). log_subdir = tmp_dir / "log" log_subdir.mkdir() log_run = log_subdir / "run" log_run.write_text(self._render_log_run(profile)) log_run.chmod(0o755) # Pre-create the supervise/ skeleton with hermes ownership # BEFORE we publish the slot. s6-supervise will EEXIST our # dirs/FIFOs and inherit the ownership, so the runtime # s6-svc / s6-svstat / s6-svwait calls (all dispatched as # the hermes user) won't hit EACCES on root-owned 0700 # dirs. See ``_seed_supervise_skeleton`` for the full # rationale. _seed_supervise_skeleton(tmp_dir) tmp_dir.rename(svc_dir) except Exception: shutil.rmtree(tmp_dir, ignore_errors=True) raise # Trigger rescan so s6-svscan picks up the new service. result = subprocess.run( [f"{_S6_BIN_DIR}/s6-svscanctl", "-a", str(self.scandir)], capture_output=True, text=True, timeout=5, ) if result.returncode != 0: # Clean up: rescan failed, leave the directory in place would # be confusing (no supervisor watching it). shutil.rmtree(svc_dir, ignore_errors=True) raise RuntimeError( f"s6-svscanctl failed: {result.stderr or result.stdout}" ) def unregister_profile_gateway(self, profile: str) -> None: """Stop the profile gateway service and remove its directory. Idempotent: absent services are a no-op. Best-effort stop + wait-for-down before removal so the running gateway process gets a chance to shut down cleanly before its service dir disappears. Teardown ordering matters: ``s6-svscanctl -an`` is fired **before** ``rmtree`` so s6-svscan reaps the supervise child process (releasing its handle on ``supervise/lock`` and the regular files inside the supervise dir), giving us a clean directory to remove. Without the reap-first ordering, the rmtree races s6-supervise on a set of root-owned files inside the supervise dir and the dir is left half-removed. """ import shutil import subprocess import time svc_dir = self._service_dir(profile) if not svc_dir.exists(): return # Stop the service (best effort — service may already be down). subprocess.run( [f"{_S6_BIN_DIR}/s6-svc", "-d", str(svc_dir)], capture_output=True, text=True, timeout=5, check=False, ) # Wait for it to actually go down (up to 10s). subprocess.run( [f"{_S6_BIN_DIR}/s6-svwait", "-D", "-t", "10000", str(svc_dir)], capture_output=True, text=True, timeout=15, check=False, ) # Reap the supervise child FIRST: -n tells s6-svscan to drop # any supervise processes whose service dir is gone (which # includes any service dir we're about to remove). This # releases the file handles s6-supervise holds against the # supervise/lock + supervise/status + supervise/death_tally # files inside the slot, so the upcoming rmtree doesn't race. subprocess.run( [f"{_S6_BIN_DIR}/s6-svscanctl", "-an", str(self.scandir)], capture_output=True, text=True, timeout=5, check=False, ) # Give s6-svscan a moment to reap. There's no synchronous # "scan completed" handshake — the -a/-n trigger just sets a # flag s6-svscan reads on its next loop iteration. 200ms is # comfortably above the loop's resolution but well under any # user-perceived latency. time.sleep(0.2) # Now the supervise dir's files are no longer held open by a # live s6-supervise, so rmtree can remove them. Files inside # supervise/ are root-owned (death_tally, lock, status, written # by s6-supervise itself) — but the parent supervise/ directory # is hermes-owned (see ``_seed_supervise_skeleton``), and on # POSIX you only need write+execute on the parent to remove # contained files regardless of file ownership. shutil.rmtree(svc_dir, ignore_errors=True) def list_profile_gateways(self) -> list[str]: """Return the profile names of all currently-registered gateway services. Filters the scandir to entries that match the ``gateway-`` prefix. Other services (e.g. ``s6-linux-init-shutdownd``) are ignored. """ if not self.scandir.exists(): return [] profiles: list[str] = [] for entry in self.scandir.iterdir(): if entry.name.startswith("."): continue if not entry.is_dir(): continue if not entry.name.startswith(S6_SERVICE_PREFIX): continue profiles.append(entry.name[len(S6_SERVICE_PREFIX):]) return profiles