From fc39296e1ffc6f41d44880e4923a4c5ddb4a26a9 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 23 May 2026 14:56:39 +1000 Subject: [PATCH] fix(service_manager): s6 detection works for unprivileged hermes user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #30136 review surfaced two issues, both rooted in the same audit gap: docker integration tests were running as root, not the unprivileged `hermes` user (UID 10000) that the runtime actually uses via `s6-setuidgid hermes`. Anything that probed PID-1 state or wrote to the s6 control surface worked as root in the tests but was inert in production. Fixes: 1. `_s6_running()` previously called `Path("/proc/1/exe").resolve()`, which is root-only readable. For UID 10000 the symlink yields PermissionError, `resolve()` silently returns the unresolved path, and `exe.name == "exe"` — so detection always returned False, the service-manager runtime-registration path was inert, and every `hermes profile create` / `hermes -p X gateway start` silently skipped the s6 hook. Replace with `/proc/1/comm` (world-readable) + `/run/s6/basedir` (s6-overlay-specific) — both required, fail closed. 2. `02-reconcile-profiles` now also chowns `/run/service/.s6-svscan/` {control,lock} to hermes so `s6-svscanctl -a/-an` works without root. Previously the directory chown stopped at `/run/service` and the FIFO inside stayed root-owned, so `register_profile_gateway` from hermes failed at the rescan-trigger step with EACCES — the wrapper in profiles.py caught the exception and printed a swallowed warning, so profile creation appeared to succeed while the slot was rolled back. Audit changes to flush this class of bug next time: - Add `docker_exec` / `docker_exec_sh` helpers to `tests/docker/conftest.py` that default to `-u hermes`. The module docstring explains why and flags `user="root"` as opt-in only for tests that explicitly need root (none currently do). - Refactor every `docker exec` call in tests/docker/ through the new helpers (test_dashboard.py, test_zombie_reaping.py, test_profile_gateway.py, test_container_restart.py, test_s6_profile_gateway_integration.py). - Add 5 unit tests covering `_s6_running` under various probe states (both signals present; comm wrong; basedir missing; PermissionError on /proc/1/comm; missing /proc — non-Linux). The PermissionError test is the explicit regression guard for the original bug. Known follow-up: the per-service `supervise/control` FIFO inside each `/run/service/gateway-/supervise/` is created root-owned by s6-supervise (which runs as root because s6-svscan is PID 1). `s6-svc -u/-d/-t` from the hermes user will get EACCES on those. The audit under `-u hermes` will reveal this in lifecycle tests — surfacing the issue cleanly so it can be fixed in a focused follow-up (likely via a small SUID helper or a polling chown loop in cont-init.d). The detection + svscanctl fixes here are independent and complete on their own. --- docker/cont-init.d/02-reconcile-profiles | 26 ++++- hermes_cli/service_manager.py | 32 +++++-- tests/docker/conftest.py | 53 +++++++++++ tests/docker/test_container_restart.py | 10 +- tests/docker/test_dashboard.py | 37 +++----- tests/docker/test_profile_gateway.py | 11 ++- .../test_s6_profile_gateway_integration.py | 13 ++- tests/docker/test_zombie_reaping.py | 17 ++-- tests/hermes_cli/test_service_manager.py | 95 +++++++++++++++++++ 9 files changed, 241 insertions(+), 53 deletions(-) diff --git a/docker/cont-init.d/02-reconcile-profiles b/docker/cont-init.d/02-reconcile-profiles index 90b03554f1e..98b1f59ee89 100755 --- a/docker/cont-init.d/02-reconcile-profiles +++ b/docker/cont-init.d/02-reconcile-profiles @@ -16,15 +16,31 @@ # # Phase 4 also needs hermes-user writes to /run/service/ (so the # profile create/delete hooks can register/unregister at runtime), -# so we chown the scandir before invoking the reconciler. The -# .s6-svscan/ subdir stays root-owned; only sibling directories -# (gateway-/) need to be hermes-writable. +# so we chown the scandir before invoking the reconciler. We +# additionally chown the s6-svscan control FIFO so the hermes user +# can send rescan signals via ``s6-svscanctl -a``; without this the +# entire runtime-registration path is inert under UID 10000 (the +# Python wrapper catches the resulting EACCES, prints a warning, +# and swallows the failure). set -e # Make the dynamic scandir hermes-writable. The directory itself -# starts root-owned by s6-overlay; we leave .s6-svscan/ alone since -# only s6 itself writes there. +# starts root-owned by s6-overlay. chown hermes:hermes /run/service 2>/dev/null || true +# Make the svscan control FIFO hermes-writable so s6-svscanctl -a +# / -an work for the hermes user. The FIFO is created by s6-svscan +# at PID-1 startup, so by the time this cont-init.d script runs it +# already exists. Both ``control`` and ``lock`` need to be writable +# for the various svscanctl operations; the directory itself stays +# root-owned (we only need to touch the two FIFOs/locks inside). +if [ -d /run/service/.s6-svscan ]; then + for entry in control lock; do + if [ -e "/run/service/.s6-svscan/$entry" ]; then + chown hermes:hermes "/run/service/.s6-svscan/$entry" 2>/dev/null || true + fi + done +fi + exec s6-setuidgid hermes /opt/hermes/.venv/bin/python -m hermes_cli.container_boot diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 236f2b619e1..18b6ef01664 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -122,16 +122,34 @@ def detect_service_manager() -> ServiceManagerKind: def _s6_running() -> bool: """True when s6-svscan is running as PID 1 in this container. - s6-overlay's /init exec's s6-svscan, so ``/proc/1/exe`` resolves - to it (or to ``init`` on some kernel configurations that hide the - exe link). The ``/run/s6/`` directory is created by stage1, so its - presence is a second necessary signal. + Detection has to work for **both** root and the unprivileged hermes + user (UID 10000). The obvious probe — ``Path('/proc/1/exe').resolve()`` + — only works as root: for any other UID, the symlink at + ``/proc/1/exe`` is unreadable and ``resolve()`` silently returns the + path unchanged, so the resolved name is the literal ``"exe"`` and + detection always fails. Since every Hermes runtime call inside the + container drops to hermes via ``s6-setuidgid``, that silent failure + made the entire service-manager runtime-registration path inert in + production (PR #30136 review). + + Probe instead via: + * ``/proc/1/comm`` — world-readable, contains the process comm + (``s6-svscan`` when s6-overlay is PID 1). + * ``/run/s6/basedir`` — s6-overlay-specific directory created by + stage1. World-readable. More specific than ``/run/s6`` (which + other tools occasionally create). + + Both signals are required; either alone could false-positive + (e.g. a container with the s6 binaries installed but a different + init, or an unrelated process named ``s6-svscan``). """ try: - exe = Path("/proc/1/exe").resolve() - return exe.name in ("s6-svscan", "init") and Path("/run/s6").exists() - except (OSError, RuntimeError): + comm = Path("/proc/1/comm").read_text().strip() + except OSError: return False + if comm != "s6-svscan": + return False + return Path("/run/s6/basedir").is_dir() # --------------------------------------------------------------------------- diff --git a/tests/docker/conftest.py b/tests/docker/conftest.py index 088a71b5fe9..4281a292fae 100644 --- a/tests/docker/conftest.py +++ b/tests/docker/conftest.py @@ -84,3 +84,56 @@ def container_name(request) -> Iterator[str]: ["docker", "rm", "-f", name], capture_output=True, timeout=10, ) + + +# --------------------------------------------------------------------------- +# docker_exec — default to the unprivileged hermes user +# --------------------------------------------------------------------------- +# +# Background: every Hermes runtime path inside the container drops to UID +# 10000 (the ``hermes`` user) via ``s6-setuidgid hermes``. ``docker exec`` +# without ``-u`` runs as root, which is **not** representative of how +# production code executes. PR #30136 review caught a real regression +# this way — ``Path('/proc/1/exe').resolve()`` works as root and silently +# fails (PermissionError swallowed) for hermes, so a test that ran as root +# couldn't catch a feature that was inert for the actual runtime user. +# +# Tests in this directory MUST exercise the realistic user context. The +# helpers below run every probe under ``-u hermes`` unless a specific +# test explicitly opts into ``user="root"`` (rare — e.g. inspecting +# /proc/1/exe itself, chowning a volume). +# --------------------------------------------------------------------------- + + +def docker_exec( + container: str, + *args: str, + user: str = "hermes", + timeout: int = 30, + extra_docker_args: tuple[str, ...] = (), +) -> subprocess.CompletedProcess[str]: + """Run a command inside ``container`` as ``user`` (default: hermes). + + Returns the CompletedProcess with text=True, capture_output=True. + + Pass ``user="root"`` only when the test specifically needs root + capabilities (e.g. reading /proc/1/exe, manipulating ownership). + Most tests should use the default. + """ + cmd = ["docker", "exec", "-u", user, *extra_docker_args, container, *args] + return subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, + ) + + +def docker_exec_sh( + container: str, + command: str, + *, + user: str = "hermes", + timeout: int = 30, +) -> subprocess.CompletedProcess[str]: + """Run ``sh -c `` inside the container as ``user``.""" + return docker_exec( + container, "sh", "-c", command, user=user, timeout=timeout, + ) diff --git a/tests/docker/test_container_restart.py b/tests/docker/test_container_restart.py index b709022c79e..a68057c0c79 100644 --- a/tests/docker/test_container_restart.py +++ b/tests/docker/test_container_restart.py @@ -9,6 +9,10 @@ auto-start only those whose last state was `running`. These tests stand up a container with a named volume, create profiles inside it in various gateway states, restart the container, and assert the reconciler did the right thing. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec` / :func:`docker_exec_sh` in conftest); see +the conftest module docstring. """ from __future__ import annotations @@ -17,6 +21,8 @@ import time import pytest +from tests.docker.conftest import docker_exec, docker_exec_sh + def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]: return subprocess.run( @@ -27,11 +33,11 @@ def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]: def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: - return _docker("exec", container, *args, timeout=timeout) + return docker_exec(container, *args, timeout=timeout) def _sh(container: str, cmd: str, timeout: int = 30) -> subprocess.CompletedProcess[str]: - return _docker("exec", container, "sh", "-c", cmd, timeout=timeout) + return docker_exec_sh(container, cmd, timeout=timeout) @pytest.fixture diff --git a/tests/docker/test_dashboard.py b/tests/docker/test_dashboard.py index 8f965d5bf05..652a2333851 100644 --- a/tests/docker/test_dashboard.py +++ b/tests/docker/test_dashboard.py @@ -5,12 +5,18 @@ it stays dead. After Phase 2 (s6): dashboard starts once; if it crashes it is restarted under supervision. The restart-after-crash test lives in Phase 2 Task 2.5; this file only locks the opt-in surface (which must not change between tini and s6). + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec`/:func:`docker_exec_sh` in conftest), matching +the realistic runtime context. See the conftest module docstring. """ from __future__ import annotations import subprocess import time +from tests.docker.conftest import docker_exec, docker_exec_sh + def _poll(container: str, probe: str, *, deadline_s: float = 30.0, interval_s: float = 0.5) -> tuple[bool, str]: @@ -19,10 +25,7 @@ def _poll(container: str, probe: str, *, deadline_s: float = 30.0, end = time.monotonic() + deadline_s last = "" while time.monotonic() < end: - r = subprocess.run( - ["docker", "exec", container, "sh", "-c", probe], - capture_output=True, text=True, timeout=10, - ) + r = docker_exec_sh(container, probe, timeout=10) last = r.stdout if r.returncode == 0: return True, last @@ -42,11 +45,7 @@ def test_dashboard_not_running_by_default( # Give the entrypoint enough time to finish bootstrap; if a dashboard # were going to start it'd be visible by now. time.sleep(5) - r = subprocess.run( - ["docker", "exec", container_name, - "pgrep", "-f", "hermes dashboard"], - capture_output=True, text=True, timeout=10, - ) + r = docker_exec(container_name, "pgrep", "-f", "hermes dashboard") # pgrep exits non-zero when no match found assert r.returncode != 0, ( "Dashboard should not be running without HERMES_DASHBOARD" @@ -121,10 +120,8 @@ def test_dashboard_restarts_after_crash( # a couple of times before giving up. first_pid: str | None = None for _attempt in range(10): - first_pid_result = subprocess.run( - ["docker", "exec", container_name, - "pgrep", "-f", "hermes dashboard"], - capture_output=True, text=True, timeout=10, + first_pid_result = docker_exec( + container_name, "pgrep", "-f", "hermes dashboard", ) first_pids = first_pid_result.stdout.strip().split() if first_pids: @@ -133,21 +130,15 @@ def test_dashboard_restarts_after_crash( time.sleep(0.5) assert first_pid is not None, "Could not capture initial dashboard PID" - # Kill the dashboard. - subprocess.run( - ["docker", "exec", container_name, "kill", "-9", first_pid], - capture_output=True, timeout=10, - ) + # Kill the dashboard. The dashboard process runs as hermes, so the + # hermes user can kill it (same UID). + docker_exec(container_name, "kill", "-9", first_pid) # s6 backs off ~1s before restart; allow up to 15s for the new # process to appear with a different PID. deadline = time.monotonic() + 15.0 while time.monotonic() < deadline: - r = subprocess.run( - ["docker", "exec", container_name, - "pgrep", "-f", "hermes dashboard"], - capture_output=True, text=True, timeout=10, - ) + r = docker_exec(container_name, "pgrep", "-f", "hermes dashboard") pids = r.stdout.strip().split() if r.returncode == 0 else [] if pids and pids[0] != first_pid: return # success diff --git a/tests/docker/test_profile_gateway.py b/tests/docker/test_profile_gateway.py index 0723d51fd47..ed038684d71 100644 --- a/tests/docker/test_profile_gateway.py +++ b/tests/docker/test_profile_gateway.py @@ -13,22 +13,25 @@ so the gateway process itself will exit with code 1 on every start attempt (s6 will keep restarting it). We assert against s6's ``want up`` / ``want down`` state — which reflects the lifecycle command's intent, not the supervised process's health. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec_sh` in conftest); see the conftest module +docstring. """ from __future__ import annotations import subprocess import time +from tests.docker.conftest import docker_exec_sh + PROFILE = "test-harness-profile" def _sh( container: str, command: str, timeout: int = 30, ) -> subprocess.CompletedProcess[str]: - return subprocess.run( - ["docker", "exec", container, "sh", "-c", command], - capture_output=True, text=True, timeout=timeout, - ) + return docker_exec_sh(container, command, timeout=timeout) def _svstat(container: str) -> str: diff --git a/tests/docker/test_s6_profile_gateway_integration.py b/tests/docker/test_s6_profile_gateway_integration.py index eb5cdca4bb8..103664e2895 100644 --- a/tests/docker/test_s6_profile_gateway_integration.py +++ b/tests/docker/test_s6_profile_gateway_integration.py @@ -10,12 +10,20 @@ gateway actually starting (the binary will refuse to start without a valid profile config). The full register → start → supervised-restart → unregister cycle is covered by Phase 4 once profile create/delete hooks land. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec` in conftest); see the conftest module +docstring. ``/run/service`` is chowned hermes-writable by the +``02-reconcile-profiles`` cont-init.d script, so register/unregister +operations work correctly under UID 10000. """ from __future__ import annotations import subprocess import time +from tests.docker.conftest import docker_exec + _REGISTER_SCRIPT = """ import sys @@ -38,10 +46,7 @@ print("UNREGISTERED") def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess: - return subprocess.run( - ["docker", "exec", container, *args], - capture_output=True, text=True, timeout=timeout, - ) + return docker_exec(container, *args, timeout=timeout) def test_s6_register_creates_service_dir_in_live_container( diff --git a/tests/docker/test_zombie_reaping.py b/tests/docker/test_zombie_reaping.py index 8aa797b57d1..ff31be8c0d2 100644 --- a/tests/docker/test_zombie_reaping.py +++ b/tests/docker/test_zombie_reaping.py @@ -5,12 +5,18 @@ s6-overlay's ``/init`` (Phase 2 PID 1) does the same. This invariant is required for long-running containers spawning subprocesses (subagents, dashboard, dynamic gateways) — otherwise the process table fills with defunct entries and eventually exhausts the kernel PID space. + +Every ``docker exec`` here runs as the unprivileged ``hermes`` user +(via :func:`docker_exec_sh` in conftest); see the conftest module +docstring. """ from __future__ import annotations import subprocess import time +from tests.docker.conftest import docker_exec, docker_exec_sh + def test_orphan_zombies_reaped( built_image: str, container_name: str, @@ -26,17 +32,12 @@ def test_orphan_zombies_reaped( # `( ( sleep 0.1 & ) & ); sleep 1` creates a grandchild detached from # the original docker exec session — it becomes an orphan reparented # to PID 1 in the container. When it exits, PID 1 must reap it. - subprocess.run( - ["docker", "exec", container_name, "sh", "-c", - "( ( sleep 0.1 & ) & ); sleep 1"], - capture_output=True, text=True, timeout=10, + docker_exec_sh( + container_name, "( ( sleep 0.1 & ) & ); sleep 1", timeout=10, ) time.sleep(1) - r = subprocess.run( - ["docker", "exec", container_name, "ps", "axo", "stat,pid,comm"], - capture_output=True, text=True, timeout=10, - ) + r = docker_exec(container_name, "ps", "axo", "stat,pid,comm") zombies = [ line for line in r.stdout.split("\n") if line.strip().startswith("Z") diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index 37076113a09..9bcf4f93064 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -69,6 +69,101 @@ def test_detect_service_manager_returns_known_value() -> None: assert result in ("systemd", "launchd", "windows", "s6", "none") +# --------------------------------------------------------------------------- +# _s6_running — must work for unprivileged users, not just root +# --------------------------------------------------------------------------- + + +def _patch_s6_paths( + monkeypatch: pytest.MonkeyPatch, + *, + comm: str | OSError | None, + basedir_is_dir: bool, +) -> None: + """Stub /proc/1/comm and /run/s6/basedir for _s6_running tests.""" + from pathlib import Path as _Path + + real_read_text = _Path.read_text + real_is_dir = _Path.is_dir + + def fake_read_text(self, *args, **kwargs): # type: ignore[override] + if str(self) == "/proc/1/comm": + if isinstance(comm, OSError): + raise comm + if comm is None: + raise FileNotFoundError(2, "No such file or directory") + return comm + "\n" + return real_read_text(self, *args, **kwargs) + + def fake_is_dir(self): # type: ignore[override] + if str(self) == "/run/s6/basedir": + return basedir_is_dir + return real_is_dir(self) + + monkeypatch.setattr(_Path, "read_text", fake_read_text) + monkeypatch.setattr(_Path, "is_dir", fake_is_dir) + + +def test_s6_running_true_when_comm_and_basedir_match( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli.service_manager import _s6_running + + _patch_s6_paths(monkeypatch, comm="s6-svscan", basedir_is_dir=True) + assert _s6_running() is True + + +def test_s6_running_false_when_comm_is_wrong( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli.service_manager import _s6_running + + # systemd as PID 1, basedir present from some stray s6 install + _patch_s6_paths(monkeypatch, comm="systemd", basedir_is_dir=True) + assert _s6_running() is False + + +def test_s6_running_false_when_basedir_missing( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli.service_manager import _s6_running + + # The comm matches but the basedir is missing — e.g. an unrelated + # process happens to be named "s6-svscan" + _patch_s6_paths(monkeypatch, comm="s6-svscan", basedir_is_dir=False) + assert _s6_running() is False + + +def test_s6_running_false_when_comm_unreadable( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Regression: /proc/1/exe was unreadable to UID 10000 and + resolve() silently returned the unresolved path, making detection + always-False inside the container under the hermes user. The new + probe must FAIL CLOSED — not raise — when /proc/1/comm can't be + read. + """ + from hermes_cli.service_manager import _s6_running + + _patch_s6_paths( + monkeypatch, + comm=PermissionError(13, "Permission denied"), + basedir_is_dir=True, + ) + assert _s6_running() is False + + +def test_s6_running_handles_missing_proc( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """On macOS / Windows / WSL-without-procfs, /proc/1/comm doesn't + exist. Must return False, not raise.""" + from hermes_cli.service_manager import _s6_running + + _patch_s6_paths(monkeypatch, comm=None, basedir_is_dir=False) + assert _s6_running() is False + + # --------------------------------------------------------------------------- # Backend wrappers — kind + registration unsupported on hosts # ---------------------------------------------------------------------------