From b28b3f51d3e803bf12cdba17c2769f883636e555 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 23 May 2026 15:20:41 +1000 Subject: [PATCH] fix(service_manager): friendly errors for missing slots and s6-svc failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #30136 review caught: `S6ServiceManager.start/stop/restart` called `subprocess.run(check=True)` on `s6-svc`, so any failure surfaced as a raw `CalledProcessError` traceback. The two cases operators actually hit are: 1. The service slot doesn't exist — most commonly because the user typed a profile name wrong (`hermes -p typo gateway start`). 2. s6-svc itself fails — most commonly EACCES on the supervise control FIFO when running unprivileged. Both deserve named errors with actionable messages, not stacktraces. Changes: * Add `S6Error` base + two concrete errors in `hermes_cli.service_manager`: - `GatewayNotRegisteredError(profile)` — carries the unprefixed profile name; message: `no such gateway 'typo': register it with `hermes profile create typo` first, or pass an existing profile name via `-p ``. - `S6CommandError(service, action, returncode, stderr)` — carries the s6-svc rc and stderr; message: `s6-svc start on 'gateway-coder' failed (rc=111): `. * Factor lifecycle dispatch through `_run_svc(flag, label, name)`: pre-checks that the service directory exists (raises GatewayNotRegisteredError before invoking s6-svc), then runs s6-svc and translates any CalledProcessError into S6CommandError. * `_dispatch_via_service_manager_if_s6` in `hermes_cli.gateway` catches both errors and prints `✗ ` + `sys.exit(1)` instead of letting the exception bubble. The dispatch path that used to dump a traceback at the user now gives an actionable one-liner. Tests: 6 new tests for the error types and their CLI rendering; existing lifecycle test pre-seeds the slot directory before calling `mgr.start` etc. --- hermes_cli/gateway.py | 30 ++-- hermes_cli/service_manager.py | 140 ++++++++++++++++--- tests/hermes_cli/test_gateway_s6_dispatch.py | 74 ++++++++++ tests/hermes_cli/test_service_manager.py | 105 ++++++++++++++ 4 files changed, 321 insertions(+), 28 deletions(-) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index d3eeb757fc4..a3b08751257 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -5037,10 +5037,13 @@ def _dispatch_via_service_manager_if_s6( profile defaults to the current one (resolved via ``_profile_arg``). The s6 service slot was created either by the Phase 4 profile-create hook or by the container-boot reconciler (cont-init.d/02-…). If it - doesn't exist, ``s6-svc`` will raise CalledProcessError — caller - sees that as a normal failure path. + doesn't exist or s6 returns an error, the named errors from + :mod:`hermes_cli.service_manager` are caught and surfaced as + actionable CLI messages (no raw ``CalledProcessError`` traceback). """ from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6CommandError, detect_service_manager, get_service_manager, ) @@ -5055,14 +5058,21 @@ def _dispatch_via_service_manager_if_s6( profile = _profile_suffix() or "default" mgr = get_service_manager() service_name = f"gateway-{profile}" - if action == "start": - mgr.start(service_name) - elif action == "stop": - mgr.stop(service_name) - elif action == "restart": - mgr.restart(service_name) - else: - return False + try: + if action == "start": + mgr.start(service_name) + elif action == "stop": + mgr.stop(service_name) + elif action == "restart": + mgr.restart(service_name) + else: + return False + except GatewayNotRegisteredError as exc: + print(f"✗ {exc}") + sys.exit(1) + except S6CommandError as exc: + print(f"✗ {exc}") + sys.exit(1) return True diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 461a2c98601..f8f99051317 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -342,6 +342,60 @@ S6_SERVICE_PREFIX = "gateway-" _S6_BIN_DIR = "/command" +class S6Error(RuntimeError): + """Base error for S6ServiceManager lifecycle failures. + + Concrete subclasses carry the slot name (and, where useful, the + underlying subprocess output) so the CLI can render an actionable + message instead of leaking a raw ``CalledProcessError`` traceback. + """ + + def __init__(self, message: str, *, service: str | None = None) -> None: + super().__init__(message) + self.service = service + + +class GatewayNotRegisteredError(S6Error): + """Raised when a lifecycle method targets a slot that doesn't exist. + + Most commonly: ``hermes -p typo gateway start`` when no profile + ``typo`` exists. Carries the unprefixed profile name (not the + full ``gateway-`` service-dir name) so callers can phrase + a user-facing message like "no such gateway 'typo'". + """ + + def __init__(self, profile: str) -> None: + self.profile = profile + super().__init__( + f"no such gateway {profile!r}: register it with " + f"`hermes profile create {profile}` first, or pass " + "an existing profile name via `-p `", + service=f"gateway-{profile}", + ) + + +class S6CommandError(S6Error): + """Raised when an s6 command fails for a reason other than a + missing slot — e.g. permission denied on the supervise control + FIFO, or s6-svc returning a non-zero exit for an unexpected + reason. Carries the stderr from the failing command so callers + can surface it. + """ + + def __init__( + self, *, service: str, action: str, returncode: int, stderr: str, + ) -> None: + self.action = action + self.returncode = returncode + self.stderr = stderr + message = ( + f"s6-svc {action} on {service!r} failed (rc={returncode})" + ) + if stderr.strip(): + message += f": {stderr.strip()}" + super().__init__(message, service=service) + + class S6ServiceManager: """Per-profile gateway supervision via s6-overlay. @@ -446,29 +500,79 @@ class S6ServiceManager: # -- lifecycle --------------------------------------------------------- - def start(self, name: str) -> None: - """Bring up a registered service (``s6-svc -u``).""" + def _run_svc(self, action_flag: str, action_label: str, name: str) -> None: + """Shared lifecycle dispatch for start / stop / restart. + + Translates the two failure modes operators care about into + named errors: + + * ``GatewayNotRegisteredError`` — the service directory at + ``//`` doesn't exist. ``s6-svc`` would + exit non-zero with a fairly opaque message; we pre-empt + it with a clear "no such gateway 'X'" tied to the profile + name (without the ``gateway-`` prefix). + * ``S6CommandError`` — anything else (EACCES on the + supervise control FIFO, timeout, etc.). Carries the + subprocess return code and stderr so callers can render + them inline. + + ``action_flag`` is the ``s6-svc`` flag (``-u`` / ``-d`` / + ``-t``); ``action_label`` is the human verb (``start`` / + ``stop`` / ``restart``) used in error messages. + """ import subprocess - subprocess.run( - [f"{_S6_BIN_DIR}/s6-svc", "-u", str(self.scandir / name)], - check=True, capture_output=True, timeout=5, - ) + + service_dir = self.scandir / name + if not service_dir.is_dir(): + # Strip the gateway- prefix back off so the message + # matches what the user typed on the CLI (``-p ``). + profile = ( + name[len(S6_SERVICE_PREFIX):] + if name.startswith(S6_SERVICE_PREFIX) + else name + ) + raise GatewayNotRegisteredError(profile) + + try: + subprocess.run( + [f"{_S6_BIN_DIR}/s6-svc", action_flag, str(service_dir)], + check=True, capture_output=True, text=True, timeout=5, + ) + except subprocess.CalledProcessError as exc: + raise S6CommandError( + service=name, + action=action_label, + returncode=exc.returncode, + stderr=exc.stderr or "", + ) from exc + + def start(self, name: str) -> None: + """Bring up a registered service (``s6-svc -u``). + + Raises: + GatewayNotRegisteredError: no service directory for ``name``. + S6CommandError: s6-svc exited non-zero for any other reason + (permission denied on the supervise FIFO, timeout, etc.). + """ + self._run_svc("-u", "start", name) def stop(self, name: str) -> None: - """Bring down a registered service (``s6-svc -d``).""" - import subprocess - subprocess.run( - [f"{_S6_BIN_DIR}/s6-svc", "-d", str(self.scandir / name)], - check=True, capture_output=True, timeout=5, - ) + """Bring down a registered service (``s6-svc -d``). + + Raises: + GatewayNotRegisteredError: no service directory for ``name``. + S6CommandError: s6-svc exited non-zero for any other reason. + """ + self._run_svc("-d", "stop", name) def restart(self, name: str) -> None: - """Restart a registered service (``s6-svc -t`` = SIGTERM).""" - import subprocess - subprocess.run( - [f"{_S6_BIN_DIR}/s6-svc", "-t", str(self.scandir / name)], - check=True, capture_output=True, timeout=5, - ) + """Restart a registered service (``s6-svc -t`` = SIGTERM). + + Raises: + GatewayNotRegisteredError: no service directory for ``name``. + S6CommandError: s6-svc exited non-zero for any other reason. + """ + self._run_svc("-t", "restart", name) def is_running(self, name: str) -> bool: """True iff ``s6-svstat`` reports the service as up.""" diff --git a/tests/hermes_cli/test_gateway_s6_dispatch.py b/tests/hermes_cli/test_gateway_s6_dispatch.py index e4a1969d3fd..ba83c1a1187 100644 --- a/tests/hermes_cli/test_gateway_s6_dispatch.py +++ b/tests/hermes_cli/test_gateway_s6_dispatch.py @@ -259,3 +259,77 @@ def test_dispatch_all_unknown_action_returns_false( ), ) assert gw._dispatch_all_via_service_manager_if_s6("start") is False + + +# --------------------------------------------------------------------------- +# Friendly error rendering — GatewayNotRegisteredError / S6CommandError +# (PR #30136 review item I2) +# --------------------------------------------------------------------------- + + +def test_dispatch_renders_gateway_not_registered_friendly( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """`hermes -p typo gateway start` should print a clear message and + exit 1 — not dump a traceback at the user.""" + from hermes_cli import gateway as gw + from hermes_cli.service_manager import GatewayNotRegisteredError + + class _RaisesMissing: + kind = "s6" + + def start(self, name: str) -> None: + raise GatewayNotRegisteredError("typo") + + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: _RaisesMissing(), + ) + + with pytest.raises(SystemExit) as excinfo: + gw._dispatch_via_service_manager_if_s6("start", profile="typo") + assert excinfo.value.code == 1 + out = capsys.readouterr().out + assert "no such gateway 'typo'" in out + assert "hermes profile create typo" in out + # And critically: no traceback prefix. + assert "Traceback" not in out + + +def test_dispatch_renders_s6_command_error_friendly( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """An s6-svc failure (e.g. EACCES on the supervise FIFO) should + surface the stderr inline, not as an opaque traceback.""" + from hermes_cli import gateway as gw + from hermes_cli.service_manager import S6CommandError + + class _RaisesS6Error: + kind = "s6" + + def start(self, name: str) -> None: + raise S6CommandError( + service=name, + action="start", + returncode=111, + stderr="s6-svc: fatal: Permission denied", + ) + + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: _RaisesS6Error(), + ) + + with pytest.raises(SystemExit) as excinfo: + gw._dispatch_via_service_manager_if_s6("start", profile="coder") + assert excinfo.value.code == 1 + out = capsys.readouterr().out + assert "rc=111" in out + assert "Permission denied" in out + assert "Traceback" not in out diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index 9bcf4f93064..e9c85f33267 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -550,6 +550,10 @@ def test_s6_lifecycle_dispatches_to_s6_svc( ) -> None: from hermes_cli.service_manager import S6ServiceManager mgr = S6ServiceManager(scandir=s6_scandir) + # _run_svc now verifies the slot exists before invoking s6-svc, so + # we have to pre-seed the dir. In real use the slot is created by + # register_profile_gateway or the cont-init.d reconciler. + (s6_scandir / "gateway-coder").mkdir() mgr.start("gateway-coder") mgr.stop("gateway-coder") mgr.restart("gateway-coder") @@ -558,6 +562,107 @@ def test_s6_lifecycle_dispatches_to_s6_svc( assert flags == ["-u", "-d", "-t"] +# --------------------------------------------------------------------------- +# Lifecycle errors — friendly messages, not raw CalledProcessError +# --------------------------------------------------------------------------- + + +def test_lifecycle_raises_gateway_not_registered_for_missing_slot( + s6_scandir, fake_subprocess_run, +) -> None: + """When the service slot doesn't exist, the lifecycle methods + must raise GatewayNotRegisteredError BEFORE invoking s6-svc, so + the user sees a clear 'no such gateway' message instead of an + opaque CalledProcessError stacktrace.""" + from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6ServiceManager, + ) + + mgr = S6ServiceManager(scandir=s6_scandir) + # No gateway-typo/ directory exists — slot is missing. + with pytest.raises(GatewayNotRegisteredError) as excinfo: + mgr.start("gateway-typo") + assert excinfo.value.profile == "typo" + assert excinfo.value.service == "gateway-typo" + msg = str(excinfo.value) + assert "'typo'" in msg + assert "hermes profile create typo" in msg + # And critically: s6-svc was NOT invoked. + assert not any(c[0] == "s6-svc" for c in fake_subprocess_run) + + +@pytest.mark.parametrize("action,method_name", [ + ("start", "start"), + ("stop", "stop"), + ("restart", "restart"), +]) +def test_all_lifecycle_methods_check_for_missing_slot( + s6_scandir, + fake_subprocess_run, + action: str, + method_name: str, +) -> None: + """start/stop/restart all check for missing slots the same way.""" + from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6ServiceManager, + ) + + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(GatewayNotRegisteredError): + getattr(mgr, method_name)("gateway-absent") + + +def test_gateway_not_registered_unprefixed_service_name(s6_scandir) -> None: + """If the caller passes a name without the 'gateway-' prefix (the + Protocol allows arbitrary service names), the error still carries + that name verbatim as the 'profile' so error messages don't + accidentally strip user-provided text.""" + from hermes_cli.service_manager import ( + GatewayNotRegisteredError, + S6ServiceManager, + ) + + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(GatewayNotRegisteredError) as excinfo: + mgr.start("not-prefixed") + assert excinfo.value.profile == "not-prefixed" + + +def test_lifecycle_raises_s6_command_error_on_subprocess_failure( + s6_scandir, monkeypatch: pytest.MonkeyPatch, +) -> None: + """When s6-svc itself fails (non-zero exit) — e.g. EACCES on the + supervise control FIFO — the lifecycle methods translate the + CalledProcessError into a named S6CommandError carrying the + return code and stderr.""" + import subprocess as _sp + from hermes_cli.service_manager import S6CommandError, S6ServiceManager + + # Pre-create the slot so we reach the s6-svc call. + (s6_scandir / "gateway-coder").mkdir() + + def _fail(cmd, **kw): + raise _sp.CalledProcessError( + returncode=111, + cmd=cmd, + stderr="s6-svc: fatal: unable to control supervise/control: " + "Permission denied\n", + ) + monkeypatch.setattr("subprocess.run", _fail) + + mgr = S6ServiceManager(scandir=s6_scandir) + with pytest.raises(S6CommandError) as excinfo: + mgr.start("gateway-coder") + assert excinfo.value.service == "gateway-coder" + assert excinfo.value.action == "start" + assert excinfo.value.returncode == 111 + assert "Permission denied" in excinfo.value.stderr + assert "Permission denied" in str(excinfo.value) + assert "rc=111" in str(excinfo.value) + + def test_s6_is_running_parses_svstat( s6_scandir, monkeypatch: pytest.MonkeyPatch, ) -> None: