fix(service_manager): friendly errors for missing slots and s6-svc failures

PR #30136 review caught: `S6ServiceManager.start/stop/restart` called
`subprocess.run(check=True)` on `s6-svc`, so any failure surfaced as
a raw `CalledProcessError` traceback. The two cases operators
actually hit are:

  1. The service slot doesn't exist — most commonly because the user
     typed a profile name wrong (`hermes -p typo gateway start`).
  2. s6-svc itself fails — most commonly EACCES on the supervise
     control FIFO when running unprivileged.

Both deserve named errors with actionable messages, not stacktraces.

Changes:

* Add `S6Error` base + two concrete errors in `hermes_cli.service_manager`:
    - `GatewayNotRegisteredError(profile)` — carries the unprefixed
      profile name; message: `no such gateway 'typo': register it
      with `hermes profile create typo` first, or pass an existing
      profile name via `-p <name>``.
    - `S6CommandError(service, action, returncode, stderr)` — carries
      the s6-svc rc and stderr; message: `s6-svc start on
      'gateway-coder' failed (rc=111): <stderr>`.

* Factor lifecycle dispatch through `_run_svc(flag, label, name)`:
  pre-checks that the service directory exists (raises
  GatewayNotRegisteredError before invoking s6-svc), then runs
  s6-svc and translates any CalledProcessError into S6CommandError.

* `_dispatch_via_service_manager_if_s6` in `hermes_cli.gateway`
  catches both errors and prints `✗ <message>` + `sys.exit(1)`
  instead of letting the exception bubble. The dispatch path that
  used to dump a traceback at the user now gives an actionable
  one-liner.

Tests: 6 new tests for the error types and their CLI rendering;
existing lifecycle test pre-seeds the slot directory before calling
`mgr.start` etc.
This commit is contained in:
Ben 2026-05-23 15:20:41 +10:00 committed by teknium1
parent b044c1ac29
commit b28b3f51d3
No known key found for this signature in database
4 changed files with 321 additions and 28 deletions

View file

@ -5037,10 +5037,13 @@ def _dispatch_via_service_manager_if_s6(
profile defaults to the current one (resolved via ``_profile_arg``).
The s6 service slot was created either by the Phase 4 profile-create
hook or by the container-boot reconciler (cont-init.d/02-). If it
doesn't exist, ``s6-svc`` will raise CalledProcessError — caller
sees that as a normal failure path.
doesn't exist or s6 returns an error, the named errors from
:mod:`hermes_cli.service_manager` are caught and surfaced as
actionable CLI messages (no raw ``CalledProcessError`` traceback).
"""
from hermes_cli.service_manager import (
GatewayNotRegisteredError,
S6CommandError,
detect_service_manager,
get_service_manager,
)
@ -5055,14 +5058,21 @@ def _dispatch_via_service_manager_if_s6(
profile = _profile_suffix() or "default"
mgr = get_service_manager()
service_name = f"gateway-{profile}"
if action == "start":
mgr.start(service_name)
elif action == "stop":
mgr.stop(service_name)
elif action == "restart":
mgr.restart(service_name)
else:
return False
try:
if action == "start":
mgr.start(service_name)
elif action == "stop":
mgr.stop(service_name)
elif action == "restart":
mgr.restart(service_name)
else:
return False
except GatewayNotRegisteredError as exc:
print(f"{exc}")
sys.exit(1)
except S6CommandError as exc:
print(f"{exc}")
sys.exit(1)
return True

View file

@ -342,6 +342,60 @@ S6_SERVICE_PREFIX = "gateway-"
_S6_BIN_DIR = "/command"
class S6Error(RuntimeError):
"""Base error for S6ServiceManager lifecycle failures.
Concrete subclasses carry the slot name (and, where useful, the
underlying subprocess output) so the CLI can render an actionable
message instead of leaking a raw ``CalledProcessError`` traceback.
"""
def __init__(self, message: str, *, service: str | None = None) -> None:
super().__init__(message)
self.service = service
class GatewayNotRegisteredError(S6Error):
"""Raised when a lifecycle method targets a slot that doesn't exist.
Most commonly: ``hermes -p typo gateway start`` when no profile
``typo`` exists. Carries the unprefixed profile name (not the
full ``gateway-<profile>`` service-dir name) so callers can phrase
a user-facing message like "no such gateway 'typo'".
"""
def __init__(self, profile: str) -> None:
self.profile = profile
super().__init__(
f"no such gateway {profile!r}: register it with "
f"`hermes profile create {profile}` first, or pass "
"an existing profile name via `-p <name>`",
service=f"gateway-{profile}",
)
class S6CommandError(S6Error):
"""Raised when an s6 command fails for a reason other than a
missing slot e.g. permission denied on the supervise control
FIFO, or s6-svc returning a non-zero exit for an unexpected
reason. Carries the stderr from the failing command so callers
can surface it.
"""
def __init__(
self, *, service: str, action: str, returncode: int, stderr: str,
) -> None:
self.action = action
self.returncode = returncode
self.stderr = stderr
message = (
f"s6-svc {action} on {service!r} failed (rc={returncode})"
)
if stderr.strip():
message += f": {stderr.strip()}"
super().__init__(message, service=service)
class S6ServiceManager:
"""Per-profile gateway supervision via s6-overlay.
@ -446,29 +500,79 @@ class S6ServiceManager:
# -- lifecycle ---------------------------------------------------------
def start(self, name: str) -> None:
"""Bring up a registered service (``s6-svc -u``)."""
def _run_svc(self, action_flag: str, action_label: str, name: str) -> None:
"""Shared lifecycle dispatch for start / stop / restart.
Translates the two failure modes operators care about into
named errors:
* ``GatewayNotRegisteredError`` the service directory at
``<scandir>/<name>/`` doesn't exist. ``s6-svc`` would
exit non-zero with a fairly opaque message; we pre-empt
it with a clear "no such gateway 'X'" tied to the profile
name (without the ``gateway-`` prefix).
* ``S6CommandError`` anything else (EACCES on the
supervise control FIFO, timeout, etc.). Carries the
subprocess return code and stderr so callers can render
them inline.
``action_flag`` is the ``s6-svc`` flag (``-u`` / ``-d`` /
``-t``); ``action_label`` is the human verb (``start`` /
``stop`` / ``restart``) used in error messages.
"""
import subprocess
subprocess.run(
[f"{_S6_BIN_DIR}/s6-svc", "-u", str(self.scandir / name)],
check=True, capture_output=True, timeout=5,
)
service_dir = self.scandir / name
if not service_dir.is_dir():
# Strip the gateway- prefix back off so the message
# matches what the user typed on the CLI (``-p <profile>``).
profile = (
name[len(S6_SERVICE_PREFIX):]
if name.startswith(S6_SERVICE_PREFIX)
else name
)
raise GatewayNotRegisteredError(profile)
try:
subprocess.run(
[f"{_S6_BIN_DIR}/s6-svc", action_flag, str(service_dir)],
check=True, capture_output=True, text=True, timeout=5,
)
except subprocess.CalledProcessError as exc:
raise S6CommandError(
service=name,
action=action_label,
returncode=exc.returncode,
stderr=exc.stderr or "",
) from exc
def start(self, name: str) -> None:
"""Bring up a registered service (``s6-svc -u``).
Raises:
GatewayNotRegisteredError: no service directory for ``name``.
S6CommandError: s6-svc exited non-zero for any other reason
(permission denied on the supervise FIFO, timeout, etc.).
"""
self._run_svc("-u", "start", name)
def stop(self, name: str) -> None:
"""Bring down a registered service (``s6-svc -d``)."""
import subprocess
subprocess.run(
[f"{_S6_BIN_DIR}/s6-svc", "-d", str(self.scandir / name)],
check=True, capture_output=True, timeout=5,
)
"""Bring down a registered service (``s6-svc -d``).
Raises:
GatewayNotRegisteredError: no service directory for ``name``.
S6CommandError: s6-svc exited non-zero for any other reason.
"""
self._run_svc("-d", "stop", name)
def restart(self, name: str) -> None:
"""Restart a registered service (``s6-svc -t`` = SIGTERM)."""
import subprocess
subprocess.run(
[f"{_S6_BIN_DIR}/s6-svc", "-t", str(self.scandir / name)],
check=True, capture_output=True, timeout=5,
)
"""Restart a registered service (``s6-svc -t`` = SIGTERM).
Raises:
GatewayNotRegisteredError: no service directory for ``name``.
S6CommandError: s6-svc exited non-zero for any other reason.
"""
self._run_svc("-t", "restart", name)
def is_running(self, name: str) -> bool:
"""True iff ``s6-svstat`` reports the service as up."""

View file

@ -259,3 +259,77 @@ def test_dispatch_all_unknown_action_returns_false(
),
)
assert gw._dispatch_all_via_service_manager_if_s6("start") is False
# ---------------------------------------------------------------------------
# Friendly error rendering — GatewayNotRegisteredError / S6CommandError
# (PR #30136 review item I2)
# ---------------------------------------------------------------------------
def test_dispatch_renders_gateway_not_registered_friendly(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
"""`hermes -p typo gateway start` should print a clear message and
exit 1 not dump a traceback at the user."""
from hermes_cli import gateway as gw
from hermes_cli.service_manager import GatewayNotRegisteredError
class _RaisesMissing:
kind = "s6"
def start(self, name: str) -> None:
raise GatewayNotRegisteredError("typo")
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: _RaisesMissing(),
)
with pytest.raises(SystemExit) as excinfo:
gw._dispatch_via_service_manager_if_s6("start", profile="typo")
assert excinfo.value.code == 1
out = capsys.readouterr().out
assert "no such gateway 'typo'" in out
assert "hermes profile create typo" in out
# And critically: no traceback prefix.
assert "Traceback" not in out
def test_dispatch_renders_s6_command_error_friendly(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
"""An s6-svc failure (e.g. EACCES on the supervise FIFO) should
surface the stderr inline, not as an opaque traceback."""
from hermes_cli import gateway as gw
from hermes_cli.service_manager import S6CommandError
class _RaisesS6Error:
kind = "s6"
def start(self, name: str) -> None:
raise S6CommandError(
service=name,
action="start",
returncode=111,
stderr="s6-svc: fatal: Permission denied",
)
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: _RaisesS6Error(),
)
with pytest.raises(SystemExit) as excinfo:
gw._dispatch_via_service_manager_if_s6("start", profile="coder")
assert excinfo.value.code == 1
out = capsys.readouterr().out
assert "rc=111" in out
assert "Permission denied" in out
assert "Traceback" not in out

View file

@ -550,6 +550,10 @@ def test_s6_lifecycle_dispatches_to_s6_svc(
) -> None:
from hermes_cli.service_manager import S6ServiceManager
mgr = S6ServiceManager(scandir=s6_scandir)
# _run_svc now verifies the slot exists before invoking s6-svc, so
# we have to pre-seed the dir. In real use the slot is created by
# register_profile_gateway or the cont-init.d reconciler.
(s6_scandir / "gateway-coder").mkdir()
mgr.start("gateway-coder")
mgr.stop("gateway-coder")
mgr.restart("gateway-coder")
@ -558,6 +562,107 @@ def test_s6_lifecycle_dispatches_to_s6_svc(
assert flags == ["-u", "-d", "-t"]
# ---------------------------------------------------------------------------
# Lifecycle errors — friendly messages, not raw CalledProcessError
# ---------------------------------------------------------------------------
def test_lifecycle_raises_gateway_not_registered_for_missing_slot(
s6_scandir, fake_subprocess_run,
) -> None:
"""When the service slot doesn't exist, the lifecycle methods
must raise GatewayNotRegisteredError BEFORE invoking s6-svc, so
the user sees a clear 'no such gateway' message instead of an
opaque CalledProcessError stacktrace."""
from hermes_cli.service_manager import (
GatewayNotRegisteredError,
S6ServiceManager,
)
mgr = S6ServiceManager(scandir=s6_scandir)
# No gateway-typo/ directory exists — slot is missing.
with pytest.raises(GatewayNotRegisteredError) as excinfo:
mgr.start("gateway-typo")
assert excinfo.value.profile == "typo"
assert excinfo.value.service == "gateway-typo"
msg = str(excinfo.value)
assert "'typo'" in msg
assert "hermes profile create typo" in msg
# And critically: s6-svc was NOT invoked.
assert not any(c[0] == "s6-svc" for c in fake_subprocess_run)
@pytest.mark.parametrize("action,method_name", [
("start", "start"),
("stop", "stop"),
("restart", "restart"),
])
def test_all_lifecycle_methods_check_for_missing_slot(
s6_scandir,
fake_subprocess_run,
action: str,
method_name: str,
) -> None:
"""start/stop/restart all check for missing slots the same way."""
from hermes_cli.service_manager import (
GatewayNotRegisteredError,
S6ServiceManager,
)
mgr = S6ServiceManager(scandir=s6_scandir)
with pytest.raises(GatewayNotRegisteredError):
getattr(mgr, method_name)("gateway-absent")
def test_gateway_not_registered_unprefixed_service_name(s6_scandir) -> None:
"""If the caller passes a name without the 'gateway-' prefix (the
Protocol allows arbitrary service names), the error still carries
that name verbatim as the 'profile' so error messages don't
accidentally strip user-provided text."""
from hermes_cli.service_manager import (
GatewayNotRegisteredError,
S6ServiceManager,
)
mgr = S6ServiceManager(scandir=s6_scandir)
with pytest.raises(GatewayNotRegisteredError) as excinfo:
mgr.start("not-prefixed")
assert excinfo.value.profile == "not-prefixed"
def test_lifecycle_raises_s6_command_error_on_subprocess_failure(
s6_scandir, monkeypatch: pytest.MonkeyPatch,
) -> None:
"""When s6-svc itself fails (non-zero exit) — e.g. EACCES on the
supervise control FIFO the lifecycle methods translate the
CalledProcessError into a named S6CommandError carrying the
return code and stderr."""
import subprocess as _sp
from hermes_cli.service_manager import S6CommandError, S6ServiceManager
# Pre-create the slot so we reach the s6-svc call.
(s6_scandir / "gateway-coder").mkdir()
def _fail(cmd, **kw):
raise _sp.CalledProcessError(
returncode=111,
cmd=cmd,
stderr="s6-svc: fatal: unable to control supervise/control: "
"Permission denied\n",
)
monkeypatch.setattr("subprocess.run", _fail)
mgr = S6ServiceManager(scandir=s6_scandir)
with pytest.raises(S6CommandError) as excinfo:
mgr.start("gateway-coder")
assert excinfo.value.service == "gateway-coder"
assert excinfo.value.action == "start"
assert excinfo.value.returncode == 111
assert "Permission denied" in excinfo.value.stderr
assert "Permission denied" in str(excinfo.value)
assert "rc=111" in str(excinfo.value)
def test_s6_is_running_parses_svstat(
s6_scandir, monkeypatch: pytest.MonkeyPatch,
) -> None: