mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(service_manager): friendly errors for missing slots and s6-svc failures
PR #30136 review caught: `S6ServiceManager.start/stop/restart` called `subprocess.run(check=True)` on `s6-svc`, so any failure surfaced as a raw `CalledProcessError` traceback. The two cases operators actually hit are: 1. The service slot doesn't exist — most commonly because the user typed a profile name wrong (`hermes -p typo gateway start`). 2. s6-svc itself fails — most commonly EACCES on the supervise control FIFO when running unprivileged. Both deserve named errors with actionable messages, not stacktraces. Changes: * Add `S6Error` base + two concrete errors in `hermes_cli.service_manager`: - `GatewayNotRegisteredError(profile)` — carries the unprefixed profile name; message: `no such gateway 'typo': register it with `hermes profile create typo` first, or pass an existing profile name via `-p <name>``. - `S6CommandError(service, action, returncode, stderr)` — carries the s6-svc rc and stderr; message: `s6-svc start on 'gateway-coder' failed (rc=111): <stderr>`. * Factor lifecycle dispatch through `_run_svc(flag, label, name)`: pre-checks that the service directory exists (raises GatewayNotRegisteredError before invoking s6-svc), then runs s6-svc and translates any CalledProcessError into S6CommandError. * `_dispatch_via_service_manager_if_s6` in `hermes_cli.gateway` catches both errors and prints `✗ <message>` + `sys.exit(1)` instead of letting the exception bubble. The dispatch path that used to dump a traceback at the user now gives an actionable one-liner. Tests: 6 new tests for the error types and their CLI rendering; existing lifecycle test pre-seeds the slot directory before calling `mgr.start` etc.
This commit is contained in:
parent
b044c1ac29
commit
b28b3f51d3
4 changed files with 321 additions and 28 deletions
|
|
@ -5037,10 +5037,13 @@ def _dispatch_via_service_manager_if_s6(
|
|||
profile defaults to the current one (resolved via ``_profile_arg``).
|
||||
The s6 service slot was created either by the Phase 4 profile-create
|
||||
hook or by the container-boot reconciler (cont-init.d/02-…). If it
|
||||
doesn't exist, ``s6-svc`` will raise CalledProcessError — caller
|
||||
sees that as a normal failure path.
|
||||
doesn't exist or s6 returns an error, the named errors from
|
||||
:mod:`hermes_cli.service_manager` are caught and surfaced as
|
||||
actionable CLI messages (no raw ``CalledProcessError`` traceback).
|
||||
"""
|
||||
from hermes_cli.service_manager import (
|
||||
GatewayNotRegisteredError,
|
||||
S6CommandError,
|
||||
detect_service_manager,
|
||||
get_service_manager,
|
||||
)
|
||||
|
|
@ -5055,14 +5058,21 @@ def _dispatch_via_service_manager_if_s6(
|
|||
profile = _profile_suffix() or "default"
|
||||
mgr = get_service_manager()
|
||||
service_name = f"gateway-{profile}"
|
||||
if action == "start":
|
||||
mgr.start(service_name)
|
||||
elif action == "stop":
|
||||
mgr.stop(service_name)
|
||||
elif action == "restart":
|
||||
mgr.restart(service_name)
|
||||
else:
|
||||
return False
|
||||
try:
|
||||
if action == "start":
|
||||
mgr.start(service_name)
|
||||
elif action == "stop":
|
||||
mgr.stop(service_name)
|
||||
elif action == "restart":
|
||||
mgr.restart(service_name)
|
||||
else:
|
||||
return False
|
||||
except GatewayNotRegisteredError as exc:
|
||||
print(f"✗ {exc}")
|
||||
sys.exit(1)
|
||||
except S6CommandError as exc:
|
||||
print(f"✗ {exc}")
|
||||
sys.exit(1)
|
||||
return True
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -342,6 +342,60 @@ S6_SERVICE_PREFIX = "gateway-"
|
|||
_S6_BIN_DIR = "/command"
|
||||
|
||||
|
||||
class S6Error(RuntimeError):
|
||||
"""Base error for S6ServiceManager lifecycle failures.
|
||||
|
||||
Concrete subclasses carry the slot name (and, where useful, the
|
||||
underlying subprocess output) so the CLI can render an actionable
|
||||
message instead of leaking a raw ``CalledProcessError`` traceback.
|
||||
"""
|
||||
|
||||
def __init__(self, message: str, *, service: str | None = None) -> None:
|
||||
super().__init__(message)
|
||||
self.service = service
|
||||
|
||||
|
||||
class GatewayNotRegisteredError(S6Error):
|
||||
"""Raised when a lifecycle method targets a slot that doesn't exist.
|
||||
|
||||
Most commonly: ``hermes -p typo gateway start`` when no profile
|
||||
``typo`` exists. Carries the unprefixed profile name (not the
|
||||
full ``gateway-<profile>`` service-dir name) so callers can phrase
|
||||
a user-facing message like "no such gateway 'typo'".
|
||||
"""
|
||||
|
||||
def __init__(self, profile: str) -> None:
|
||||
self.profile = profile
|
||||
super().__init__(
|
||||
f"no such gateway {profile!r}: register it with "
|
||||
f"`hermes profile create {profile}` first, or pass "
|
||||
"an existing profile name via `-p <name>`",
|
||||
service=f"gateway-{profile}",
|
||||
)
|
||||
|
||||
|
||||
class S6CommandError(S6Error):
|
||||
"""Raised when an s6 command fails for a reason other than a
|
||||
missing slot — e.g. permission denied on the supervise control
|
||||
FIFO, or s6-svc returning a non-zero exit for an unexpected
|
||||
reason. Carries the stderr from the failing command so callers
|
||||
can surface it.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, *, service: str, action: str, returncode: int, stderr: str,
|
||||
) -> None:
|
||||
self.action = action
|
||||
self.returncode = returncode
|
||||
self.stderr = stderr
|
||||
message = (
|
||||
f"s6-svc {action} on {service!r} failed (rc={returncode})"
|
||||
)
|
||||
if stderr.strip():
|
||||
message += f": {stderr.strip()}"
|
||||
super().__init__(message, service=service)
|
||||
|
||||
|
||||
class S6ServiceManager:
|
||||
"""Per-profile gateway supervision via s6-overlay.
|
||||
|
||||
|
|
@ -446,29 +500,79 @@ class S6ServiceManager:
|
|||
|
||||
# -- lifecycle ---------------------------------------------------------
|
||||
|
||||
def start(self, name: str) -> None:
|
||||
"""Bring up a registered service (``s6-svc -u``)."""
|
||||
def _run_svc(self, action_flag: str, action_label: str, name: str) -> None:
|
||||
"""Shared lifecycle dispatch for start / stop / restart.
|
||||
|
||||
Translates the two failure modes operators care about into
|
||||
named errors:
|
||||
|
||||
* ``GatewayNotRegisteredError`` — the service directory at
|
||||
``<scandir>/<name>/`` doesn't exist. ``s6-svc`` would
|
||||
exit non-zero with a fairly opaque message; we pre-empt
|
||||
it with a clear "no such gateway 'X'" tied to the profile
|
||||
name (without the ``gateway-`` prefix).
|
||||
* ``S6CommandError`` — anything else (EACCES on the
|
||||
supervise control FIFO, timeout, etc.). Carries the
|
||||
subprocess return code and stderr so callers can render
|
||||
them inline.
|
||||
|
||||
``action_flag`` is the ``s6-svc`` flag (``-u`` / ``-d`` /
|
||||
``-t``); ``action_label`` is the human verb (``start`` /
|
||||
``stop`` / ``restart``) used in error messages.
|
||||
"""
|
||||
import subprocess
|
||||
subprocess.run(
|
||||
[f"{_S6_BIN_DIR}/s6-svc", "-u", str(self.scandir / name)],
|
||||
check=True, capture_output=True, timeout=5,
|
||||
)
|
||||
|
||||
service_dir = self.scandir / name
|
||||
if not service_dir.is_dir():
|
||||
# Strip the gateway- prefix back off so the message
|
||||
# matches what the user typed on the CLI (``-p <profile>``).
|
||||
profile = (
|
||||
name[len(S6_SERVICE_PREFIX):]
|
||||
if name.startswith(S6_SERVICE_PREFIX)
|
||||
else name
|
||||
)
|
||||
raise GatewayNotRegisteredError(profile)
|
||||
|
||||
try:
|
||||
subprocess.run(
|
||||
[f"{_S6_BIN_DIR}/s6-svc", action_flag, str(service_dir)],
|
||||
check=True, capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
raise S6CommandError(
|
||||
service=name,
|
||||
action=action_label,
|
||||
returncode=exc.returncode,
|
||||
stderr=exc.stderr or "",
|
||||
) from exc
|
||||
|
||||
def start(self, name: str) -> None:
|
||||
"""Bring up a registered service (``s6-svc -u``).
|
||||
|
||||
Raises:
|
||||
GatewayNotRegisteredError: no service directory for ``name``.
|
||||
S6CommandError: s6-svc exited non-zero for any other reason
|
||||
(permission denied on the supervise FIFO, timeout, etc.).
|
||||
"""
|
||||
self._run_svc("-u", "start", name)
|
||||
|
||||
def stop(self, name: str) -> None:
|
||||
"""Bring down a registered service (``s6-svc -d``)."""
|
||||
import subprocess
|
||||
subprocess.run(
|
||||
[f"{_S6_BIN_DIR}/s6-svc", "-d", str(self.scandir / name)],
|
||||
check=True, capture_output=True, timeout=5,
|
||||
)
|
||||
"""Bring down a registered service (``s6-svc -d``).
|
||||
|
||||
Raises:
|
||||
GatewayNotRegisteredError: no service directory for ``name``.
|
||||
S6CommandError: s6-svc exited non-zero for any other reason.
|
||||
"""
|
||||
self._run_svc("-d", "stop", name)
|
||||
|
||||
def restart(self, name: str) -> None:
|
||||
"""Restart a registered service (``s6-svc -t`` = SIGTERM)."""
|
||||
import subprocess
|
||||
subprocess.run(
|
||||
[f"{_S6_BIN_DIR}/s6-svc", "-t", str(self.scandir / name)],
|
||||
check=True, capture_output=True, timeout=5,
|
||||
)
|
||||
"""Restart a registered service (``s6-svc -t`` = SIGTERM).
|
||||
|
||||
Raises:
|
||||
GatewayNotRegisteredError: no service directory for ``name``.
|
||||
S6CommandError: s6-svc exited non-zero for any other reason.
|
||||
"""
|
||||
self._run_svc("-t", "restart", name)
|
||||
|
||||
def is_running(self, name: str) -> bool:
|
||||
"""True iff ``s6-svstat`` reports the service as up."""
|
||||
|
|
|
|||
|
|
@ -259,3 +259,77 @@ def test_dispatch_all_unknown_action_returns_false(
|
|||
),
|
||||
)
|
||||
assert gw._dispatch_all_via_service_manager_if_s6("start") is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Friendly error rendering — GatewayNotRegisteredError / S6CommandError
|
||||
# (PR #30136 review item I2)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_dispatch_renders_gateway_not_registered_friendly(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
capsys: pytest.CaptureFixture,
|
||||
) -> None:
|
||||
"""`hermes -p typo gateway start` should print a clear message and
|
||||
exit 1 — not dump a traceback at the user."""
|
||||
from hermes_cli import gateway as gw
|
||||
from hermes_cli.service_manager import GatewayNotRegisteredError
|
||||
|
||||
class _RaisesMissing:
|
||||
kind = "s6"
|
||||
|
||||
def start(self, name: str) -> None:
|
||||
raise GatewayNotRegisteredError("typo")
|
||||
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager", lambda: _RaisesMissing(),
|
||||
)
|
||||
|
||||
with pytest.raises(SystemExit) as excinfo:
|
||||
gw._dispatch_via_service_manager_if_s6("start", profile="typo")
|
||||
assert excinfo.value.code == 1
|
||||
out = capsys.readouterr().out
|
||||
assert "no such gateway 'typo'" in out
|
||||
assert "hermes profile create typo" in out
|
||||
# And critically: no traceback prefix.
|
||||
assert "Traceback" not in out
|
||||
|
||||
|
||||
def test_dispatch_renders_s6_command_error_friendly(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
capsys: pytest.CaptureFixture,
|
||||
) -> None:
|
||||
"""An s6-svc failure (e.g. EACCES on the supervise FIFO) should
|
||||
surface the stderr inline, not as an opaque traceback."""
|
||||
from hermes_cli import gateway as gw
|
||||
from hermes_cli.service_manager import S6CommandError
|
||||
|
||||
class _RaisesS6Error:
|
||||
kind = "s6"
|
||||
|
||||
def start(self, name: str) -> None:
|
||||
raise S6CommandError(
|
||||
service=name,
|
||||
action="start",
|
||||
returncode=111,
|
||||
stderr="s6-svc: fatal: Permission denied",
|
||||
)
|
||||
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager", lambda: _RaisesS6Error(),
|
||||
)
|
||||
|
||||
with pytest.raises(SystemExit) as excinfo:
|
||||
gw._dispatch_via_service_manager_if_s6("start", profile="coder")
|
||||
assert excinfo.value.code == 1
|
||||
out = capsys.readouterr().out
|
||||
assert "rc=111" in out
|
||||
assert "Permission denied" in out
|
||||
assert "Traceback" not in out
|
||||
|
|
|
|||
|
|
@ -550,6 +550,10 @@ def test_s6_lifecycle_dispatches_to_s6_svc(
|
|||
) -> None:
|
||||
from hermes_cli.service_manager import S6ServiceManager
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
# _run_svc now verifies the slot exists before invoking s6-svc, so
|
||||
# we have to pre-seed the dir. In real use the slot is created by
|
||||
# register_profile_gateway or the cont-init.d reconciler.
|
||||
(s6_scandir / "gateway-coder").mkdir()
|
||||
mgr.start("gateway-coder")
|
||||
mgr.stop("gateway-coder")
|
||||
mgr.restart("gateway-coder")
|
||||
|
|
@ -558,6 +562,107 @@ def test_s6_lifecycle_dispatches_to_s6_svc(
|
|||
assert flags == ["-u", "-d", "-t"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lifecycle errors — friendly messages, not raw CalledProcessError
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_lifecycle_raises_gateway_not_registered_for_missing_slot(
|
||||
s6_scandir, fake_subprocess_run,
|
||||
) -> None:
|
||||
"""When the service slot doesn't exist, the lifecycle methods
|
||||
must raise GatewayNotRegisteredError BEFORE invoking s6-svc, so
|
||||
the user sees a clear 'no such gateway' message instead of an
|
||||
opaque CalledProcessError stacktrace."""
|
||||
from hermes_cli.service_manager import (
|
||||
GatewayNotRegisteredError,
|
||||
S6ServiceManager,
|
||||
)
|
||||
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
# No gateway-typo/ directory exists — slot is missing.
|
||||
with pytest.raises(GatewayNotRegisteredError) as excinfo:
|
||||
mgr.start("gateway-typo")
|
||||
assert excinfo.value.profile == "typo"
|
||||
assert excinfo.value.service == "gateway-typo"
|
||||
msg = str(excinfo.value)
|
||||
assert "'typo'" in msg
|
||||
assert "hermes profile create typo" in msg
|
||||
# And critically: s6-svc was NOT invoked.
|
||||
assert not any(c[0] == "s6-svc" for c in fake_subprocess_run)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("action,method_name", [
|
||||
("start", "start"),
|
||||
("stop", "stop"),
|
||||
("restart", "restart"),
|
||||
])
|
||||
def test_all_lifecycle_methods_check_for_missing_slot(
|
||||
s6_scandir,
|
||||
fake_subprocess_run,
|
||||
action: str,
|
||||
method_name: str,
|
||||
) -> None:
|
||||
"""start/stop/restart all check for missing slots the same way."""
|
||||
from hermes_cli.service_manager import (
|
||||
GatewayNotRegisteredError,
|
||||
S6ServiceManager,
|
||||
)
|
||||
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
with pytest.raises(GatewayNotRegisteredError):
|
||||
getattr(mgr, method_name)("gateway-absent")
|
||||
|
||||
|
||||
def test_gateway_not_registered_unprefixed_service_name(s6_scandir) -> None:
|
||||
"""If the caller passes a name without the 'gateway-' prefix (the
|
||||
Protocol allows arbitrary service names), the error still carries
|
||||
that name verbatim as the 'profile' so error messages don't
|
||||
accidentally strip user-provided text."""
|
||||
from hermes_cli.service_manager import (
|
||||
GatewayNotRegisteredError,
|
||||
S6ServiceManager,
|
||||
)
|
||||
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
with pytest.raises(GatewayNotRegisteredError) as excinfo:
|
||||
mgr.start("not-prefixed")
|
||||
assert excinfo.value.profile == "not-prefixed"
|
||||
|
||||
|
||||
def test_lifecycle_raises_s6_command_error_on_subprocess_failure(
|
||||
s6_scandir, monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""When s6-svc itself fails (non-zero exit) — e.g. EACCES on the
|
||||
supervise control FIFO — the lifecycle methods translate the
|
||||
CalledProcessError into a named S6CommandError carrying the
|
||||
return code and stderr."""
|
||||
import subprocess as _sp
|
||||
from hermes_cli.service_manager import S6CommandError, S6ServiceManager
|
||||
|
||||
# Pre-create the slot so we reach the s6-svc call.
|
||||
(s6_scandir / "gateway-coder").mkdir()
|
||||
|
||||
def _fail(cmd, **kw):
|
||||
raise _sp.CalledProcessError(
|
||||
returncode=111,
|
||||
cmd=cmd,
|
||||
stderr="s6-svc: fatal: unable to control supervise/control: "
|
||||
"Permission denied\n",
|
||||
)
|
||||
monkeypatch.setattr("subprocess.run", _fail)
|
||||
|
||||
mgr = S6ServiceManager(scandir=s6_scandir)
|
||||
with pytest.raises(S6CommandError) as excinfo:
|
||||
mgr.start("gateway-coder")
|
||||
assert excinfo.value.service == "gateway-coder"
|
||||
assert excinfo.value.action == "start"
|
||||
assert excinfo.value.returncode == 111
|
||||
assert "Permission denied" in excinfo.value.stderr
|
||||
assert "Permission denied" in str(excinfo.value)
|
||||
assert "rc=111" in str(excinfo.value)
|
||||
|
||||
|
||||
def test_s6_is_running_parses_svstat(
|
||||
s6_scandir, monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue