mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-08 08:11:38 +00:00
fix(service_manager): friendly errors for missing slots and s6-svc failures
PR #30136 review caught: `S6ServiceManager.start/stop/restart` called `subprocess.run(check=True)` on `s6-svc`, so any failure surfaced as a raw `CalledProcessError` traceback. The two cases operators actually hit are: 1. The service slot doesn't exist — most commonly because the user typed a profile name wrong (`hermes -p typo gateway start`). 2. s6-svc itself fails — most commonly EACCES on the supervise control FIFO when running unprivileged. Both deserve named errors with actionable messages, not stacktraces. Changes: * Add `S6Error` base + two concrete errors in `hermes_cli.service_manager`: - `GatewayNotRegisteredError(profile)` — carries the unprefixed profile name; message: `no such gateway 'typo': register it with `hermes profile create typo` first, or pass an existing profile name via `-p <name>``. - `S6CommandError(service, action, returncode, stderr)` — carries the s6-svc rc and stderr; message: `s6-svc start on 'gateway-coder' failed (rc=111): <stderr>`. * Factor lifecycle dispatch through `_run_svc(flag, label, name)`: pre-checks that the service directory exists (raises GatewayNotRegisteredError before invoking s6-svc), then runs s6-svc and translates any CalledProcessError into S6CommandError. * `_dispatch_via_service_manager_if_s6` in `hermes_cli.gateway` catches both errors and prints `✗ <message>` + `sys.exit(1)` instead of letting the exception bubble. The dispatch path that used to dump a traceback at the user now gives an actionable one-liner. Tests: 6 new tests for the error types and their CLI rendering; existing lifecycle test pre-seeds the slot directory before calling `mgr.start` etc.
This commit is contained in:
parent
b044c1ac29
commit
b28b3f51d3
4 changed files with 321 additions and 28 deletions
|
|
@ -5037,10 +5037,13 @@ def _dispatch_via_service_manager_if_s6(
|
|||
profile defaults to the current one (resolved via ``_profile_arg``).
|
||||
The s6 service slot was created either by the Phase 4 profile-create
|
||||
hook or by the container-boot reconciler (cont-init.d/02-…). If it
|
||||
doesn't exist, ``s6-svc`` will raise CalledProcessError — caller
|
||||
sees that as a normal failure path.
|
||||
doesn't exist or s6 returns an error, the named errors from
|
||||
:mod:`hermes_cli.service_manager` are caught and surfaced as
|
||||
actionable CLI messages (no raw ``CalledProcessError`` traceback).
|
||||
"""
|
||||
from hermes_cli.service_manager import (
|
||||
GatewayNotRegisteredError,
|
||||
S6CommandError,
|
||||
detect_service_manager,
|
||||
get_service_manager,
|
||||
)
|
||||
|
|
@ -5055,14 +5058,21 @@ def _dispatch_via_service_manager_if_s6(
|
|||
profile = _profile_suffix() or "default"
|
||||
mgr = get_service_manager()
|
||||
service_name = f"gateway-{profile}"
|
||||
if action == "start":
|
||||
mgr.start(service_name)
|
||||
elif action == "stop":
|
||||
mgr.stop(service_name)
|
||||
elif action == "restart":
|
||||
mgr.restart(service_name)
|
||||
else:
|
||||
return False
|
||||
try:
|
||||
if action == "start":
|
||||
mgr.start(service_name)
|
||||
elif action == "stop":
|
||||
mgr.stop(service_name)
|
||||
elif action == "restart":
|
||||
mgr.restart(service_name)
|
||||
else:
|
||||
return False
|
||||
except GatewayNotRegisteredError as exc:
|
||||
print(f"✗ {exc}")
|
||||
sys.exit(1)
|
||||
except S6CommandError as exc:
|
||||
print(f"✗ {exc}")
|
||||
sys.exit(1)
|
||||
return True
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -342,6 +342,60 @@ S6_SERVICE_PREFIX = "gateway-"
|
|||
_S6_BIN_DIR = "/command"
|
||||
|
||||
|
||||
class S6Error(RuntimeError):
|
||||
"""Base error for S6ServiceManager lifecycle failures.
|
||||
|
||||
Concrete subclasses carry the slot name (and, where useful, the
|
||||
underlying subprocess output) so the CLI can render an actionable
|
||||
message instead of leaking a raw ``CalledProcessError`` traceback.
|
||||
"""
|
||||
|
||||
def __init__(self, message: str, *, service: str | None = None) -> None:
|
||||
super().__init__(message)
|
||||
self.service = service
|
||||
|
||||
|
||||
class GatewayNotRegisteredError(S6Error):
    """Signals that a lifecycle call targeted a slot that does not exist.

    Typical trigger: ``hermes -p typo gateway start`` when there is no
    profile called ``typo``.  The error is built from the *unprefixed*
    profile name so the rendered message reads "no such gateway 'typo'";
    the prefixed ``gateway-<profile>`` form is stored on ``service``.

    Attributes:
        profile: The profile name exactly as passed to the constructor.
    """

    def __init__(self, profile: str) -> None:
        hint = (
            f"no such gateway {profile!r}: register it with "
            f"`hermes profile create {profile}` first, or pass "
            "an existing profile name via `-p <name>`"
        )
        super().__init__(hint, service=f"gateway-{profile}")
        self.profile = profile
|
||||
|
||||
|
||||
class S6CommandError(S6Error):
    """Signals that an s6 command failed for a reason other than a missing slot.

    Examples given by the original author: permission denied on the
    supervise control FIFO, or ``s6-svc`` exiting non-zero unexpectedly.

    Attributes:
        action: Human-readable verb of the attempted operation.
        returncode: Exit status reported for the failing command.
        stderr: Captured stderr text, stored unmodified.
    """

    def __init__(
        self, *, service: str, action: str, returncode: int, stderr: str,
    ) -> None:
        self.action, self.returncode, self.stderr = action, returncode, stderr

        headline = f"s6-svc {action} on {service!r} failed (rc={returncode})"
        # Only append the stderr tail when it holds something besides
        # whitespace, so the message never ends in a dangling colon.
        detail = stderr.strip()
        text = f"{headline}: {detail}" if detail else headline
        super().__init__(text, service=service)
|
||||
|
||||
|
||||
class S6ServiceManager:
|
||||
"""Per-profile gateway supervision via s6-overlay.
|
||||
|
||||
|
|
@ -446,29 +500,79 @@ class S6ServiceManager:
|
|||
|
||||
# -- lifecycle ---------------------------------------------------------
|
||||
|
||||
def start(self, name: str) -> None:
|
||||
"""Bring up a registered service (``s6-svc -u``)."""
|
||||
def _run_svc(self, action_flag: str, action_label: str, name: str) -> None:
    """Shared lifecycle dispatch for start / stop / restart.

    Runs ``s6-svc <action_flag> <scandir>/<name>`` and converts every
    failure into one of the module's named errors so callers never see a
    raw ``subprocess`` exception.

    Args:
        action_flag: The ``s6-svc`` flag (``-u`` / ``-d`` / ``-t``).
        action_label: The human verb (``start`` / ``stop`` / ``restart``)
            used in error messages.
        name: Service directory name under ``self.scandir``.

    Raises:
        GatewayNotRegisteredError: ``<scandir>/<name>/`` is not a
            directory.  Raised *before* ``s6-svc`` is invoked, and built
            from the profile name with the service prefix removed.
        S6CommandError: ``s6-svc`` exited non-zero (carries its return
            code and stderr), did not finish within the timeout, or could
            not be launched at all.  The latter two cases have no exit
            status, so ``returncode`` is ``-1``.
    """
    import subprocess

    service_dir = self.scandir / name
    if not service_dir.is_dir():
        # Strip the service prefix back off so the message matches what
        # the user typed on the CLI (``-p <profile>``).
        raise GatewayNotRegisteredError(name.removeprefix(S6_SERVICE_PREFIX))

    timeout_s = 5
    try:
        subprocess.run(
            [f"{_S6_BIN_DIR}/s6-svc", action_flag, str(service_dir)],
            check=True, capture_output=True, text=True, timeout=timeout_s,
        )
    except subprocess.CalledProcessError as exc:
        raise S6CommandError(
            service=name,
            action=action_label,
            returncode=exc.returncode,
            stderr=exc.stderr or "",
        ) from exc
    except subprocess.TimeoutExpired as exc:
        # Previously uncaught: the timeout escaped as a raw traceback even
        # though the documented contract maps it to S6CommandError.
        partial = exc.stderr or ""
        if isinstance(partial, bytes):
            # Defensive: partial output on a timeout may arrive as bytes
            # even when text mode was requested.
            partial = partial.decode(errors="replace")
        detail = f"timed out after {timeout_s}s"
        if partial.strip():
            detail += f": {partial.strip()}"
        raise S6CommandError(
            service=name,
            action=action_label,
            returncode=-1,
            stderr=detail,
        ) from exc
    except OSError as exc:
        # s6-svc itself could not be executed (missing binary, exec
        # permission denied, ...).
        raise S6CommandError(
            service=name,
            action=action_label,
            returncode=-1,
            stderr=str(exc),
        ) from exc
|
||||
|
||||
def start(self, name: str) -> None:
    """Bring a registered service up (``s6-svc -u``).

    Thin wrapper that forwards to ``_run_svc`` with the "up" flag.

    Raises:
        GatewayNotRegisteredError: there is no service directory for
            ``name``.
        S6CommandError: ``s6-svc`` reported a failure (for example a
            non-zero exit caused by permission problems).
    """
    flag, verb = "-u", "start"
    self._run_svc(flag, verb, name)
|
||||
|
||||
def stop(self, name: str) -> None:
    """Bring down a registered service (``s6-svc -d``).

    Delegates to ``_run_svc`` so the service is signalled exactly once
    and failures surface as the module's named errors instead of a raw
    ``subprocess`` exception.

    Raises:
        GatewayNotRegisteredError: no service directory for ``name``.
        S6CommandError: ``s6-svc`` reported a failure.
    """
    self._run_svc("-d", "stop", name)
|
||||
|
||||
def restart(self, name: str) -> None:
    """Restart a registered service (``s6-svc -t`` = SIGTERM).

    Delegates to ``_run_svc`` so the service is signalled exactly once
    and failures surface as the module's named errors instead of a raw
    ``subprocess`` exception.

    Raises:
        GatewayNotRegisteredError: no service directory for ``name``.
        S6CommandError: ``s6-svc`` reported a failure.
    """
    self._run_svc("-t", "restart", name)
|
||||
|
||||
def is_running(self, name: str) -> bool:
|
||||
"""True iff ``s6-svstat`` reports the service as up."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue