fix(gateway): don't dead-end setup wizard when only system-scope unit is installed

The setup wizard dropped non-root users at a bare shell prompt when
trying to start a system-scope gateway service. Previously
_require_root_for_system_service called sys.exit(1), which the
wizard's `except Exception` guards cannot catch (SystemExit is a
BaseException). Users with a pre-existing /etc/systemd/system unit
(e.g. from an earlier `sudo hermes setup` run) hit this whenever
they re-ran `hermes setup` as a regular user.

- Convert _require_root_for_system_service to raise a typed
  SystemScopeRequiresRootError (RuntimeError subclass) instead of
  sys.exit(1). The direct CLI path (`hermes gateway install|start|stop|
  restart|uninstall` without sudo) still exits 1 cleanly via a new
  catch at the top of gateway_command, matching the existing
  UserSystemdUnavailableError pattern.
- Add _system_scope_wizard_would_need_root() pre-check and
  _print_system_scope_remediation() helper. Both setup wizards
  (hermes_cli/setup.py and hermes_cli/gateway.py::gateway_setup) now
  detect the dead-end before prompting and print actionable guidance:
  either `sudo systemctl start <service>` this time, or uninstall the
  system unit and install a per-user one.
- Defense-in-depth: all 5 wizard prompt sites also catch
  SystemScopeRequiresRootError and fall back to the remediation
  helper if the pre-check is bypassed (race, etc.).

Tests: 12 new tests in TestSystemScopeRequiresRootError,
TestSystemScopeWizardPreCheck, TestSystemScopeRemediationOutput, and
TestGatewayCommandCatchesSystemScopeError covering the exception
contract, pre-check matrix (root vs non-root, system-only vs
user-present vs none vs explicit system=True), remediation output
for each action, and the direct-CLI exit-1 path.
This commit is contained in:
Teknium 2026-05-06 14:08:29 -07:00
parent 04cf4788cc
commit 3cdbf334d5
3 changed files with 285 additions and 7 deletions

View file

@ -967,6 +967,27 @@ class UserSystemdUnavailableError(RuntimeError):
"""
class SystemScopeRequiresRootError(RuntimeError):
"""Raised when a system-scope gateway operation is attempted as non-root.
System-scope units live in ``/etc/systemd/system/`` and require root for
install / uninstall / start / stop / restart via ``systemctl``. The
previous behavior was ``sys.exit(1)`` which blew past the wizard's
``except Exception`` guards and dumped the user at a bare shell prompt
with no guidance. Raising a typed exception lets callers that can
recover (the setup wizard) print actionable remediation instead, while
``gateway_command`` still exits 1 with the same message for the direct
CLI path.
``args[0]`` carries the user-facing message, ``args[1]`` the action name.
``str(e)`` returns only the message (not the tuple repr) so format
strings like ``f"Failed: {e}"`` render cleanly.
"""
def __str__(self) -> str:
return self.args[0] if self.args else ""
def _user_dbus_socket_path() -> Path:
"""Return the expected per-user D-Bus socket path (regardless of existence)."""
xdg = os.environ.get("XDG_RUNTIME_DIR") or f"/run/user/{os.getuid()}"
@ -1382,8 +1403,10 @@ def print_systemd_scope_conflict_warning() -> None:
def _require_root_for_system_service(action: str) -> None:
if os.geteuid() != 0:
print(f"System gateway {action} requires root. Re-run with sudo.")
sys.exit(1)
raise SystemScopeRequiresRootError(
f"System gateway {action} requires root. Re-run with sudo.",
action,
)
def _system_service_identity(run_as_user: str | None = None) -> tuple[str, str, str]:
@ -1930,6 +1953,47 @@ def _select_systemd_scope(system: bool = False) -> bool:
return get_systemd_unit_path(system=True).exists() and not get_systemd_unit_path(system=False).exists()
def _system_scope_wizard_would_need_root(system: bool = False) -> bool:
"""True when the setup wizard is about to trigger a system-scope operation
as a non-root user.
Replicates the decision ``_select_systemd_scope`` makes inside
``systemd_start`` / ``systemd_restart`` / ``systemd_stop`` so the wizard
can detect the dead-end BEFORE prompting, rather than letting
``SystemScopeRequiresRootError`` propagate out and leave the user
staring at a bare shell.
"""
if os.geteuid() == 0:
return False
return _select_systemd_scope(system=system)
def _print_system_scope_remediation(action: str) -> None:
"""Print actionable remediation when the wizard skips a system-scope
prompt because the user isn't root. Keeps the wizard flowing instead of
aborting.
"""
svc = get_service_name()
print_warning(
f"Gateway is installed as a system-wide service — "
f"{action} requires root."
)
print_info(" Options:")
print_info(f" 1. {action.capitalize()} it this time:")
if action == "start":
print_info(f" sudo systemctl start {svc}")
elif action == "stop":
print_info(f" sudo systemctl stop {svc}")
elif action == "restart":
print_info(f" sudo systemctl restart {svc}")
else:
print_info(f" sudo systemctl {action} {svc}")
print_info(" 2. Switch to a per-user service (recommended for personal use):")
print_info(" sudo hermes gateway uninstall --system")
print_info(" hermes gateway install")
print_info(" hermes gateway start")
def _get_restart_drain_timeout() -> float:
"""Return the configured gateway restart drain timeout in seconds."""
raw = os.getenv("HERMES_RESTART_DRAIN_TIMEOUT", "").strip()
@ -4115,7 +4179,9 @@ def gateway_setup():
print_success("Gateway service is installed and running.")
elif service_installed:
print_warning("Gateway service is installed but not running.")
if prompt_yes_no(" Start it now?", True):
if supports_systemd_services() and _system_scope_wizard_would_need_root():
_print_system_scope_remediation("start")
elif prompt_yes_no(" Start it now?", True):
try:
if supports_systemd_services():
systemd_start()
@ -4125,6 +4191,12 @@ def gateway_setup():
print_error(" Failed to start — user systemd not reachable:")
for line in str(e).splitlines():
print(f" {line}")
except SystemScopeRequiresRootError as e:
# Defense in depth: the pre-check above should have caught
# this, but handle the race/edge case gracefully instead of
# letting the exception escape the wizard.
print_error(f" Failed to start: {e}")
_print_system_scope_remediation("start")
except subprocess.CalledProcessError as e:
print_error(f" Failed to start: {e}")
else:
@ -4174,7 +4246,9 @@ def gateway_setup():
service_running = _is_service_running()
if service_running:
if prompt_yes_no(" Restart the gateway to pick up changes?", True):
if supports_systemd_services() and _system_scope_wizard_would_need_root():
_print_system_scope_remediation("restart")
elif prompt_yes_no(" Restart the gateway to pick up changes?", True):
try:
if supports_systemd_services():
systemd_restart()
@ -4187,10 +4261,15 @@ def gateway_setup():
print_error(" Restart failed — user systemd not reachable:")
for line in str(e).splitlines():
print(f" {line}")
except SystemScopeRequiresRootError as e:
print_error(f" Restart failed: {e}")
_print_system_scope_remediation("restart")
except subprocess.CalledProcessError as e:
print_error(f" Restart failed: {e}")
elif service_installed:
if prompt_yes_no(" Start the gateway service?", True):
if supports_systemd_services() and _system_scope_wizard_would_need_root():
_print_system_scope_remediation("start")
elif prompt_yes_no(" Start the gateway service?", True):
try:
if supports_systemd_services():
systemd_start()
@ -4200,6 +4279,9 @@ def gateway_setup():
print_error(" Start failed — user systemd not reachable:")
for line in str(e).splitlines():
print(f" {line}")
except SystemScopeRequiresRootError as e:
print_error(f" Start failed: {e}")
_print_system_scope_remediation("start")
except subprocess.CalledProcessError as e:
print_error(f" Start failed: {e}")
else:
@ -4273,6 +4355,14 @@ def gateway_command(args):
for line in str(e).splitlines():
print(f" {line}")
sys.exit(1)
except SystemScopeRequiresRootError as e:
# The direct ``hermes gateway install|uninstall|start|stop|restart``
# path lands here when the user typed a system-scope action without
# sudo. Same exit code as before — just gives the wizard a way to
# intercept the same condition with friendlier guidance before the
# error is raised.
print(str(e))
sys.exit(1)
def _gateway_command_inner(args):

View file

@ -2462,6 +2462,9 @@ def setup_gateway(config: dict):
launchd_start,
launchd_restart,
UserSystemdUnavailableError,
SystemScopeRequiresRootError,
_system_scope_wizard_would_need_root,
_print_system_scope_remediation,
)
service_installed = _is_service_installed()
@ -2479,7 +2482,9 @@ def setup_gateway(config: dict):
print()
if service_running:
if prompt_yes_no(" Restart the gateway to pick up changes?", True):
if supports_systemd and _system_scope_wizard_would_need_root():
_print_system_scope_remediation("restart")
elif prompt_yes_no(" Restart the gateway to pick up changes?", True):
try:
if supports_systemd:
systemd_restart()
@ -2489,10 +2494,19 @@ def setup_gateway(config: dict):
print_error(" Restart failed — user systemd not reachable:")
for line in str(e).splitlines():
print(f" {line}")
except SystemScopeRequiresRootError as e:
# Defense in depth: the pre-check above should have
# caught this, but a race (unit file appearing mid-run)
# could still land here. Previously this exited the
# whole wizard via sys.exit(1).
print_error(f" Restart failed: {e}")
_print_system_scope_remediation("restart")
except Exception as e:
print_error(f" Restart failed: {e}")
elif service_installed:
if prompt_yes_no(" Start the gateway service?", True):
if supports_systemd and _system_scope_wizard_would_need_root():
_print_system_scope_remediation("start")
elif prompt_yes_no(" Start the gateway service?", True):
try:
if supports_systemd:
systemd_start()
@ -2502,6 +2516,9 @@ def setup_gateway(config: dict):
print_error(" Start failed — user systemd not reachable:")
for line in str(e).splitlines():
print(f" {line}")
except SystemScopeRequiresRootError as e:
print_error(f" Start failed: {e}")
_print_system_scope_remediation("start")
except Exception as e:
print_error(f" Start failed: {e}")
elif supports_service_manager:
@ -2529,6 +2546,9 @@ def setup_gateway(config: dict):
print_error(" Start failed — user systemd not reachable:")
for line in str(e).splitlines():
print(f" {line}")
except SystemScopeRequiresRootError as e:
print_error(f" Start failed: {e}")
_print_system_scope_remediation("start")
except Exception as e:
print_error(f" Start failed: {e}")
except Exception as e:

View file

@ -2177,3 +2177,171 @@ class TestSystemdInstallOffersLegacyRemoval:
assert prompt_called["count"] == 0
assert remove_called["invoked"] is False
class TestSystemScopeRequiresRootError:
"""Tests for the SystemScopeRequiresRootError replacement of sys.exit(1).
Before this change, ``_require_root_for_system_service`` called
``sys.exit(1)`` when non-root code tried a system-scope systemd
operation. The wizard's ``except Exception`` guards don't catch
``SystemExit`` (it's a ``BaseException`` subclass), so the user was
dumped at a bare shell prompt mid-setup. The fix raises a typed
exception instead, which the wizard intercepts and handles with
actionable remediation.
"""
def test_require_root_raises_when_non_root(self, monkeypatch):
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000)
with pytest.raises(gateway_cli.SystemScopeRequiresRootError) as excinfo:
gateway_cli._require_root_for_system_service("start")
assert excinfo.value.args[0] == "System gateway start requires root. Re-run with sudo."
assert excinfo.value.args[1] == "start"
# str(e) renders only the message, not the tuple repr, so that
# wizard format strings like f"Failed: {e}" print cleanly.
assert str(excinfo.value) == "System gateway start requires root. Re-run with sudo."
assert f"Failed: {excinfo.value}" == "Failed: System gateway start requires root. Re-run with sudo."
def test_require_root_noop_when_root(self, monkeypatch):
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 0)
# Should not raise, should not exit
gateway_cli._require_root_for_system_service("start")
def test_error_is_runtime_error_subclass(self):
"""Wizards use ``except Exception`` guards — the error must be a
``RuntimeError`` (catchable by ``Exception``), NOT a ``SystemExit``
(``BaseException``), so the wizard can recover from it.
"""
err = gateway_cli.SystemScopeRequiresRootError("msg", "start")
assert isinstance(err, RuntimeError)
assert isinstance(err, Exception)
assert not isinstance(err, SystemExit)
class TestSystemScopeWizardPreCheck:
"""Tests for _system_scope_wizard_would_need_root — the guard the
wizard uses to detect the dead-end BEFORE prompting the user to start
a service that will fail without sudo.
"""
@staticmethod
def _setup_units(tmp_path, monkeypatch, system_present: bool, user_present: bool):
sys_dir = tmp_path / "sys"
usr_dir = tmp_path / "usr"
sys_dir.mkdir()
usr_dir.mkdir()
if system_present:
(sys_dir / "hermes-gateway.service").write_text("[Unit]\n")
if user_present:
(usr_dir / "hermes-gateway.service").write_text("[Unit]\n")
monkeypatch.setattr(
gateway_cli,
"get_systemd_unit_path",
lambda system=False: (sys_dir if system else usr_dir) / "hermes-gateway.service",
)
def test_non_root_with_only_system_unit_returns_true(self, tmp_path, monkeypatch):
self._setup_units(tmp_path, monkeypatch, system_present=True, user_present=False)
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000)
assert gateway_cli._system_scope_wizard_would_need_root() is True
def test_root_never_needs_root(self, tmp_path, monkeypatch):
self._setup_units(tmp_path, monkeypatch, system_present=True, user_present=False)
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 0)
assert gateway_cli._system_scope_wizard_would_need_root() is False
def test_non_root_with_user_unit_present_returns_false(self, tmp_path, monkeypatch):
# User-scope unit present — user can start it themselves, no sudo needed.
self._setup_units(tmp_path, monkeypatch, system_present=True, user_present=True)
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000)
assert gateway_cli._system_scope_wizard_would_need_root() is False
def test_non_root_with_no_units_returns_false(self, tmp_path, monkeypatch):
self._setup_units(tmp_path, monkeypatch, system_present=False, user_present=False)
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000)
assert gateway_cli._system_scope_wizard_would_need_root() is False
def test_non_root_with_explicit_system_arg_returns_true(self, tmp_path, monkeypatch):
# Caller passed system=True explicitly (e.g. ``hermes gateway start --system``).
self._setup_units(tmp_path, monkeypatch, system_present=False, user_present=False)
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000)
assert gateway_cli._system_scope_wizard_would_need_root(system=True) is True
class TestSystemScopeRemediationOutput:
"""Tests for _print_system_scope_remediation — the actionable guidance
shown when the wizard detects a system-scope-only setup as non-root.
"""
def test_start_remediation_mentions_sudo_systemctl_and_uninstall(self, capsys, monkeypatch):
monkeypatch.setattr(gateway_cli, "get_service_name", lambda: "hermes-gateway")
gateway_cli._print_system_scope_remediation("start")
out = capsys.readouterr().out
assert "system-wide service" in out
assert "start requires root" in out
assert "sudo systemctl start hermes-gateway" in out
assert "sudo hermes gateway uninstall --system" in out
assert "hermes gateway install" in out
def test_restart_remediation_uses_systemctl_restart(self, capsys, monkeypatch):
monkeypatch.setattr(gateway_cli, "get_service_name", lambda: "hermes-gateway")
gateway_cli._print_system_scope_remediation("restart")
out = capsys.readouterr().out
assert "restart requires root" in out
assert "sudo systemctl restart hermes-gateway" in out
def test_stop_remediation_uses_systemctl_stop(self, capsys, monkeypatch):
monkeypatch.setattr(gateway_cli, "get_service_name", lambda: "hermes-gateway")
gateway_cli._print_system_scope_remediation("stop")
out = capsys.readouterr().out
assert "stop requires root" in out
assert "sudo systemctl stop hermes-gateway" in out
class TestGatewayCommandCatchesSystemScopeError:
"""The direct CLI path (``hermes gateway start --system`` etc.) must
still exit 1 with a clean message when non-root. The top-level
``gateway_command`` catches ``SystemScopeRequiresRootError`` and
converts it back to ``sys.exit(1)``, preserving existing CLI behavior.
"""
def test_non_root_system_start_exits_one_with_clean_message(self, tmp_path, monkeypatch, capsys):
sys_dir = tmp_path / "sys"
usr_dir = tmp_path / "usr"
sys_dir.mkdir()
usr_dir.mkdir()
(sys_dir / "hermes-gateway.service").write_text("[Unit]\n")
monkeypatch.setattr(
gateway_cli,
"get_systemd_unit_path",
lambda system=False: (sys_dir if system else usr_dir) / "hermes-gateway.service",
)
monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000)
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
monkeypatch.setattr(gateway_cli, "kill_gateway_processes", lambda **kw: 0)
args = SimpleNamespace(gateway_command="start", system=True, all=False)
with pytest.raises(SystemExit) as excinfo:
gateway_cli.gateway_command(args)
assert excinfo.value.code == 1
out = capsys.readouterr().out
# Renders the message, NOT the ``('msg', 'action')`` tuple repr
assert "System gateway start requires root. Re-run with sudo." in out
assert "('" not in out # no tuple repr leaking through