fix(gateway): route --all stop/restart through s6 under container

PR #30136 review caught that `hermes gateway stop --all` and
`... restart --all` were broken under s6. The Phase 4 dispatcher was
gated on `not stop_all` (and the symmetric restart_all), so `--all`
fell through to `kill_gateway_processes(all_profiles=True)`. pkill
SIGTERMed every gateway, s6-supervise observed the crashes, and
restarted every gateway ~1s later — net effect: `--all` *kicked*
gateways instead of *stopping* them.

Add `_dispatch_all_via_service_manager_if_s6(action)` that iterates
`mgr.list_profile_gateways()` and routes stop/restart through each
service slot. s6's `want up`/`want down` flips correctly, so a
stop persists. Partial failures are surfaced per-profile with a
running success count; the host pkill path is only reached when s6
isn't in play.

`start --all` isn't a CLI surface — the helper rejects it and
returns False (host code path can take over).
This commit is contained in:
Ben 2026-05-23 15:08:17 +10:00
parent 8ae959adb6
commit efd3569739
2 changed files with 206 additions and 2 deletions

View file

@ -5088,6 +5088,57 @@ def _dispatch_via_service_manager_if_s6(
return True
def _dispatch_all_via_service_manager_if_s6(action: str) -> bool:
"""Inside a container with s6, dispatch ``--all`` lifecycle to every
registered profile gateway.
Returns True iff dispatched (caller should ``return``); False
otherwise caller continues with the host-side code path.
Without this, ``hermes gateway stop --all`` and ``... restart --all``
fall through to ``kill_gateway_processes(all_profiles=True)``, which
just ``pkill``s every gateway process. s6-supervise observes the
crash and restarts each one ~1s later so ``--all`` ends up
*kicking* every gateway instead of *stopping* it. By iterating
``list_profile_gateways()`` and sending the lifecycle command
through the service manager we get the intended semantics (s6's
``want up``/``want down`` flips correctly so supervise stays down
after a stop).
``action`` is one of ``stop`` / ``restart`` (``start --all`` isn't
a supported CLI surface).
"""
from hermes_cli.service_manager import (
detect_service_manager,
get_service_manager,
)
if detect_service_manager() != "s6":
return False
if action not in ("stop", "restart"):
return False
mgr = get_service_manager()
profiles = mgr.list_profile_gateways()
if not profiles:
print("✗ No profile gateways registered under s6")
return True
fn = mgr.stop if action == "stop" else mgr.restart
errors: list[tuple[str, Exception]] = []
for profile in profiles:
service_name = f"gateway-{profile}"
try:
fn(service_name)
except Exception as exc: # noqa: BLE001 — report and continue
errors.append((profile, exc))
succeeded = len(profiles) - len(errors)
verb = "stopped" if action == "stop" else "restarted"
if succeeded:
print(f"{verb.capitalize()} {succeeded} profile gateway(s) under s6")
for profile, exc in errors:
print(f"✗ Could not {action} gateway-{profile}: {exc}")
return True
def gateway_command(args):
"""Handle gateway subcommands."""
try:
@ -5297,7 +5348,11 @@ def _gateway_command_inner(args):
system = getattr(args, 'system', False)
# Phase 4: inside a container with s6, dispatch via the service
# manager. `--all` is left to the existing process-sweep path below.
# manager. ``--all`` iterates every registered profile gateway
# through s6 (otherwise it would fall through to ``pkill``,
# which s6-supervise observes as a crash and immediately restarts).
if stop_all and _dispatch_all_via_service_manager_if_s6("stop"):
return
if not stop_all and _dispatch_via_service_manager_if_s6("stop"):
return
@ -5371,7 +5426,12 @@ def _gateway_command_inner(args):
service_configured = False
# Phase 4: inside a container with s6, dispatch via the service
# manager (s6-svc -t restarts the supervised process).
# manager (s6-svc -t restarts the supervised process). ``--all``
# iterates every registered profile gateway through s6; without
# this it would fall through to ``pkill``, which s6-supervise
# would observe as a crash and immediately restart anyway.
if restart_all and _dispatch_all_via_service_manager_if_s6("restart"):
return
if not restart_all and _dispatch_via_service_manager_if_s6("restart"):
return

View file

@ -115,3 +115,147 @@ def test_dispatch_defaults_profile_to_default(
)
assert gw._dispatch_via_service_manager_if_s6("start") is True
assert rec.calls == [("start", "gateway-default")]
# ---------------------------------------------------------------------------
# _dispatch_all_via_service_manager_if_s6 — --all under s6
# ---------------------------------------------------------------------------
class _ListingRecorder(_CallRecorder):
"""_CallRecorder that also exposes a profile list."""
def __init__(self, profiles: list[str]) -> None:
super().__init__()
self._profiles = profiles
def list_profile_gateways(self) -> list[str]:
return list(self._profiles)
def test_dispatch_all_returns_false_on_host(
monkeypatch: pytest.MonkeyPatch,
) -> None:
from hermes_cli import gateway as gw
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "systemd",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager",
lambda: pytest.fail("manager should not be constructed on host"),
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is False
def test_dispatch_all_iterates_every_profile_on_stop(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
from hermes_cli import gateway as gw
rec = _ListingRecorder(["coder", "writer", "assistant"])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
assert rec.calls == [
("stop", "gateway-coder"),
("stop", "gateway-writer"),
("stop", "gateway-assistant"),
]
out = capsys.readouterr().out
assert "Stopped 3 profile gateway(s)" in out
def test_dispatch_all_iterates_every_profile_on_restart(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
from hermes_cli import gateway as gw
rec = _ListingRecorder(["coder", "writer"])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("restart") is True
assert rec.calls == [
("restart", "gateway-coder"),
("restart", "gateway-writer"),
]
out = capsys.readouterr().out
assert "Restarted 2 profile gateway(s)" in out
def test_dispatch_all_handles_partial_failure(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
"""A failure on one profile must not skip the others; the helper
reports each failure and the success count."""
from hermes_cli import gateway as gw
class _FailOnWriter(_ListingRecorder):
def stop(self, name: str) -> None:
if name == "gateway-writer":
raise RuntimeError("supervise FIFO permission denied")
super().stop(name)
rec = _FailOnWriter(["coder", "writer", "assistant"])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
# The two successful ones were called; writer raised before recording.
assert ("stop", "gateway-coder") in rec.calls
assert ("stop", "gateway-assistant") in rec.calls
assert ("stop", "gateway-writer") not in rec.calls
out = capsys.readouterr().out
assert "Stopped 2 profile gateway(s)" in out
assert "Could not stop gateway-writer" in out
assert "supervise FIFO permission denied" in out
def test_dispatch_all_empty_list_reports_and_returns_true(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
"""With no profile gateways registered the helper still claims the
dispatch (returns True) and prints a friendly message the host
fallback would just pkill nothing, which isn't useful inside a
container."""
from hermes_cli import gateway as gw
rec = _ListingRecorder([])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
assert rec.calls == []
assert "No profile gateways" in capsys.readouterr().out
def test_dispatch_all_unknown_action_returns_false(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""`start --all` is not a supported CLI surface; the helper must
fall through to the host code path rather than no-op."""
from hermes_cli import gateway as gw
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager",
lambda: pytest.fail(
"manager should not be constructed for unsupported --all action",
),
)
assert gw._dispatch_all_via_service_manager_if_s6("start") is False