fix(gateway): route --all stop/restart through s6 under container

PR #30136 review caught that `hermes gateway stop --all` and
`... restart --all` were broken under s6. The Phase 4 dispatcher was
gated on `not stop_all` (and the symmetric restart_all), so `--all`
fell through to `kill_gateway_processes(all_profiles=True)`. pkill
SIGTERMed every gateway, s6-supervise observed the crashes, and
restarted every gateway ~1s later — net effect: `--all` *kicked*
gateways instead of *stopping* them.

Add `_dispatch_all_via_service_manager_if_s6(action)` that iterates
`mgr.list_profile_gateways()` and routes stop/restart through each
service slot. s6's `want up`/`want down` flips correctly, so a
stop persists. Partial failures are surfaced per-profile with a
running success count; the host pkill path is only reached when s6
isn't in play.

`start --all` isn't a CLI surface — the helper rejects it and
returns False (host code path can take over).
This commit is contained in:
Ben 2026-05-23 15:08:17 +10:00
parent 8ae959adb6
commit efd3569739
2 changed files with 206 additions and 2 deletions

View file

@ -115,3 +115,147 @@ def test_dispatch_defaults_profile_to_default(
)
assert gw._dispatch_via_service_manager_if_s6("start") is True
assert rec.calls == [("start", "gateway-default")]
# ---------------------------------------------------------------------------
# _dispatch_all_via_service_manager_if_s6 — --all under s6
# ---------------------------------------------------------------------------
class _ListingRecorder(_CallRecorder):
"""_CallRecorder that also exposes a profile list."""
def __init__(self, profiles: list[str]) -> None:
super().__init__()
self._profiles = profiles
def list_profile_gateways(self) -> list[str]:
return list(self._profiles)
def test_dispatch_all_returns_false_on_host(
monkeypatch: pytest.MonkeyPatch,
) -> None:
from hermes_cli import gateway as gw
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "systemd",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager",
lambda: pytest.fail("manager should not be constructed on host"),
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is False
def test_dispatch_all_iterates_every_profile_on_stop(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
from hermes_cli import gateway as gw
rec = _ListingRecorder(["coder", "writer", "assistant"])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
assert rec.calls == [
("stop", "gateway-coder"),
("stop", "gateway-writer"),
("stop", "gateway-assistant"),
]
out = capsys.readouterr().out
assert "Stopped 3 profile gateway(s)" in out
def test_dispatch_all_iterates_every_profile_on_restart(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
from hermes_cli import gateway as gw
rec = _ListingRecorder(["coder", "writer"])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("restart") is True
assert rec.calls == [
("restart", "gateway-coder"),
("restart", "gateway-writer"),
]
out = capsys.readouterr().out
assert "Restarted 2 profile gateway(s)" in out
def test_dispatch_all_handles_partial_failure(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
"""A failure on one profile must not skip the others; the helper
reports each failure and the success count."""
from hermes_cli import gateway as gw
class _FailOnWriter(_ListingRecorder):
def stop(self, name: str) -> None:
if name == "gateway-writer":
raise RuntimeError("supervise FIFO permission denied")
super().stop(name)
rec = _FailOnWriter(["coder", "writer", "assistant"])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
# The two successful ones were called; writer raised before recording.
assert ("stop", "gateway-coder") in rec.calls
assert ("stop", "gateway-assistant") in rec.calls
assert ("stop", "gateway-writer") not in rec.calls
out = capsys.readouterr().out
assert "Stopped 2 profile gateway(s)" in out
assert "Could not stop gateway-writer" in out
assert "supervise FIFO permission denied" in out
def test_dispatch_all_empty_list_reports_and_returns_true(
monkeypatch: pytest.MonkeyPatch,
capsys: pytest.CaptureFixture,
) -> None:
"""With no profile gateways registered the helper still claims the
dispatch (returns True) and prints a friendly message the host
fallback would just pkill nothing, which isn't useful inside a
container."""
from hermes_cli import gateway as gw
rec = _ListingRecorder([])
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager", lambda: rec,
)
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
assert rec.calls == []
assert "No profile gateways" in capsys.readouterr().out
def test_dispatch_all_unknown_action_returns_false(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""`start --all` is not a supported CLI surface; the helper must
fall through to the host code path rather than no-op."""
from hermes_cli import gateway as gw
monkeypatch.setattr(
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
)
monkeypatch.setattr(
"hermes_cli.service_manager.get_service_manager",
lambda: pytest.fail(
"manager should not be constructed for unsupported --all action",
),
)
assert gw._dispatch_all_via_service_manager_if_s6("start") is False