diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index e68fac0a4f4..d3eeb757fc4 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -5066,6 +5066,57 @@ def _dispatch_via_service_manager_if_s6( return True +def _dispatch_all_via_service_manager_if_s6(action: str) -> bool: + """Inside a container with s6, dispatch ``--all`` lifecycle to every + registered profile gateway. + + Returns True iff dispatched (caller should ``return``); False + otherwise — caller continues with the host-side code path. + + Without this, ``hermes gateway stop --all`` and ``... restart --all`` + fall through to ``kill_gateway_processes(all_profiles=True)``, which + just ``pkill``s every gateway process. s6-supervise observes the + crash and restarts each one ~1s later — so ``--all`` ends up + *kicking* every gateway instead of *stopping* it. By iterating + ``list_profile_gateways()`` and sending the lifecycle command + through the service manager we get the intended semantics (s6's + ``want up``/``want down`` flips correctly so supervise stays down + after a stop). + + ``action`` is one of ``stop`` / ``restart`` (``start --all`` isn't + a supported CLI surface). + """ + from hermes_cli.service_manager import ( + detect_service_manager, + get_service_manager, + ) + + if detect_service_manager() != "s6": + return False + if action not in ("stop", "restart"): + return False + mgr = get_service_manager() + profiles = mgr.list_profile_gateways() + if not profiles: + print("✗ No profile gateways registered under s6") + return True + fn = mgr.stop if action == "stop" else mgr.restart + errors: list[tuple[str, Exception]] = [] + for profile in profiles: + service_name = f"gateway-{profile}" + try: + fn(service_name) + except Exception as exc: # noqa: BLE001 — report and continue + errors.append((profile, exc)) + succeeded = len(profiles) - len(errors) + verb = "stopped" if action == "stop" else "restarted" + if succeeded: + print(f"✓ {verb.capitalize()} {succeeded} profile gateway(s) under s6") + for profile, exc in errors: + print(f"✗ Could not {action} gateway-{profile}: {exc}") + return True + + def gateway_command(args): """Handle gateway subcommands.""" try: @@ -5275,7 +5326,11 @@ def _gateway_command_inner(args): system = getattr(args, 'system', False) # Phase 4: inside a container with s6, dispatch via the service - # manager. `--all` is left to the existing process-sweep path below. + # manager. ``--all`` iterates every registered profile gateway + # through s6 (otherwise it would fall through to ``pkill``, + # which s6-supervise observes as a crash and immediately restarts). + if stop_all and _dispatch_all_via_service_manager_if_s6("stop"): + return if not stop_all and _dispatch_via_service_manager_if_s6("stop"): return @@ -5349,7 +5404,12 @@ def _gateway_command_inner(args): service_configured = False # Phase 4: inside a container with s6, dispatch via the service - # manager (s6-svc -t restarts the supervised process). + # manager (s6-svc -t restarts the supervised process). ``--all`` + # iterates every registered profile gateway through s6; without + # this it would fall through to ``pkill``, which s6-supervise + # would observe as a crash and immediately restart anyway. + if restart_all and _dispatch_all_via_service_manager_if_s6("restart"): + return if not restart_all and _dispatch_via_service_manager_if_s6("restart"): return diff --git a/tests/hermes_cli/test_gateway_s6_dispatch.py b/tests/hermes_cli/test_gateway_s6_dispatch.py index 6516f85eab2..e4a1969d3fd 100644 --- a/tests/hermes_cli/test_gateway_s6_dispatch.py +++ b/tests/hermes_cli/test_gateway_s6_dispatch.py @@ -115,3 +115,147 @@ def test_dispatch_defaults_profile_to_default( ) assert gw._dispatch_via_service_manager_if_s6("start") is True assert rec.calls == [("start", "gateway-default")] + + +# --------------------------------------------------------------------------- +# _dispatch_all_via_service_manager_if_s6 — --all under s6 +# --------------------------------------------------------------------------- + + +class _ListingRecorder(_CallRecorder): + """_CallRecorder that also exposes a profile list.""" + + def __init__(self, profiles: list[str]) -> None: + super().__init__() + self._profiles = profiles + + def list_profile_gateways(self) -> list[str]: + return list(self._profiles) + + +def test_dispatch_all_returns_false_on_host( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from hermes_cli import gateway as gw + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "systemd", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: pytest.fail("manager should not be constructed on host"), + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is False + + +def test_dispatch_all_iterates_every_profile_on_stop( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + from hermes_cli import gateway as gw + rec = _ListingRecorder(["coder", "writer", "assistant"]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is True + assert rec.calls == [ + ("stop", "gateway-coder"), + ("stop", "gateway-writer"), + ("stop", "gateway-assistant"), + ] + out = capsys.readouterr().out + assert "Stopped 3 profile gateway(s)" in out + + +def test_dispatch_all_iterates_every_profile_on_restart( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + from hermes_cli import gateway as gw + rec = _ListingRecorder(["coder", "writer"]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("restart") is True + assert rec.calls == [ + ("restart", "gateway-coder"), + ("restart", "gateway-writer"), + ] + out = capsys.readouterr().out + assert "Restarted 2 profile gateway(s)" in out + + +def test_dispatch_all_handles_partial_failure( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """A failure on one profile must not skip the others; the helper + reports each failure and the success count.""" + from hermes_cli import gateway as gw + + class _FailOnWriter(_ListingRecorder): + def stop(self, name: str) -> None: + if name == "gateway-writer": + raise RuntimeError("supervise FIFO permission denied") + super().stop(name) + + rec = _FailOnWriter(["coder", "writer", "assistant"]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is True + # The two successful ones were called; writer raised before recording. + assert ("stop", "gateway-coder") in rec.calls + assert ("stop", "gateway-assistant") in rec.calls + assert ("stop", "gateway-writer") not in rec.calls + out = capsys.readouterr().out + assert "Stopped 2 profile gateway(s)" in out + assert "Could not stop gateway-writer" in out + assert "supervise FIFO permission denied" in out + + +def test_dispatch_all_empty_list_reports_and_returns_true( + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture, +) -> None: + """With no profile gateways registered the helper still claims the + dispatch (returns True) and prints a friendly message — the host + fallback would just pkill nothing, which isn't useful inside a + container.""" + from hermes_cli import gateway as gw + rec = _ListingRecorder([]) + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + assert gw._dispatch_all_via_service_manager_if_s6("stop") is True + assert rec.calls == [] + assert "No profile gateways" in capsys.readouterr().out + + +def test_dispatch_all_unknown_action_returns_false( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """`start --all` is not a supported CLI surface; the helper must + fall through to the host code path rather than no-op.""" + from hermes_cli import gateway as gw + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", lambda: "s6", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", + lambda: pytest.fail( + "manager should not be constructed for unsupported --all action", + ), + ) + assert gw._dispatch_all_via_service_manager_if_s6("start") is False