mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
fix(gateway): route --all stop/restart through s6 under container
PR #30136 review caught that `hermes gateway stop --all` and `... restart --all` were broken under s6. The Phase 4 dispatcher was gated on `not stop_all` (and the symmetric restart_all), so `--all` fell through to `kill_gateway_processes(all_profiles=True)`. pkill SIGTERMed every gateway, s6-supervise observed the crashes, and restarted every gateway ~1s later — net effect: `--all` *kicked* gateways instead of *stopping* them. Add `_dispatch_all_via_service_manager_if_s6(action)` that iterates `mgr.list_profile_gateways()` and routes stop/restart through each service slot. s6's `want up`/`want down` flips correctly, so a stop persists. Partial failures are surfaced per-profile with a running success count; the host pkill path is only reached when s6 isn't in play. `start --all` isn't a CLI surface — the helper rejects it and returns False (host code path can take over).
This commit is contained in:
parent
8ae959adb6
commit
efd3569739
2 changed files with 206 additions and 2 deletions
|
|
@ -5088,6 +5088,57 @@ def _dispatch_via_service_manager_if_s6(
|
|||
return True
|
||||
|
||||
|
||||
def _dispatch_all_via_service_manager_if_s6(action: str) -> bool:
|
||||
"""Inside a container with s6, dispatch ``--all`` lifecycle to every
|
||||
registered profile gateway.
|
||||
|
||||
Returns True iff dispatched (caller should ``return``); False
|
||||
otherwise — caller continues with the host-side code path.
|
||||
|
||||
Without this, ``hermes gateway stop --all`` and ``... restart --all``
|
||||
fall through to ``kill_gateway_processes(all_profiles=True)``, which
|
||||
just ``pkill``s every gateway process. s6-supervise observes the
|
||||
crash and restarts each one ~1s later — so ``--all`` ends up
|
||||
*kicking* every gateway instead of *stopping* it. By iterating
|
||||
``list_profile_gateways()`` and sending the lifecycle command
|
||||
through the service manager we get the intended semantics (s6's
|
||||
``want up``/``want down`` flips correctly so supervise stays down
|
||||
after a stop).
|
||||
|
||||
``action`` is one of ``stop`` / ``restart`` (``start --all`` isn't
|
||||
a supported CLI surface).
|
||||
"""
|
||||
from hermes_cli.service_manager import (
|
||||
detect_service_manager,
|
||||
get_service_manager,
|
||||
)
|
||||
|
||||
if detect_service_manager() != "s6":
|
||||
return False
|
||||
if action not in ("stop", "restart"):
|
||||
return False
|
||||
mgr = get_service_manager()
|
||||
profiles = mgr.list_profile_gateways()
|
||||
if not profiles:
|
||||
print("✗ No profile gateways registered under s6")
|
||||
return True
|
||||
fn = mgr.stop if action == "stop" else mgr.restart
|
||||
errors: list[tuple[str, Exception]] = []
|
||||
for profile in profiles:
|
||||
service_name = f"gateway-{profile}"
|
||||
try:
|
||||
fn(service_name)
|
||||
except Exception as exc: # noqa: BLE001 — report and continue
|
||||
errors.append((profile, exc))
|
||||
succeeded = len(profiles) - len(errors)
|
||||
verb = "stopped" if action == "stop" else "restarted"
|
||||
if succeeded:
|
||||
print(f"✓ {verb.capitalize()} {succeeded} profile gateway(s) under s6")
|
||||
for profile, exc in errors:
|
||||
print(f"✗ Could not {action} gateway-{profile}: {exc}")
|
||||
return True
|
||||
|
||||
|
||||
def gateway_command(args):
|
||||
"""Handle gateway subcommands."""
|
||||
try:
|
||||
|
|
@ -5297,7 +5348,11 @@ def _gateway_command_inner(args):
|
|||
system = getattr(args, 'system', False)
|
||||
|
||||
# Phase 4: inside a container with s6, dispatch via the service
|
||||
# manager. `--all` is left to the existing process-sweep path below.
|
||||
# manager. ``--all`` iterates every registered profile gateway
|
||||
# through s6 (otherwise it would fall through to ``pkill``,
|
||||
# which s6-supervise observes as a crash and immediately restarts).
|
||||
if stop_all and _dispatch_all_via_service_manager_if_s6("stop"):
|
||||
return
|
||||
if not stop_all and _dispatch_via_service_manager_if_s6("stop"):
|
||||
return
|
||||
|
||||
|
|
@ -5371,7 +5426,12 @@ def _gateway_command_inner(args):
|
|||
service_configured = False
|
||||
|
||||
# Phase 4: inside a container with s6, dispatch via the service
|
||||
# manager (s6-svc -t restarts the supervised process).
|
||||
# manager (s6-svc -t restarts the supervised process). ``--all``
|
||||
# iterates every registered profile gateway through s6; without
|
||||
# this it would fall through to ``pkill``, which s6-supervise
|
||||
# would observe as a crash and immediately restart anyway.
|
||||
if restart_all and _dispatch_all_via_service_manager_if_s6("restart"):
|
||||
return
|
||||
if not restart_all and _dispatch_via_service_manager_if_s6("restart"):
|
||||
return
|
||||
|
||||
|
|
|
|||
|
|
@ -115,3 +115,147 @@ def test_dispatch_defaults_profile_to_default(
|
|||
)
|
||||
assert gw._dispatch_via_service_manager_if_s6("start") is True
|
||||
assert rec.calls == [("start", "gateway-default")]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _dispatch_all_via_service_manager_if_s6 — --all under s6
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _ListingRecorder(_CallRecorder):
|
||||
"""_CallRecorder that also exposes a profile list."""
|
||||
|
||||
def __init__(self, profiles: list[str]) -> None:
|
||||
super().__init__()
|
||||
self._profiles = profiles
|
||||
|
||||
def list_profile_gateways(self) -> list[str]:
|
||||
return list(self._profiles)
|
||||
|
||||
|
||||
def test_dispatch_all_returns_false_on_host(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
from hermes_cli import gateway as gw
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "systemd",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager",
|
||||
lambda: pytest.fail("manager should not be constructed on host"),
|
||||
)
|
||||
assert gw._dispatch_all_via_service_manager_if_s6("stop") is False
|
||||
|
||||
|
||||
def test_dispatch_all_iterates_every_profile_on_stop(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
capsys: pytest.CaptureFixture,
|
||||
) -> None:
|
||||
from hermes_cli import gateway as gw
|
||||
rec = _ListingRecorder(["coder", "writer", "assistant"])
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager", lambda: rec,
|
||||
)
|
||||
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
|
||||
assert rec.calls == [
|
||||
("stop", "gateway-coder"),
|
||||
("stop", "gateway-writer"),
|
||||
("stop", "gateway-assistant"),
|
||||
]
|
||||
out = capsys.readouterr().out
|
||||
assert "Stopped 3 profile gateway(s)" in out
|
||||
|
||||
|
||||
def test_dispatch_all_iterates_every_profile_on_restart(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
capsys: pytest.CaptureFixture,
|
||||
) -> None:
|
||||
from hermes_cli import gateway as gw
|
||||
rec = _ListingRecorder(["coder", "writer"])
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager", lambda: rec,
|
||||
)
|
||||
assert gw._dispatch_all_via_service_manager_if_s6("restart") is True
|
||||
assert rec.calls == [
|
||||
("restart", "gateway-coder"),
|
||||
("restart", "gateway-writer"),
|
||||
]
|
||||
out = capsys.readouterr().out
|
||||
assert "Restarted 2 profile gateway(s)" in out
|
||||
|
||||
|
||||
def test_dispatch_all_handles_partial_failure(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
capsys: pytest.CaptureFixture,
|
||||
) -> None:
|
||||
"""A failure on one profile must not skip the others; the helper
|
||||
reports each failure and the success count."""
|
||||
from hermes_cli import gateway as gw
|
||||
|
||||
class _FailOnWriter(_ListingRecorder):
|
||||
def stop(self, name: str) -> None:
|
||||
if name == "gateway-writer":
|
||||
raise RuntimeError("supervise FIFO permission denied")
|
||||
super().stop(name)
|
||||
|
||||
rec = _FailOnWriter(["coder", "writer", "assistant"])
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager", lambda: rec,
|
||||
)
|
||||
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
|
||||
# The two successful ones were called; writer raised before recording.
|
||||
assert ("stop", "gateway-coder") in rec.calls
|
||||
assert ("stop", "gateway-assistant") in rec.calls
|
||||
assert ("stop", "gateway-writer") not in rec.calls
|
||||
out = capsys.readouterr().out
|
||||
assert "Stopped 2 profile gateway(s)" in out
|
||||
assert "Could not stop gateway-writer" in out
|
||||
assert "supervise FIFO permission denied" in out
|
||||
|
||||
|
||||
def test_dispatch_all_empty_list_reports_and_returns_true(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
capsys: pytest.CaptureFixture,
|
||||
) -> None:
|
||||
"""With no profile gateways registered the helper still claims the
|
||||
dispatch (returns True) and prints a friendly message — the host
|
||||
fallback would just pkill nothing, which isn't useful inside a
|
||||
container."""
|
||||
from hermes_cli import gateway as gw
|
||||
rec = _ListingRecorder([])
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager", lambda: rec,
|
||||
)
|
||||
assert gw._dispatch_all_via_service_manager_if_s6("stop") is True
|
||||
assert rec.calls == []
|
||||
assert "No profile gateways" in capsys.readouterr().out
|
||||
|
||||
|
||||
def test_dispatch_all_unknown_action_returns_false(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""`start --all` is not a supported CLI surface; the helper must
|
||||
fall through to the host code path rather than no-op."""
|
||||
from hermes_cli import gateway as gw
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.detect_service_manager", lambda: "s6",
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"hermes_cli.service_manager.get_service_manager",
|
||||
lambda: pytest.fail(
|
||||
"manager should not be constructed for unsupported --all action",
|
||||
),
|
||||
)
|
||||
assert gw._dispatch_all_via_service_manager_if_s6("start") is False
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue