feat(gateway): multiplex phase 4 — lifecycle guard + per-profile observability

- _guard_named_profile_under_multiplexer: when the default gateway is running
  with gateway.multiplex_profiles=on, a named-profile 'hermes gateway run'
  hard-errors (pointing at the multiplexer) instead of double-binding that
  profile's platforms. Inert unless all of the following hold: this invocation
  is a named profile, a default-profile gateway is alive, and its config has
  multiplexing on. --force overrides. Wired into run_gateway's guard chain.
- write_runtime_status gains served_profiles: the secondary-adapter startup
  records [active] + multiplexed profiles into runtime_status.json so
  'hermes status' can show per-profile coverage without a second probe. Absent
  for single-profile gateways.

Tests: served_profiles round-trips and is absent by default; guard is inert for
the default profile / under --force / when no default gateway is running.
This commit is contained in:
Ben Barclay 2026-06-18 16:12:06 +10:00 committed by Teknium
parent d5d02eabb0
commit 1e70df5fdd
4 changed files with 151 additions and 0 deletions

View file

@ -6735,6 +6735,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
"Failed to start adapters for profile '%s': %s",
profile_name, e, exc_info=True,
)
# Record served profiles in runtime status for `hermes status`.
try:
from gateway.status import write_runtime_status
served = [active] + sorted(self._profile_adapters.keys())
write_runtime_status(served_profiles=served)
except Exception:
logger.debug("could not record served_profiles", exc_info=True)
return connected
async def _start_one_profile_adapters(

View file

@ -575,6 +575,7 @@ def write_runtime_status(
platform_state: Any = _UNSET,
error_code: Any = _UNSET,
error_message: Any = _UNSET,
served_profiles: Any = _UNSET,
) -> None:
"""Persist gateway runtime health information for diagnostics/status."""
path = _get_runtime_status_path()
@ -595,6 +596,11 @@ def write_runtime_status(
payload["restart_requested"] = bool(restart_requested)
if active_agents is not _UNSET:
payload["active_agents"] = max(0, int(active_agents))
if served_profiles is not _UNSET:
# Profiles this gateway multiplexes (multi-profile mode). Absent/empty
# for a single-profile gateway. Lets `hermes status` show per-profile
# coverage without a second probe.
payload["served_profiles"] = list(served_profiles or [])
if platform is not _UNSET:
platform_payload = payload["platforms"].get(platform, {})

View file

@ -3851,6 +3851,86 @@ def _running_under_gateway_supervisor() -> bool:
return False
def _guard_named_profile_under_multiplexer(force: bool = False) -> None:
"""Refuse a named-profile gateway when a multiplexer is already serving it.
When the default profile's gateway runs with gateway.multiplex_profiles=on,
it is the sole inbound process for EVERY profile on the host. Starting a
separate gateway for a named profile would double-bind that profile's
platforms (two pollers on one bot token, port fights). In that mode a
named-profile ``hermes gateway run`` is always a misconfiguration, so we
hard-error with a pointer to the multiplexer. ``--force`` overrides.
Inert unless ALL of: (a) this invocation is a named profile, (b) a default-
profile gateway is running, (c) that gateway's config has multiplexing on.
"""
if force:
return
# (a) Are we a named profile? Default/custom-hash homes return "".
try:
suffix = _profile_suffix()
except Exception:
return
if not suffix:
return # default profile (or unrecognized) — this guard doesn't apply
try:
from hermes_constants import get_default_hermes_root
default_root = get_default_hermes_root()
# (b) Is the default-profile gateway running?
from gateway.status import get_running_pid as _default_running_pid # noqa
except Exception:
return
try:
import yaml as _yaml
from gateway.status import _read_pid_record # type: ignore
# (b) default gateway PID file present + alive
default_pid_path = default_root / "gateway.pid"
rec = _read_pid_record(default_pid_path)
if not rec:
return
from gateway.status import _pid_exists, _pid_from_record
pid = _pid_from_record(rec)
if not pid or not _pid_exists(pid):
return
# (c) default config has multiplexing on
cfg_path = default_root / "config.yaml"
if not cfg_path.exists():
return
with open(cfg_path, encoding="utf-8") as f:
cfg = _yaml.safe_load(f) or {}
multiplex = bool(
cfg.get("multiplex_profiles")
or (cfg.get("gateway", {}) or {}).get("multiplex_profiles")
)
if not multiplex:
return
except Exception:
logger.debug("Multiplexer-conflict probe failed", exc_info=True)
return
print_error(
f"The default gateway is running as a profile multiplexer and already "
f"serves profile '{suffix}'."
)
print(
" When gateway.multiplex_profiles is on, the default gateway is the\n"
" single inbound process for every profile. Starting a separate\n"
" gateway for this profile would double-bind its platforms (two\n"
" pollers on one bot token, port conflicts).\n"
)
print(" Manage the multiplexer instead (from the default profile):")
print()
print(" hermes gateway restart")
print()
print(" Pass --force to start a separate profile gateway anyway (not")
print(" recommended while the multiplexer is running).")
sys.exit(1)
def _guard_supervised_gateway_conflict(force: bool = False) -> None:
"""Refuse a foreground gateway when a service manager already supervises one.
@ -3963,6 +4043,7 @@ def run_gateway(verbose: int = 0, quiet: bool = False, replace: bool = False, fo
systemd/launchd service is already supervising this profile.
"""
_guard_official_docker_root_gateway()
_guard_named_profile_under_multiplexer(force=force)
_guard_supervised_gateway_conflict(force=force)
_guard_existing_gateway_process_conflict(replace=replace)
sys.path.insert(0, str(PROJECT_ROOT))

View file

@ -0,0 +1,55 @@
"""Phase 4: lifecycle guard + per-profile observability."""
import pytest
class TestServedProfilesStatus:
    """Round-trip checks for the ``served_profiles`` runtime-status field."""

    def test_write_and_read_served_profiles(self, tmp_path, monkeypatch):
        """A written ``served_profiles`` list is read back unchanged."""
        import importlib

        # Point the status module at an isolated home before (re)loading it.
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        import gateway.status as status

        importlib.reload(status)
        expected = ["default", "coder"]
        try:
            status.write_runtime_status(
                gateway_state="running",
                served_profiles=list(expected),
            )
            snapshot = status.read_runtime_status()
            assert snapshot.get("served_profiles") == expected
        finally:
            # Reload again on the way out, pass or fail.
            importlib.reload(status)

    def test_served_profiles_absent_by_default(self, tmp_path, monkeypatch):
        """Without the argument, no ``served_profiles`` key is persisted."""
        import importlib

        # Point the status module at an isolated home before (re)loading it.
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        import gateway.status as status

        importlib.reload(status)
        try:
            status.write_runtime_status(gateway_state="running")
            snapshot = status.read_runtime_status()
            assert "served_profiles" not in snapshot
        finally:
            # Reload again on the way out, pass or fail.
            importlib.reload(status)
class TestNamedProfileMultiplexerGuard:
    """_guard_named_profile_under_multiplexer is inert unless all conditions hold.

    Each "inert" test arms every *other* condition, so the test only passes
    because of the one condition it is named after. One test also covers the
    path where the guard actually fires.
    """

    @staticmethod
    def _arm_multiplexer(monkeypatch, root, *, multiplex=True):
        """Fake a live default-profile gateway rooted at *root*.

        Patches the helpers the guard imports at call time so that the PID
        record is present and alive, then writes a ``config.yaml`` whose
        ``gateway.multiplex_profiles`` flag is set to *multiplex*.
        """
        monkeypatch.setattr(
            "hermes_constants.get_default_hermes_root", lambda: root
        )
        monkeypatch.setattr(
            "gateway.status._read_pid_record", lambda *a, **k: {"pid": 4242}
        )
        monkeypatch.setattr(
            "gateway.status._pid_from_record", lambda *a, **k: 4242
        )
        monkeypatch.setattr("gateway.status._pid_exists", lambda *a, **k: True)
        flag = "true" if multiplex else "false"
        (root / "config.yaml").write_text(
            f"gateway:\n  multiplex_profiles: {flag}\n", encoding="utf-8"
        )

    def test_inert_for_default_profile(self, monkeypatch, tmp_path):
        from hermes_cli import gateway as gw

        self._arm_multiplexer(monkeypatch, tmp_path)
        monkeypatch.setattr(gw, "_profile_suffix", lambda: "")
        # Should return without raising (default profile => guard N/A).
        gw._guard_named_profile_under_multiplexer(force=False)

    def test_force_bypasses(self, monkeypatch, tmp_path):
        from hermes_cli import gateway as gw

        # Everything says "block", so only force=True can let this through.
        self._arm_multiplexer(monkeypatch, tmp_path)
        monkeypatch.setattr(gw, "_profile_suffix", lambda: "coder")
        gw._guard_named_profile_under_multiplexer(force=True)

    def test_inert_when_no_default_gateway_running(self, monkeypatch, tmp_path):
        from hermes_cli import gateway as gw

        monkeypatch.setattr(gw, "_profile_suffix", lambda: "coder")
        monkeypatch.setattr(
            "hermes_constants.get_default_hermes_root", lambda: tmp_path
        )
        # No gateway.pid in tmp_path => no running default gateway => no raise.
        gw._guard_named_profile_under_multiplexer(force=False)

    def test_inert_when_multiplexing_off(self, monkeypatch, tmp_path):
        from hermes_cli import gateway as gw

        # Default gateway is alive, but its config has multiplexing disabled.
        self._arm_multiplexer(monkeypatch, tmp_path, multiplex=False)
        monkeypatch.setattr(gw, "_profile_suffix", lambda: "coder")
        gw._guard_named_profile_under_multiplexer(force=False)

    def test_blocks_named_profile_when_multiplexer_running(
        self, monkeypatch, tmp_path
    ):
        from hermes_cli import gateway as gw

        # Named profile + live default gateway + multiplexing on => hard error.
        self._arm_multiplexer(monkeypatch, tmp_path)
        monkeypatch.setattr(gw, "_profile_suffix", lambda: "coder")
        with pytest.raises(SystemExit) as excinfo:
            gw._guard_named_profile_under_multiplexer(force=False)
        assert excinfo.value.code == 1