fix(container_boot): always register gateway-default slot

PR #30136 review caught: `hermes gateway start` (no `-p`) inside
the container resolves `_profile_suffix() == ""` → service name
`gateway-default`, but no such slot was ever registered. The Phase 4
profile-create hook only fired on `hermes profile create <name>`,
and the root profile (which lives at the top of $HERMES_HOME, not
under `profiles/`) was never one of those. So bare `hermes gateway
start` landed on `s6-svc -u /run/service/gateway-default` →
uncaught `CalledProcessError` → traceback to the user.

Changes:

1. `reconcile_profile_gateways` now always registers a
   `gateway-default` slot before iterating named profiles. Its
   prior state is read from `$HERMES_HOME/gateway_state.json`
   (sibling to the profile root, not under `profiles/`); stale
   runtime files there are swept the same way. Auto-up only if the
   prior state was `running` — same rule as named profiles.

2. `S6ServiceManager._render_run_script` special-cases
   `profile == "default"` to emit `hermes gateway run` with NO
   `-p` flag. Passing `-p default` would resolve to
   `$HERMES_HOME/profiles/default/` — a different profile that
   almost certainly doesn't exist. The empty profile-suffix
   convention is the dispatcher's contract and the run script has
   to match.

3. A user-created `profiles/default/` collides with the reserved
   root-profile slot; the reconciler now skips it with a warning
   rather than producing two registrations of the same service name.

Action-list ordering is stable: `default` first, then named
profiles in directory order. Boot-log readers can rely on this.

Tests: 8 new dedicated default-slot tests plus updates to every
existing test that asserted against the action list (via the new
`_named_actions` helper that drops the always-present default
entry).
This commit is contained in:
Ben 2026-05-23 15:16:35 +10:00
parent 04d1894f36
commit 367c15b1dc
3 changed files with 283 additions and 39 deletions

View file

@ -60,44 +60,87 @@ def reconcile_profile_gateways(
) -> list[ReconcileAction]:
"""Recreate s6 service registrations for every persistent profile.
Always registers a ``gateway-default`` slot for the root profile
(the implicit profile that lives at the top of ``$HERMES_HOME``,
not under ``profiles/``). The dispatcher in ``hermes_cli.gateway``
maps an empty profile suffix to ``gateway-default``, so this slot
is what ``hermes gateway start`` (no ``-p``) targets. Without it,
bare ``hermes gateway start`` inside the container would land on
``s6-svc -u /run/service/gateway-default`` uncaught
``CalledProcessError`` traceback to the user (PR #30136 review).
The default slot's prior state is read from
``$HERMES_HOME/gateway_state.json`` (sibling to the profile root,
not under ``profiles/``); stale runtime files there are swept the
same way as for named profiles.
Args:
hermes_home: The container's HERMES_HOME (typically /opt/data).
Profiles live under ``<hermes_home>/profiles/<name>/``.
Profiles live under ``<hermes_home>/profiles/<name>/``;
the default profile lives at ``<hermes_home>`` itself.
scandir: The s6 dynamic scandir (typically /run/service). Service
directories are created at ``<scandir>/gateway-<profile>/``.
dry_run: When True, walk and return the action list without
touching the filesystem. For tests and `--dry-run` debug.
Returns:
One :class:`ReconcileAction` per profile, in directory order.
One :class:`ReconcileAction` per profile, in this order:
``default`` first, then named profiles in directory order.
"""
actions: list[ReconcileAction] = []
# Default profile — always register, even if nothing has ever
# populated the root profile dir. The slot exists so
# ``hermes gateway start`` (no ``-p``) has somewhere to land;
# auto-up only when the prior state was "running" (same rule as
# named profiles).
default_prior_state = _read_prior_state(hermes_home)
default_should_start = default_prior_state in _AUTOSTART_STATES
if not dry_run:
_cleanup_stale_runtime_files(hermes_home)
_register_service(scandir, "default", start=default_should_start)
actions.append(ReconcileAction(
profile="default",
prior_state=default_prior_state,
action="started" if default_should_start else "registered",
))
profiles_root = hermes_home / "profiles"
if not profiles_root.is_dir():
return actions
if profiles_root.is_dir():
for entry in sorted(profiles_root.iterdir()):
if not entry.is_dir():
continue
# SOUL.md is always seeded by `hermes profile create` (config.yaml
# is not — that comes later via `hermes setup`). Use it as the
# "real profile" marker so stray dirs (backups, manual mkdir)
# aren't picked up.
if not (entry / "SOUL.md").exists():
continue
# The "default" service name is reserved for the root
# profile (above) — if a user has somehow created a
# ``profiles/default/`` directory, skip it to avoid the
# slot collision. Their gateway would still be reachable
# via ``hermes -p default-named gateway start`` if they
# rename the directory; we don't try to disambiguate here.
if entry.name == "default":
log.warning(
"profiles/default/ exists — skipping to avoid colliding "
"with the reserved root-profile s6 slot",
)
continue
for entry in sorted(profiles_root.iterdir()):
if not entry.is_dir():
continue
# SOUL.md is always seeded by `hermes profile create` (config.yaml
# is not — that comes later via `hermes setup`). Use it as the
# "real profile" marker so stray dirs (backups, manual mkdir)
# aren't picked up.
if not (entry / "SOUL.md").exists():
continue
prior_state = _read_prior_state(entry)
should_start = prior_state in _AUTOSTART_STATES
prior_state = _read_prior_state(entry)
should_start = prior_state in _AUTOSTART_STATES
if not dry_run:
_cleanup_stale_runtime_files(entry)
_register_service(scandir, entry.name, start=should_start)
if not dry_run:
_cleanup_stale_runtime_files(entry)
_register_service(scandir, entry.name, start=should_start)
actions.append(ReconcileAction(
profile=entry.name,
prior_state=prior_state,
action="started" if should_start else "registered",
))
actions.append(ReconcileAction(
profile=entry.name,
prior_state=prior_state,
action="started" if should_start else "registered",
))
if not dry_run:
_write_reconcile_log(hermes_home, actions)

View file

@ -378,7 +378,19 @@ class S6ServiceManager:
time, not Python-substituted at registration time (OQ8-C).
2. Activates the bundled venv.
3. Drops to the hermes user and exec's
``hermes -p <profile> gateway run``.
``hermes -p <profile> gateway run`` (or just ``hermes
gateway run`` for the default profile see below).
Special case: ``profile == "default"`` emits ``hermes gateway
run`` with **no** ``-p`` flag. This is the sentinel for "the
root HERMES_HOME profile" (the implicit profile that exists at
the top of $HERMES_HOME, not under profiles/). It must be
spelled this way because ``_profile_suffix()`` returns the
empty string for the root profile, and the dispatcher in
``hermes_cli.gateway`` maps that empty string to the
``gateway-default`` service slot. Passing ``-p default`` here
would instead look up ``$HERMES_HOME/profiles/default/`` a
completely different (and almost always nonexistent) profile.
Note: the ``port`` parameter is accepted for API parity with
:meth:`register_profile_gateway` but is currently ignored the
@ -401,9 +413,12 @@ class S6ServiceManager:
]
for k, v in sorted(extra_env.items()):
lines.append(f"export {k}={shlex.quote(v)}")
lines.append(
f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run"
)
if profile == "default":
lines.append("exec s6-setuidgid hermes hermes gateway run")
else:
lines.append(
f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run"
)
return "\n".join(lines) + "\n"
@staticmethod

View file

@ -52,6 +52,31 @@ def _make_profile(
return p
def _seed_default_root(
hermes_home: Path,
*,
state: str | None = None,
with_pid: bool = False,
) -> None:
"""Populate gateway_state.json / stale runtime files at the
HERMES_HOME root (the implicit default profile)."""
if state is not None:
(hermes_home / "gateway_state.json").write_text(json.dumps({
"gateway_state": state, "timestamp": 1234567890,
}))
if with_pid:
(hermes_home / "gateway.pid").write_text(json.dumps(
{"pid": 99999, "host": "old-container"},
))
(hermes_home / "processes.json").write_text("[]")
def _named_actions(actions: list[ReconcileAction]) -> list[ReconcileAction]:
"""Drop the always-present default-profile action so tests that
only care about named profiles can assert against a clean list."""
return [a for a in actions if a.profile != "default"]
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
@ -65,7 +90,7 @@ def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None:
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions == [ReconcileAction(
assert _named_actions(actions) == [ReconcileAction(
profile="coder", prior_state="running", action="started",
)]
svc = scandir / "gateway-coder"
@ -84,7 +109,7 @@ def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None:
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions == [ReconcileAction(
assert _named_actions(actions) == [ReconcileAction(
profile="writer", prior_state="stopped", action="registered",
)]
# down marker tells s6-svscan to NOT start the service.
@ -100,7 +125,8 @@ def test_startup_failed_does_not_autostart(tmp_path: Path) -> None:
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions[0].action == "registered"
named = _named_actions(actions)
assert named[0].action == "registered"
assert (scandir / "gateway-broken" / "down").exists()
@ -114,7 +140,8 @@ def test_starting_state_does_not_autostart(tmp_path: Path) -> None:
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions[0].action == "registered"
named = _named_actions(actions)
assert named[0].action == "registered"
def test_stale_runtime_files_are_removed(tmp_path: Path) -> None:
@ -143,7 +170,7 @@ def test_profile_without_state_file_is_registered_but_not_started(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions == [ReconcileAction(
assert _named_actions(actions) == [ReconcileAction(
profile="fresh", prior_state=None, action="registered",
)]
assert (scandir / "gateway-fresh" / "down").exists()
@ -160,7 +187,7 @@ def test_directory_without_marker_file_is_skipped(tmp_path: Path) -> None:
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions == []
assert _named_actions(actions) == []
assert not (scandir / "gateway-stray").exists()
@ -175,7 +202,8 @@ def test_corrupt_state_file_treated_as_no_prior_state(tmp_path: Path) -> None:
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions[0].action == "registered" # not "started"
named = _named_actions(actions)
assert named[0].action == "registered" # not "started"
assert (scandir / "gateway-junk" / "down").exists()
@ -204,7 +232,7 @@ def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None:
)
# The action list is still produced...
assert actions == [ReconcileAction(
assert _named_actions(actions) == [ReconcileAction(
profile="coder", prior_state="running", action="started",
)]
# ...but nothing on disk was touched.
@ -213,14 +241,23 @@ def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None:
assert not (tmp_path / "logs" / "container-boot.log").exists()
def test_missing_profiles_root_returns_empty(tmp_path: Path) -> None:
def test_missing_profiles_root_still_registers_default_slot(
tmp_path: Path,
) -> None:
"""When $HERMES_HOME/profiles doesn't exist (fresh install), the
reconciliation should return an empty list without raising."""
reconciliation should still register a gateway-default slot for
the root profile and return without raising. Previously this
returned an empty list; the default slot is now always present
so `hermes gateway start` (no -p) has somewhere to land."""
scandir = tmp_path / "run-service"; scandir.mkdir()
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions == []
assert actions == [ReconcileAction(
profile="default", prior_state=None, action="registered",
)]
assert (scandir / "gateway-default").is_dir()
assert (scandir / "gateway-default" / "down").exists()
def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None:
@ -233,3 +270,152 @@ def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None:
reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
# ---------------------------------------------------------------------------
# Default-profile slot — always registered (PR #30136 review item I1)
# ---------------------------------------------------------------------------
def test_default_slot_always_registered_on_empty_home(tmp_path: Path) -> None:
"""Bare HERMES_HOME with nothing under it still produces a
gateway-default slot (down state)."""
scandir = tmp_path / "run-service"; scandir.mkdir()
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert actions == [ReconcileAction(
profile="default", prior_state=None, action="registered",
)]
svc = scandir / "gateway-default"
assert svc.is_dir()
assert (svc / "run").exists()
assert (svc / "down").exists()
def test_default_slot_run_script_omits_profile_flag(tmp_path: Path) -> None:
"""The default slot's run script must NOT pass `-p default` —
that would resolve to $HERMES_HOME/profiles/default/ instead of
the root profile. It must call `hermes gateway run` directly."""
scandir = tmp_path / "run-service"; scandir.mkdir()
reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
run = (scandir / "gateway-default" / "run").read_text()
assert "hermes gateway run" in run
assert "-p default" not in run
assert "-p 'default'" not in run
def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None:
"""gateway_state.json at the HERMES_HOME root with state=running
means the default slot auto-starts on container boot."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_seed_default_root(tmp_path, state="running")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
default_action = next(a for a in actions if a.profile == "default")
assert default_action.prior_state == "running"
assert default_action.action == "started"
assert not (scandir / "gateway-default" / "down").exists()
def test_default_slot_does_not_autostart_when_root_state_stopped(
tmp_path: Path,
) -> None:
scandir = tmp_path / "run-service"; scandir.mkdir()
_seed_default_root(tmp_path, state="stopped")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
default_action = next(a for a in actions if a.profile == "default")
assert default_action.action == "registered"
assert (scandir / "gateway-default" / "down").exists()
def test_default_slot_does_not_autostart_when_root_state_startup_failed(
tmp_path: Path,
) -> None:
"""Crash-loop guard applies to the default slot too."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_seed_default_root(tmp_path, state="startup_failed")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
default_action = next(a for a in actions if a.profile == "default")
assert default_action.action == "registered"
def test_default_slot_cleans_up_stale_runtime_files_at_root(
tmp_path: Path,
) -> None:
"""gateway.pid and processes.json at the HERMES_HOME root (left
over from the previous container's default gateway) must be
swept the same way as for named profiles."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_seed_default_root(tmp_path, state="running", with_pid=True)
assert (tmp_path / "gateway.pid").exists()
reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert not (tmp_path / "gateway.pid").exists()
assert not (tmp_path / "processes.json").exists()
def test_default_slot_appears_before_named_profiles(tmp_path: Path) -> None:
"""The action list is ordered: default first, then named profiles
in directory order. Operators and the boot-log reader rely on
this ordering being stable."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "z-last-alphabetically", state="stopped")
_make_profile(tmp_path, "a-first-alphabetically", state="stopped")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert [a.profile for a in actions] == [
"default",
"a-first-alphabetically",
"z-last-alphabetically",
]
def test_profiles_default_subdir_is_skipped_with_warning(
tmp_path: Path,
caplog: pytest.LogCaptureFixture,
) -> None:
"""A user-created profiles/default/ collides with the reserved
root-profile slot the named entry is skipped (with a warning)
so we don't double-register gateway-default."""
import logging
caplog.set_level(logging.WARNING)
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "default", state="running")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
# Only the root-profile default slot appears — not the colliding
# named profile.
default_actions = [a for a in actions if a.profile == "default"]
assert len(default_actions) == 1
# And the warning surfaces so operators know the named profile
# was ignored.
assert any(
"profiles/default/" in record.message for record in caplog.records
)