diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py index fa4fe4568c4..2cc9c306fd2 100644 --- a/hermes_cli/container_boot.py +++ b/hermes_cli/container_boot.py @@ -60,44 +60,87 @@ def reconcile_profile_gateways( ) -> list[ReconcileAction]: """Recreate s6 service registrations for every persistent profile. + Always registers a ``gateway-default`` slot for the root profile + (the implicit profile that lives at the top of ``$HERMES_HOME``, + not under ``profiles/``). The dispatcher in ``hermes_cli.gateway`` + maps an empty profile suffix to ``gateway-default``, so this slot + is what ``hermes gateway start`` (no ``-p``) targets. Without it, + bare ``hermes gateway start`` inside the container would land on + ``s6-svc -u /run/service/gateway-default`` → uncaught + ``CalledProcessError`` → traceback to the user (PR #30136 review). + + The default slot's prior state is read from + ``$HERMES_HOME/gateway_state.json`` (sibling to the profile root, + not under ``profiles/``); stale runtime files there are swept the + same way as for named profiles. + Args: hermes_home: The container's HERMES_HOME (typically /opt/data). - Profiles live under ``/profiles//``. + Profiles live under ``/profiles//``; + the default profile lives at ```` itself. scandir: The s6 dynamic scandir (typically /run/service). Service directories are created at ``/gateway-/``. dry_run: When True, walk and return the action list without touching the filesystem. For tests and `--dry-run` debug. Returns: - One :class:`ReconcileAction` per profile, in directory order. + One :class:`ReconcileAction` per profile, in this order: + ``default`` first, then named profiles in directory order. """ actions: list[ReconcileAction] = [] + + # Default profile — always register, even if nothing has ever + # populated the root profile dir. The slot exists so + # ``hermes gateway start`` (no ``-p``) has somewhere to land; + # auto-up only when the prior state was "running" (same rule as + # named profiles). + default_prior_state = _read_prior_state(hermes_home) + default_should_start = default_prior_state in _AUTOSTART_STATES + if not dry_run: + _cleanup_stale_runtime_files(hermes_home) + _register_service(scandir, "default", start=default_should_start) + actions.append(ReconcileAction( + profile="default", + prior_state=default_prior_state, + action="started" if default_should_start else "registered", + )) + profiles_root = hermes_home / "profiles" - if not profiles_root.is_dir(): - return actions + if profiles_root.is_dir(): + for entry in sorted(profiles_root.iterdir()): + if not entry.is_dir(): + continue + # SOUL.md is always seeded by `hermes profile create` (config.yaml + # is not — that comes later via `hermes setup`). Use it as the + # "real profile" marker so stray dirs (backups, manual mkdir) + # aren't picked up. + if not (entry / "SOUL.md").exists(): + continue + # The "default" service name is reserved for the root + # profile (above) — if a user has somehow created a + # ``profiles/default/`` directory, skip it to avoid the + # slot collision. Their gateway would still be reachable + # via ``hermes -p default-named gateway start`` if they + # rename the directory; we don't try to disambiguate here. + if entry.name == "default": + log.warning( + "profiles/default/ exists — skipping to avoid colliding " + "with the reserved root-profile s6 slot", + ) + continue - for entry in sorted(profiles_root.iterdir()): - if not entry.is_dir(): - continue - # SOUL.md is always seeded by `hermes profile create` (config.yaml - # is not — that comes later via `hermes setup`). Use it as the - # "real profile" marker so stray dirs (backups, manual mkdir) - # aren't picked up. - if not (entry / "SOUL.md").exists(): - continue + prior_state = _read_prior_state(entry) + should_start = prior_state in _AUTOSTART_STATES - prior_state = _read_prior_state(entry) - should_start = prior_state in _AUTOSTART_STATES + if not dry_run: + _cleanup_stale_runtime_files(entry) + _register_service(scandir, entry.name, start=should_start) - if not dry_run: - _cleanup_stale_runtime_files(entry) - _register_service(scandir, entry.name, start=should_start) - - actions.append(ReconcileAction( - profile=entry.name, - prior_state=prior_state, - action="started" if should_start else "registered", - )) + actions.append(ReconcileAction( + profile=entry.name, + prior_state=prior_state, + action="started" if should_start else "registered", + )) if not dry_run: _write_reconcile_log(hermes_home, actions) diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 18b6ef01664..461a2c98601 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -378,7 +378,19 @@ class S6ServiceManager: time, not Python-substituted at registration time (OQ8-C). 2. Activates the bundled venv. 3. Drops to the hermes user and exec's - ``hermes -p gateway run``. + ``hermes -p gateway run`` (or just ``hermes + gateway run`` for the default profile — see below). + + Special case: ``profile == "default"`` emits ``hermes gateway + run`` with **no** ``-p`` flag. This is the sentinel for "the + root HERMES_HOME profile" (the implicit profile that exists at + the top of $HERMES_HOME, not under profiles/). It must be + spelled this way because ``_profile_suffix()`` returns the + empty string for the root profile, and the dispatcher in + ``hermes_cli.gateway`` maps that empty string to the + ``gateway-default`` service slot. Passing ``-p default`` here + would instead look up ``$HERMES_HOME/profiles/default/`` — a + completely different (and almost always nonexistent) profile. Note: the ``port`` parameter is accepted for API parity with :meth:`register_profile_gateway` but is currently ignored — the @@ -401,9 +413,12 @@ class S6ServiceManager: ] for k, v in sorted(extra_env.items()): lines.append(f"export {k}={shlex.quote(v)}") - lines.append( - f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run" - ) + if profile == "default": + lines.append("exec s6-setuidgid hermes hermes gateway run") + else: + lines.append( + f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run" + ) return "\n".join(lines) + "\n" @staticmethod diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py index f0d932292c5..8272c090448 100644 --- a/tests/hermes_cli/test_container_boot.py +++ b/tests/hermes_cli/test_container_boot.py @@ -52,6 +52,31 @@ def _make_profile( return p +def _seed_default_root( + hermes_home: Path, + *, + state: str | None = None, + with_pid: bool = False, +) -> None: + """Populate gateway_state.json / stale runtime files at the + HERMES_HOME root (the implicit default profile).""" + if state is not None: + (hermes_home / "gateway_state.json").write_text(json.dumps({ + "gateway_state": state, "timestamp": 1234567890, + })) + if with_pid: + (hermes_home / "gateway.pid").write_text(json.dumps( + {"pid": 99999, "host": "old-container"}, + )) + (hermes_home / "processes.json").write_text("[]") + + +def _named_actions(actions: list[ReconcileAction]) -> list[ReconcileAction]: + """Drop the always-present default-profile action so tests that + only care about named profiles can assert against a clean list.""" + return [a for a in actions if a.profile != "default"] + + # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- @@ -65,7 +90,7 @@ def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None: hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions == [ReconcileAction( + assert _named_actions(actions) == [ReconcileAction( profile="coder", prior_state="running", action="started", )] svc = scandir / "gateway-coder" @@ -84,7 +109,7 @@ def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None: hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions == [ReconcileAction( + assert _named_actions(actions) == [ReconcileAction( profile="writer", prior_state="stopped", action="registered", )] # down marker tells s6-svscan to NOT start the service. @@ -100,7 +125,8 @@ def test_startup_failed_does_not_autostart(tmp_path: Path) -> None: hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions[0].action == "registered" + named = _named_actions(actions) + assert named[0].action == "registered" assert (scandir / "gateway-broken" / "down").exists() @@ -114,7 +140,8 @@ def test_starting_state_does_not_autostart(tmp_path: Path) -> None: hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions[0].action == "registered" + named = _named_actions(actions) + assert named[0].action == "registered" def test_stale_runtime_files_are_removed(tmp_path: Path) -> None: @@ -143,7 +170,7 @@ def test_profile_without_state_file_is_registered_but_not_started( hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions == [ReconcileAction( + assert _named_actions(actions) == [ReconcileAction( profile="fresh", prior_state=None, action="registered", )] assert (scandir / "gateway-fresh" / "down").exists() @@ -160,7 +187,7 @@ def test_directory_without_marker_file_is_skipped(tmp_path: Path) -> None: hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions == [] + assert _named_actions(actions) == [] assert not (scandir / "gateway-stray").exists() @@ -175,7 +202,8 @@ def test_corrupt_state_file_treated_as_no_prior_state(tmp_path: Path) -> None: hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions[0].action == "registered" # not "started" + named = _named_actions(actions) + assert named[0].action == "registered" # not "started" assert (scandir / "gateway-junk" / "down").exists() @@ -204,7 +232,7 @@ def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None: ) # The action list is still produced... - assert actions == [ReconcileAction( + assert _named_actions(actions) == [ReconcileAction( profile="coder", prior_state="running", action="started", )] # ...but nothing on disk was touched. @@ -213,14 +241,23 @@ def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None: assert not (tmp_path / "logs" / "container-boot.log").exists() -def test_missing_profiles_root_returns_empty(tmp_path: Path) -> None: +def test_missing_profiles_root_still_registers_default_slot( + tmp_path: Path, +) -> None: """When $HERMES_HOME/profiles doesn't exist (fresh install), the - reconciliation should return an empty list without raising.""" + reconciliation should still register a gateway-default slot for + the root profile and return without raising. Previously this + returned an empty list; the default slot is now always present + so `hermes gateway start` (no -p) has somewhere to land.""" scandir = tmp_path / "run-service"; scandir.mkdir() actions = reconcile_profile_gateways( hermes_home=tmp_path, scandir=scandir, dry_run=False, ) - assert actions == [] + assert actions == [ReconcileAction( + profile="default", prior_state=None, action="registered", + )] + assert (scandir / "gateway-default").is_dir() + assert (scandir / "gateway-default" / "down").exists() def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None: @@ -233,3 +270,152 @@ def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None: reconcile_profile_gateways( hermes_home=tmp_path, scandir=scandir, dry_run=False, ) + + +# --------------------------------------------------------------------------- +# Default-profile slot — always registered (PR #30136 review item I1) +# --------------------------------------------------------------------------- + + +def test_default_slot_always_registered_on_empty_home(tmp_path: Path) -> None: + """Bare HERMES_HOME with nothing under it still produces a + gateway-default slot (down state).""" + scandir = tmp_path / "run-service"; scandir.mkdir() + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert actions == [ReconcileAction( + profile="default", prior_state=None, action="registered", + )] + svc = scandir / "gateway-default" + assert svc.is_dir() + assert (svc / "run").exists() + assert (svc / "down").exists() + + +def test_default_slot_run_script_omits_profile_flag(tmp_path: Path) -> None: + """The default slot's run script must NOT pass `-p default` — + that would resolve to $HERMES_HOME/profiles/default/ instead of + the root profile. It must call `hermes gateway run` directly.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + run = (scandir / "gateway-default" / "run").read_text() + assert "hermes gateway run" in run + assert "-p default" not in run + assert "-p 'default'" not in run + + +def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None: + """gateway_state.json at the HERMES_HOME root with state=running + means the default slot auto-starts on container boot.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="running") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + default_action = next(a for a in actions if a.profile == "default") + assert default_action.prior_state == "running" + assert default_action.action == "started" + assert not (scandir / "gateway-default" / "down").exists() + + +def test_default_slot_does_not_autostart_when_root_state_stopped( + tmp_path: Path, +) -> None: + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="stopped") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + default_action = next(a for a in actions if a.profile == "default") + assert default_action.action == "registered" + assert (scandir / "gateway-default" / "down").exists() + + +def test_default_slot_does_not_autostart_when_root_state_startup_failed( + tmp_path: Path, +) -> None: + """Crash-loop guard applies to the default slot too.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="startup_failed") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + default_action = next(a for a in actions if a.profile == "default") + assert default_action.action == "registered" + + +def test_default_slot_cleans_up_stale_runtime_files_at_root( + tmp_path: Path, +) -> None: + """gateway.pid and processes.json at the HERMES_HOME root (left + over from the previous container's default gateway) must be + swept the same way as for named profiles.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="running", with_pid=True) + assert (tmp_path / "gateway.pid").exists() + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not (tmp_path / "gateway.pid").exists() + assert not (tmp_path / "processes.json").exists() + + +def test_default_slot_appears_before_named_profiles(tmp_path: Path) -> None: + """The action list is ordered: default first, then named profiles + in directory order. Operators and the boot-log reader rely on + this ordering being stable.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "z-last-alphabetically", state="stopped") + _make_profile(tmp_path, "a-first-alphabetically", state="stopped") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert [a.profile for a in actions] == [ + "default", + "a-first-alphabetically", + "z-last-alphabetically", + ] + + +def test_profiles_default_subdir_is_skipped_with_warning( + tmp_path: Path, + caplog: pytest.LogCaptureFixture, +) -> None: + """A user-created profiles/default/ collides with the reserved + root-profile slot — the named entry is skipped (with a warning) + so we don't double-register gateway-default.""" + import logging + caplog.set_level(logging.WARNING) + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "default", state="running") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # Only the root-profile default slot appears — not the colliding + # named profile. + default_actions = [a for a in actions if a.profile == "default"] + assert len(default_actions) == 1 + # And the warning surfaces so operators know the named profile + # was ignored. + assert any( + "profiles/default/" in record.message for record in caplog.records + )