mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(container_boot): always register gateway-default slot
PR #30136 review caught: `hermes gateway start` (no `-p`) inside the container resolves `_profile_suffix() == ""` → service name `gateway-default`, but no such slot was ever registered. The Phase 4 profile-create hook only fired on `hermes profile create <name>`, and the root profile (which lives at the top of $HERMES_HOME, not under `profiles/`) was never one of those. So bare `hermes gateway start` landed on `s6-svc -u /run/service/gateway-default` → uncaught `CalledProcessError` → traceback to the user. Changes: 1. `reconcile_profile_gateways` now always registers a `gateway-default` slot before iterating named profiles. Its prior state is read from `$HERMES_HOME/gateway_state.json` (sibling to the profile root, not under `profiles/`); stale runtime files there are swept the same way. Auto-up only if the prior state was `running` — same rule as named profiles. 2. `S6ServiceManager._render_run_script` special-cases `profile == "default"` to emit `hermes gateway run` with NO `-p` flag. Passing `-p default` would resolve to `$HERMES_HOME/profiles/default/` — a different profile that almost certainly doesn't exist. The empty profile-suffix convention is the dispatcher's contract and the run script has to match. 3. A user-created `profiles/default/` collides with the reserved root-profile slot; the reconciler now skips it with a warning rather than producing two registrations of the same service name. Action-list ordering is stable: `default` first, then named profiles in directory order. Boot-log readers can rely on this. Tests: 8 new dedicated default-slot tests plus updates to every existing test that asserted against the action list (via the new `_named_actions` helper that drops the always-present default entry).
This commit is contained in:
parent
a1a53a5d6e
commit
b044c1ac29
3 changed files with 283 additions and 39 deletions
|
|
@ -60,44 +60,87 @@ def reconcile_profile_gateways(
|
|||
) -> list[ReconcileAction]:
|
||||
"""Recreate s6 service registrations for every persistent profile.
|
||||
|
||||
Always registers a ``gateway-default`` slot for the root profile
|
||||
(the implicit profile that lives at the top of ``$HERMES_HOME``,
|
||||
not under ``profiles/``). The dispatcher in ``hermes_cli.gateway``
|
||||
maps an empty profile suffix to ``gateway-default``, so this slot
|
||||
is what ``hermes gateway start`` (no ``-p``) targets. Without it,
|
||||
bare ``hermes gateway start`` inside the container would land on
|
||||
``s6-svc -u /run/service/gateway-default`` → uncaught
|
||||
``CalledProcessError`` → traceback to the user (PR #30136 review).
|
||||
|
||||
The default slot's prior state is read from
|
||||
``$HERMES_HOME/gateway_state.json`` (sibling to the profile root,
|
||||
not under ``profiles/``); stale runtime files there are swept the
|
||||
same way as for named profiles.
|
||||
|
||||
Args:
|
||||
hermes_home: The container's HERMES_HOME (typically /opt/data).
|
||||
Profiles live under ``<hermes_home>/profiles/<name>/``.
|
||||
Profiles live under ``<hermes_home>/profiles/<name>/``;
|
||||
the default profile lives at ``<hermes_home>`` itself.
|
||||
scandir: The s6 dynamic scandir (typically /run/service). Service
|
||||
directories are created at ``<scandir>/gateway-<profile>/``.
|
||||
dry_run: When True, walk and return the action list without
|
||||
touching the filesystem. For tests and `--dry-run` debug.
|
||||
|
||||
Returns:
|
||||
One :class:`ReconcileAction` per profile, in directory order.
|
||||
One :class:`ReconcileAction` per profile, in this order:
|
||||
``default`` first, then named profiles in directory order.
|
||||
"""
|
||||
actions: list[ReconcileAction] = []
|
||||
|
||||
# Default profile — always register, even if nothing has ever
|
||||
# populated the root profile dir. The slot exists so
|
||||
# ``hermes gateway start`` (no ``-p``) has somewhere to land;
|
||||
# auto-up only when the prior state was "running" (same rule as
|
||||
# named profiles).
|
||||
default_prior_state = _read_prior_state(hermes_home)
|
||||
default_should_start = default_prior_state in _AUTOSTART_STATES
|
||||
if not dry_run:
|
||||
_cleanup_stale_runtime_files(hermes_home)
|
||||
_register_service(scandir, "default", start=default_should_start)
|
||||
actions.append(ReconcileAction(
|
||||
profile="default",
|
||||
prior_state=default_prior_state,
|
||||
action="started" if default_should_start else "registered",
|
||||
))
|
||||
|
||||
profiles_root = hermes_home / "profiles"
|
||||
if not profiles_root.is_dir():
|
||||
return actions
|
||||
if profiles_root.is_dir():
|
||||
for entry in sorted(profiles_root.iterdir()):
|
||||
if not entry.is_dir():
|
||||
continue
|
||||
# SOUL.md is always seeded by `hermes profile create` (config.yaml
|
||||
# is not — that comes later via `hermes setup`). Use it as the
|
||||
# "real profile" marker so stray dirs (backups, manual mkdir)
|
||||
# aren't picked up.
|
||||
if not (entry / "SOUL.md").exists():
|
||||
continue
|
||||
# The "default" service name is reserved for the root
|
||||
# profile (above) — if a user has somehow created a
|
||||
# ``profiles/default/`` directory, skip it to avoid the
|
||||
# slot collision. Their gateway would still be reachable
|
||||
# via ``hermes -p default-named gateway start`` if they
|
||||
# rename the directory; we don't try to disambiguate here.
|
||||
if entry.name == "default":
|
||||
log.warning(
|
||||
"profiles/default/ exists — skipping to avoid colliding "
|
||||
"with the reserved root-profile s6 slot",
|
||||
)
|
||||
continue
|
||||
|
||||
for entry in sorted(profiles_root.iterdir()):
|
||||
if not entry.is_dir():
|
||||
continue
|
||||
# SOUL.md is always seeded by `hermes profile create` (config.yaml
|
||||
# is not — that comes later via `hermes setup`). Use it as the
|
||||
# "real profile" marker so stray dirs (backups, manual mkdir)
|
||||
# aren't picked up.
|
||||
if not (entry / "SOUL.md").exists():
|
||||
continue
|
||||
prior_state = _read_prior_state(entry)
|
||||
should_start = prior_state in _AUTOSTART_STATES
|
||||
|
||||
prior_state = _read_prior_state(entry)
|
||||
should_start = prior_state in _AUTOSTART_STATES
|
||||
if not dry_run:
|
||||
_cleanup_stale_runtime_files(entry)
|
||||
_register_service(scandir, entry.name, start=should_start)
|
||||
|
||||
if not dry_run:
|
||||
_cleanup_stale_runtime_files(entry)
|
||||
_register_service(scandir, entry.name, start=should_start)
|
||||
|
||||
actions.append(ReconcileAction(
|
||||
profile=entry.name,
|
||||
prior_state=prior_state,
|
||||
action="started" if should_start else "registered",
|
||||
))
|
||||
actions.append(ReconcileAction(
|
||||
profile=entry.name,
|
||||
prior_state=prior_state,
|
||||
action="started" if should_start else "registered",
|
||||
))
|
||||
|
||||
if not dry_run:
|
||||
_write_reconcile_log(hermes_home, actions)
|
||||
|
|
|
|||
|
|
@ -378,7 +378,19 @@ class S6ServiceManager:
|
|||
time, not Python-substituted at registration time (OQ8-C).
|
||||
2. Activates the bundled venv.
|
||||
3. Drops to the hermes user and exec's
|
||||
``hermes -p <profile> gateway run``.
|
||||
``hermes -p <profile> gateway run`` (or just ``hermes
|
||||
gateway run`` for the default profile — see below).
|
||||
|
||||
Special case: ``profile == "default"`` emits ``hermes gateway
|
||||
run`` with **no** ``-p`` flag. This is the sentinel for "the
|
||||
root HERMES_HOME profile" (the implicit profile that exists at
|
||||
the top of $HERMES_HOME, not under profiles/). It must be
|
||||
spelled this way because ``_profile_suffix()`` returns the
|
||||
empty string for the root profile, and the dispatcher in
|
||||
``hermes_cli.gateway`` maps that empty string to the
|
||||
``gateway-default`` service slot. Passing ``-p default`` here
|
||||
would instead look up ``$HERMES_HOME/profiles/default/`` — a
|
||||
completely different (and almost always nonexistent) profile.
|
||||
|
||||
Note: the ``port`` parameter is accepted for API parity with
|
||||
:meth:`register_profile_gateway` but is currently ignored — the
|
||||
|
|
@ -401,9 +413,12 @@ class S6ServiceManager:
|
|||
]
|
||||
for k, v in sorted(extra_env.items()):
|
||||
lines.append(f"export {k}={shlex.quote(v)}")
|
||||
lines.append(
|
||||
f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run"
|
||||
)
|
||||
if profile == "default":
|
||||
lines.append("exec s6-setuidgid hermes hermes gateway run")
|
||||
else:
|
||||
lines.append(
|
||||
f"exec s6-setuidgid hermes hermes -p {shlex.quote(profile)} gateway run"
|
||||
)
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
@staticmethod
|
||||
|
|
|
|||
|
|
@ -52,6 +52,31 @@ def _make_profile(
|
|||
return p
|
||||
|
||||
|
||||
def _seed_default_root(
|
||||
hermes_home: Path,
|
||||
*,
|
||||
state: str | None = None,
|
||||
with_pid: bool = False,
|
||||
) -> None:
|
||||
"""Populate gateway_state.json / stale runtime files at the
|
||||
HERMES_HOME root (the implicit default profile)."""
|
||||
if state is not None:
|
||||
(hermes_home / "gateway_state.json").write_text(json.dumps({
|
||||
"gateway_state": state, "timestamp": 1234567890,
|
||||
}))
|
||||
if with_pid:
|
||||
(hermes_home / "gateway.pid").write_text(json.dumps(
|
||||
{"pid": 99999, "host": "old-container"},
|
||||
))
|
||||
(hermes_home / "processes.json").write_text("[]")
|
||||
|
||||
|
||||
def _named_actions(actions: list[ReconcileAction]) -> list[ReconcileAction]:
|
||||
"""Drop the always-present default-profile action so tests that
|
||||
only care about named profiles can assert against a clean list."""
|
||||
return [a for a in actions if a.profile != "default"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -65,7 +90,7 @@ def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None:
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions == [ReconcileAction(
|
||||
assert _named_actions(actions) == [ReconcileAction(
|
||||
profile="coder", prior_state="running", action="started",
|
||||
)]
|
||||
svc = scandir / "gateway-coder"
|
||||
|
|
@ -84,7 +109,7 @@ def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None:
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions == [ReconcileAction(
|
||||
assert _named_actions(actions) == [ReconcileAction(
|
||||
profile="writer", prior_state="stopped", action="registered",
|
||||
)]
|
||||
# down marker tells s6-svscan to NOT start the service.
|
||||
|
|
@ -100,7 +125,8 @@ def test_startup_failed_does_not_autostart(tmp_path: Path) -> None:
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions[0].action == "registered"
|
||||
named = _named_actions(actions)
|
||||
assert named[0].action == "registered"
|
||||
assert (scandir / "gateway-broken" / "down").exists()
|
||||
|
||||
|
||||
|
|
@ -114,7 +140,8 @@ def test_starting_state_does_not_autostart(tmp_path: Path) -> None:
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions[0].action == "registered"
|
||||
named = _named_actions(actions)
|
||||
assert named[0].action == "registered"
|
||||
|
||||
|
||||
def test_stale_runtime_files_are_removed(tmp_path: Path) -> None:
|
||||
|
|
@ -143,7 +170,7 @@ def test_profile_without_state_file_is_registered_but_not_started(
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions == [ReconcileAction(
|
||||
assert _named_actions(actions) == [ReconcileAction(
|
||||
profile="fresh", prior_state=None, action="registered",
|
||||
)]
|
||||
assert (scandir / "gateway-fresh" / "down").exists()
|
||||
|
|
@ -160,7 +187,7 @@ def test_directory_without_marker_file_is_skipped(tmp_path: Path) -> None:
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions == []
|
||||
assert _named_actions(actions) == []
|
||||
assert not (scandir / "gateway-stray").exists()
|
||||
|
||||
|
||||
|
|
@ -175,7 +202,8 @@ def test_corrupt_state_file_treated_as_no_prior_state(tmp_path: Path) -> None:
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions[0].action == "registered" # not "started"
|
||||
named = _named_actions(actions)
|
||||
assert named[0].action == "registered" # not "started"
|
||||
assert (scandir / "gateway-junk" / "down").exists()
|
||||
|
||||
|
||||
|
|
@ -204,7 +232,7 @@ def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None:
|
|||
)
|
||||
|
||||
# The action list is still produced...
|
||||
assert actions == [ReconcileAction(
|
||||
assert _named_actions(actions) == [ReconcileAction(
|
||||
profile="coder", prior_state="running", action="started",
|
||||
)]
|
||||
# ...but nothing on disk was touched.
|
||||
|
|
@ -213,14 +241,23 @@ def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None:
|
|||
assert not (tmp_path / "logs" / "container-boot.log").exists()
|
||||
|
||||
|
||||
def test_missing_profiles_root_returns_empty(tmp_path: Path) -> None:
|
||||
def test_missing_profiles_root_still_registers_default_slot(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
"""When $HERMES_HOME/profiles doesn't exist (fresh install), the
|
||||
reconciliation should return an empty list without raising."""
|
||||
reconciliation should still register a gateway-default slot for
|
||||
the root profile and return without raising. Previously this
|
||||
returned an empty list; the default slot is now always present
|
||||
so `hermes gateway start` (no -p) has somewhere to land."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
assert actions == []
|
||||
assert actions == [ReconcileAction(
|
||||
profile="default", prior_state=None, action="registered",
|
||||
)]
|
||||
assert (scandir / "gateway-default").is_dir()
|
||||
assert (scandir / "gateway-default" / "down").exists()
|
||||
|
||||
|
||||
def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None:
|
||||
|
|
@ -233,3 +270,152 @@ def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None:
|
|||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default-profile slot — always registered (PR #30136 review item I1)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_default_slot_always_registered_on_empty_home(tmp_path: Path) -> None:
|
||||
"""Bare HERMES_HOME with nothing under it still produces a
|
||||
gateway-default slot (down state)."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert actions == [ReconcileAction(
|
||||
profile="default", prior_state=None, action="registered",
|
||||
)]
|
||||
svc = scandir / "gateway-default"
|
||||
assert svc.is_dir()
|
||||
assert (svc / "run").exists()
|
||||
assert (svc / "down").exists()
|
||||
|
||||
|
||||
def test_default_slot_run_script_omits_profile_flag(tmp_path: Path) -> None:
|
||||
"""The default slot's run script must NOT pass `-p default` —
|
||||
that would resolve to $HERMES_HOME/profiles/default/ instead of
|
||||
the root profile. It must call `hermes gateway run` directly."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
|
||||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
run = (scandir / "gateway-default" / "run").read_text()
|
||||
assert "hermes gateway run" in run
|
||||
assert "-p default" not in run
|
||||
assert "-p 'default'" not in run
|
||||
|
||||
|
||||
def test_default_slot_autostarts_when_root_state_running(tmp_path: Path) -> None:
|
||||
"""gateway_state.json at the HERMES_HOME root with state=running
|
||||
means the default slot auto-starts on container boot."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_seed_default_root(tmp_path, state="running")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
default_action = next(a for a in actions if a.profile == "default")
|
||||
assert default_action.prior_state == "running"
|
||||
assert default_action.action == "started"
|
||||
assert not (scandir / "gateway-default" / "down").exists()
|
||||
|
||||
|
||||
def test_default_slot_does_not_autostart_when_root_state_stopped(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_seed_default_root(tmp_path, state="stopped")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
default_action = next(a for a in actions if a.profile == "default")
|
||||
assert default_action.action == "registered"
|
||||
assert (scandir / "gateway-default" / "down").exists()
|
||||
|
||||
|
||||
def test_default_slot_does_not_autostart_when_root_state_startup_failed(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
"""Crash-loop guard applies to the default slot too."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_seed_default_root(tmp_path, state="startup_failed")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
default_action = next(a for a in actions if a.profile == "default")
|
||||
assert default_action.action == "registered"
|
||||
|
||||
|
||||
def test_default_slot_cleans_up_stale_runtime_files_at_root(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
"""gateway.pid and processes.json at the HERMES_HOME root (left
|
||||
over from the previous container's default gateway) must be
|
||||
swept the same way as for named profiles."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_seed_default_root(tmp_path, state="running", with_pid=True)
|
||||
assert (tmp_path / "gateway.pid").exists()
|
||||
|
||||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert not (tmp_path / "gateway.pid").exists()
|
||||
assert not (tmp_path / "processes.json").exists()
|
||||
|
||||
|
||||
def test_default_slot_appears_before_named_profiles(tmp_path: Path) -> None:
|
||||
"""The action list is ordered: default first, then named profiles
|
||||
in directory order. Operators and the boot-log reader rely on
|
||||
this ordering being stable."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "z-last-alphabetically", state="stopped")
|
||||
_make_profile(tmp_path, "a-first-alphabetically", state="stopped")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert [a.profile for a in actions] == [
|
||||
"default",
|
||||
"a-first-alphabetically",
|
||||
"z-last-alphabetically",
|
||||
]
|
||||
|
||||
|
||||
def test_profiles_default_subdir_is_skipped_with_warning(
|
||||
tmp_path: Path,
|
||||
caplog: pytest.LogCaptureFixture,
|
||||
) -> None:
|
||||
"""A user-created profiles/default/ collides with the reserved
|
||||
root-profile slot — the named entry is skipped (with a warning)
|
||||
so we don't double-register gateway-default."""
|
||||
import logging
|
||||
caplog.set_level(logging.WARNING)
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "default", state="running")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
# Only the root-profile default slot appears — not the colliding
|
||||
# named profile.
|
||||
default_actions = [a for a in actions if a.profile == "default"]
|
||||
assert len(default_actions) == 1
|
||||
# And the warning surfaces so operators know the named profile
|
||||
# was ignored.
|
||||
assert any(
|
||||
"profiles/default/" in record.message for record in caplog.records
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue