mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
Phase 4 of the s6-overlay supervision plan. Activates the Phase 3
S6ServiceManager by hooking it into the profile lifecycle and the
`hermes gateway start/stop/restart` dispatcher, and adds a
cont-init.d-time reconciliation pass that survives `docker restart`.
Task 4.0 — container-boot reconciliation:
/run/service/ is tmpfs, so every `docker restart` wipes every
per-profile gateway slot. /etc/cont-init.d/02-reconcile-profiles
invokes hermes_cli.container_boot.reconcile_profile_gateways() on
every boot, which walks $HERMES_HOME/profiles/<name>/, reads each
gateway_state.json, recreates the s6 service slot, and auto-starts
only those whose last state was 'running'. Other states
(stopped, starting, startup_failed, missing) register the slot
in the down state — avoiding crash-loops across restarts for a
gateway that was broken last boot. Per-profile outcome is recorded
to $HERMES_HOME/logs/container-boot.log.
Implementation: hermes_cli/container_boot.py + 12 unit tests.
Profile-marker is SOUL.md, not config.yaml, because `hermes profile
create` only seeds SOUL.md by default (config.yaml comes from
`hermes setup`).
Task 4.1 / 4.2 — profile create/delete hooks:
hermes_cli/profiles.py::create_profile now calls
_maybe_register_gateway_service(<canon>) at the end, which routes
through ServiceManager.register_profile_gateway when running on s6
and no-ops on host backends. delete_profile mirrors with
_maybe_unregister_gateway_service. _allocate_gateway_port produces
a deterministic SHA-256-derived port in [9200, 9800).
Task 4.3 — gateway dispatch + rejection arms demoted to fallback:
_dispatch_via_service_manager_if_s6(action) intercepts
start/stop/restart at the top of each subcommand and routes them
through S6ServiceManager.{start,stop,restart}. The pre-Phase-4
`elif is_container():` rejection arms are kept as fallback for
pre-s6 containers / unsupported runtimes, but only ever fire when
detect_service_manager() != 's6'. install/uninstall under s6
print informational guidance pointing users at profile create/delete.
Removed the two xfail(strict=True) markers from
tests/docker/test_profile_gateway.py — both tests now pass strictly.
Task 4.4 — status reporting:
get_gateway_runtime_snapshot() reports
Manager: 's6 (container supervisor)' inside an s6 container instead
of 'docker (foreground)'.
Plan-vs-reality drift fixed in this commit:
- Plan's S6ServiceManager._render_run_script used
`gateway start --foreground --port {port}` — invented args; the
real CLI is `gateway run`. Switched accordingly. port arg
retained for API parity but now documented as 'currently ignored'.
- Plan's reconciler keyed on config.yaml; switched to SOUL.md
(config.yaml is created by hermes setup, not by hermes profile
create, so the original gate caught nothing).
- The plan's _dispatch helper used _profile_arg() which returns
'--profile <name>' (i.e. with the flag prefix). Switched to
_profile_suffix() which returns the bare name.
- Architecture B's docker exec doesn't get /command on PATH or
the venv on PATH; Dockerfile's runtime PATH now includes
/opt/hermes/.venv/bin so 'docker exec <c> hermes ...' works
without sourcing the venv.
- stage2-hook now chowns $HERMES_HOME/profiles to hermes on every
boot, not just on the UID-remap path. Without this, files created
by docker-exec-as-root accumulate and the next reconciler run
fails with PermissionError reading SOUL.md.
Test harness:
19 passed, 0 xfailed (the two pre-Phase-4 xfail targets flip to
passing). 78 unit tests across service_manager + container_boot +
profiles_s6_hooks + gateway_s6_dispatch. Hadolint + shellcheck
pass cleanly.
Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md
235 lines
8.2 KiB
Python
235 lines
8.2 KiB
Python
"""Tests for hermes_cli.container_boot — the cont-init.d-time
|
|
reconciliation that recreates per-profile gateway s6 service slots
|
|
from the persistent profiles directory.
|
|
|
|
These tests run against a fake $HERMES_HOME under tmp_path; no real
|
|
s6 supervision tree is required. The in-container integration test
|
|
covering end-to-end "docker restart" survival lives in
|
|
tests/docker/test_container_restart.py.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from hermes_cli.container_boot import (
|
|
ReconcileAction,
|
|
reconcile_profile_gateways,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures + helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_profile(
    hermes_home: Path,
    name: str,
    *,
    state: str | None,
    with_pid: bool = False,
    config: bool = True,
) -> Path:
    """Create a fake profile directory under hermes_home/profiles/<name>/.

    Args:
        hermes_home: Root of the fake $HERMES_HOME (normally ``tmp_path``).
        name: Name of the profile directory to create.
        state: Value stored under ``gateway_state`` in ``gateway_state.json``;
            ``None`` skips writing the state file altogether.
        with_pid: Also write ``gateway.pid`` and ``processes.json`` into the
            profile directory.
        config: Write the ``SOUL.md`` marker file. Despite the name, this
            controls ``SOUL.md``; no ``config.yaml`` is created.

    Returns:
        Path of the newly created profile directory.

    Raises:
        FileExistsError: If the profile directory already exists.
    """
    profile_dir = hermes_home / "profiles" / name
    profile_dir.mkdir(parents=True)

    # Every file is written with an explicit encoding so the fixture does not
    # depend on the locale of the machine running the tests.
    if config:
        # SOUL.md is the marker the reconciler keys on — it's always seeded
        # by `hermes profile create`.
        (profile_dir / "SOUL.md").write_text(
            "# fake profile\n", encoding="utf-8",
        )
    if state is not None:
        (profile_dir / "gateway_state.json").write_text(
            json.dumps({"gateway_state": state, "timestamp": 1234567890}),
            encoding="utf-8",
        )
    if with_pid:
        (profile_dir / "gateway.pid").write_text(
            json.dumps({"pid": 99999, "host": "old-container"}),
            encoding="utf-8",
        )
        (profile_dir / "processes.json").write_text("[]", encoding="utf-8")
    return profile_dir
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_running_profile_is_registered_and_autostarted(tmp_path: Path) -> None:
    """A profile last seen 'running' gets a service slot and is started."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    expected = ReconcileAction(
        profile="coder",
        prior_state="running",
        action="started",
    )
    assert outcome == [expected]

    slot = service_root / "gateway-coder"
    run_script = slot / "run"
    assert run_script.exists()
    # Any execute bit (user, group or other) counts as executable.
    assert run_script.stat().st_mode & 0o111
    assert (slot / "type").read_text().strip() == "longrun"
    # An auto-started slot must not carry a down marker.
    assert not (slot / "down").exists()
|
|
|
|
|
|
def test_stopped_profile_is_registered_but_not_started(tmp_path: Path) -> None:
    """A profile last seen 'stopped' gets a slot that stays down."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    _make_profile(tmp_path, "writer", state="stopped")

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    expected = ReconcileAction(
        profile="writer",
        prior_state="stopped",
        action="registered",
    )
    assert outcome == [expected]
    # The down marker keeps the registered service from being started.
    down_marker = service_root / "gateway-writer" / "down"
    assert down_marker.exists()
|
|
|
|
|
|
def test_startup_failed_does_not_autostart(tmp_path: Path) -> None:
    """A gateway that failed to boot last time must not crash-loop on restart."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    _make_profile(tmp_path, "broken", state="startup_failed")

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    first = outcome[0]
    assert first.action == "registered"
    down_marker = service_root / "gateway-broken" / "down"
    assert down_marker.exists()
|
|
|
|
|
|
def test_starting_state_does_not_autostart(tmp_path: Path) -> None:
    """`starting` means the gateway died mid-boot last time; treat as
    failed, not as a candidate for auto-restart."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    _make_profile(tmp_path, "unlucky", state="starting")

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scandir, dry_run=False,
    )

    assert actions[0].action == "registered"
    # Check the on-disk down marker too, as the startup_failed test does:
    # the reported action alone does not prove the slot was left down.
    assert (scandir / "gateway-unlucky" / "down").exists()
|
|
|
|
|
|
def test_stale_runtime_files_are_removed(tmp_path: Path) -> None:
    """Reconciliation deletes gateway.pid and processes.json from the profile."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    profile_dir = _make_profile(
        tmp_path, "coder", state="running", with_pid=True,
    )
    pid_file = profile_dir / "gateway.pid"
    processes_file = profile_dir / "processes.json"
    # Precondition: the fixture really did create both runtime files.
    assert pid_file.exists()
    assert processes_file.exists()

    reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    assert not pid_file.exists()
    assert not processes_file.exists()
|
|
|
|
|
|
def test_profile_without_state_file_is_registered_but_not_started(
    tmp_path: Path,
) -> None:
    """A profile with no gateway_state.json (created but never started)
    gets its slot registered and left down."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    _make_profile(tmp_path, "fresh", state=None)

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    expected = ReconcileAction(
        profile="fresh",
        prior_state=None,
        action="registered",
    )
    assert outcome == [expected]
    down_marker = service_root / "gateway-fresh" / "down"
    assert down_marker.exists()
|
|
|
|
|
|
def test_directory_without_marker_file_is_skipped(tmp_path: Path) -> None:
    """A directory under profiles/ that lacks SOUL.md — the marker the
    reconciler keys on — is not a profile and must be ignored."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    # A bare directory: deliberately no SOUL.md inside.
    stray_dir = tmp_path / "profiles" / "stray"
    stray_dir.mkdir(parents=True)

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    assert outcome == []
    assert not (service_root / "gateway-stray").exists()
|
|
|
|
|
|
def test_corrupt_state_file_treated_as_no_prior_state(tmp_path: Path) -> None:
    """Malformed JSON in gateway_state.json must not abort the whole
    reconciliation; the slot is registered in the down state instead."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    profile_dir = _make_profile(tmp_path, "junk", state="running")
    # Overwrite the valid state file with something json cannot parse.
    state_file = profile_dir / "gateway_state.json"
    state_file.write_text("{ not valid json")

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    # NOTE(review): the test name promises "no prior state", yet
    # prior_state itself is not asserted — confirm the expected value.
    first = outcome[0]
    assert first.action == "registered"  # not "started"
    assert (service_root / "gateway-junk" / "down").exists()
|
|
|
|
|
|
def test_reconcile_log_is_written(tmp_path: Path) -> None:
    """Each profile's outcome appears in logs/container-boot.log."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    _make_profile(tmp_path, "a", state="running")
    _make_profile(tmp_path, "b", state="stopped")

    reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    log_path = tmp_path / "logs" / "container-boot.log"
    log_text = log_path.read_text()
    # One profile of each kind, so both action values must show up.
    assert "profile=a" in log_text
    assert "action=started" in log_text
    assert "profile=b" in log_text
    assert "action=registered" in log_text
|
|
|
|
|
|
def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None:
    """With dry_run=True the planned actions are returned but no file is
    created or removed."""
    scandir = tmp_path / "run-service"
    scandir.mkdir()
    profile = _make_profile(tmp_path, "coder", state="running", with_pid=True)

    actions = reconcile_profile_gateways(
        hermes_home=tmp_path, scandir=scandir, dry_run=True,
    )

    # The action list is still produced...
    assert actions == [ReconcileAction(
        profile="coder", prior_state="running", action="started",
    )]
    # ...but nothing on disk was touched.
    assert (profile / "gateway.pid").exists()  # not removed under dry_run
    # The fixture wrote processes.json as well, so it must survive too.
    assert (profile / "processes.json").exists()
    assert not (scandir / "gateway-coder").exists()
    assert not (tmp_path / "logs" / "container-boot.log").exists()
|
|
|
|
|
|
def test_missing_profiles_root_returns_empty(tmp_path: Path) -> None:
    """With no $HERMES_HOME/profiles directory at all (fresh install),
    reconciliation returns an empty list and does not raise."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_root,
        dry_run=False,
    )

    assert outcome == []
|
|
|
|
|
|
def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None:
    """A profile directory whose name breaks validate_profile_name's rules
    (uppercase, etc.) must raise rather than silently yield an invalid s6
    service dir."""
    service_root = tmp_path / "run-service"
    service_root.mkdir()
    _make_profile(tmp_path, "BadName", state="running")

    with pytest.raises(ValueError):
        reconcile_profile_gateways(
            hermes_home=tmp_path,
            scandir=service_root,
            dry_run=False,
        )
|