mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-08 08:11:38 +00:00
feat(docker): per-profile s6 supervision + container-restart reconciliation
Phase 4 of the s6-overlay supervision plan. Activates the Phase 3
S6ServiceManager by hooking it into the profile lifecycle and the
`hermes gateway start/stop/restart` dispatcher, and adds a cont-
init.d-time reconciliation pass that survives `docker restart`.
Task 4.0 — container-boot reconciliation:
/run/service/ is tmpfs, so every `docker restart` wipes every
per-profile gateway slot. /etc/cont-init.d/02-reconcile-profiles
invokes hermes_cli.container_boot.reconcile_profile_gateways() on
every boot, which walks $HERMES_HOME/profiles/<name>/, reads each
gateway_state.json, recreates the s6 service slot, and auto-starts
only those whose last state was 'running'. Other states
(stopped, starting, startup_failed, missing) register the slot
in the down state — avoiding crash-loops across restarts for a
gateway that was broken last boot. Per-profile outcome is recorded
to $HERMES_HOME/logs/container-boot.log.
Implementation: hermes_cli/container_boot.py + 12 unit tests.
Profile-marker is SOUL.md, not config.yaml, because `hermes profile
create` only seeds SOUL.md by default (config.yaml comes from
`hermes setup`).
Task 4.1 / 4.2 — profile create/delete hooks:
hermes_cli/profiles.py::create_profile now calls
_maybe_register_gateway_service(<canon>) at the end, which routes
through ServiceManager.register_profile_gateway when running on s6
and no-ops on host backends. delete_profile mirrors with
_maybe_unregister_gateway_service. _allocate_gateway_port produces
a deterministic SHA-256-derived port in [9200, 9800).
Task 4.3 — gateway dispatch + remove rejection arms:
_dispatch_via_service_manager_if_s6(action) intercepts
start/stop/restart at the top of each subcommand and routes them
through S6ServiceManager.{start,stop,restart}. The pre-Phase-4
`elif is_container():` rejection arms are kept as fallback for
pre-s6 containers / unsupported runtimes, but only ever fire when
detect_service_manager() != 's6'. install/uninstall under s6
print informational guidance pointing users at profile create/delete.
Removed the two xfail(strict=True) markers from
tests/docker/test_profile_gateway.py — both tests now pass strictly.
Task 4.4 — status reporting:
get_gateway_runtime_snapshot() reports
Manager: 's6 (container supervisor)' inside an s6 container instead
of 'docker (foreground)'.
Plan-vs-reality drift fixed in this commit:
- Plan's S6ServiceManager._render_run_script used
`gateway start --foreground --port {port}` — invented args; the
real CLI is `gateway run`. Switched accordingly. port arg
retained for API parity but now documented as 'currently ignored'.
- Plan's reconciler keyed on config.yaml; switched to SOUL.md
(config.yaml is created by hermes setup, not by hermes profile
create, so the original gate caught nothing).
- The plan's _dispatch helper used _profile_arg() which returns
'--profile <name>' (i.e. with the flag prefix). Switched to
_profile_suffix() which returns the bare name.
- Architecture B's docker exec doesn't get /command on PATH or
the venv on PATH; Dockerfile's runtime PATH now includes
/opt/hermes/.venv/bin so 'docker exec <c> hermes ...' works
without sourcing the venv.
- stage2-hook now chowns $HERMES_HOME/profiles to hermes on every
boot, not just on the UID-remap path. Without this, files created
by docker-exec-as-root accumulate and the next reconciler run
fails with PermissionError reading SOUL.md.
Test harness:
19 passed, 0 xfailed (the two pre-Phase-4 xfail targets flip to
passing). 78 unit tests across service_manager + container_boot +
profiles_s6_hooks + gateway_s6_dispatch. Hadolint + shellcheck
pass cleanly.
Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md
This commit is contained in:
parent
0abf661f71
commit
2afefc501c
13 changed files with 1217 additions and 39 deletions
168
tests/docker/test_container_restart.py
Normal file
168
tests/docker/test_container_restart.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
"""Container-restart survives per-profile gateway registrations.
|
||||
|
||||
The s6 dynamic scandir at /run/service/ lives on tmpfs and is wiped
|
||||
on every container restart. Phase 4 Task 4.0's container_boot module
|
||||
+ cont-init.d/02-reconcile-profiles regenerate the service slots from
|
||||
$HERMES_HOME/profiles/<name>/gateway_state.json on every boot and
|
||||
auto-start only those whose last state was `running`.
|
||||
|
||||
These tests stand up a container with a named volume, create profiles
|
||||
inside it in various gateway states, restart the container, and
|
||||
assert the reconciler did the right thing.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]:
|
||||
return subprocess.run(
|
||||
["docker", *args],
|
||||
capture_output=True, text=True, timeout=kw.pop("timeout", 60),
|
||||
**kw,
|
||||
)
|
||||
|
||||
|
||||
def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]:
|
||||
return _docker("exec", container, *args, timeout=timeout)
|
||||
|
||||
|
||||
def _sh(container: str, cmd: str, timeout: int = 30) -> subprocess.CompletedProcess[str]:
|
||||
return _docker("exec", container, "sh", "-c", cmd, timeout=timeout)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def restart_container(request, built_image: str):
|
||||
"""A long-running container with a named volume so docker restart
|
||||
preserves $HERMES_HOME/profiles/."""
|
||||
safe = request.node.name.replace("[", "_").replace("]", "_")
|
||||
name = f"hermes-restart-{safe}"
|
||||
volume = f"hermes-restart-vol-{safe}"
|
||||
_docker("rm", "-f", name)
|
||||
_docker("volume", "rm", "-f", volume)
|
||||
_docker("volume", "create", volume, timeout=10).check_returncode()
|
||||
r = _docker(
|
||||
"run", "-d", "--name", name,
|
||||
"-v", f"{volume}:/opt/data",
|
||||
built_image, "sleep", "infinity",
|
||||
timeout=30,
|
||||
)
|
||||
r.check_returncode()
|
||||
# Give s6 + stage2 + 02-reconcile a moment to come up cleanly on
|
||||
# the fresh volume.
|
||||
time.sleep(5)
|
||||
yield name
|
||||
_docker("rm", "-f", name)
|
||||
_docker("volume", "rm", "-f", volume)
|
||||
|
||||
|
||||
def test_running_gateway_survives_container_restart(restart_container: str) -> None:
|
||||
container = restart_container
|
||||
|
||||
# Create the profile + start its gateway. The Phase 4 hooks
|
||||
# register the s6 service slot during create and the dispatch
|
||||
# path brings it up via s6-svc -u.
|
||||
r = _exec(container, "hermes", "profile", "create", "coder")
|
||||
assert r.returncode == 0, f"profile create failed: {r.stderr}"
|
||||
|
||||
r = _exec(container, "hermes", "-p", "coder", "gateway", "start", timeout=60)
|
||||
assert r.returncode == 0, f"gateway start failed: {r.stderr}"
|
||||
|
||||
# Give the service time to actually come up under supervision.
|
||||
deadline = time.monotonic() + 15.0
|
||||
while time.monotonic() < deadline:
|
||||
r = _sh(container, "/command/s6-svstat /run/service/gateway-coder")
|
||||
if r.returncode == 0 and "up " in r.stdout:
|
||||
break
|
||||
time.sleep(0.5)
|
||||
assert "up " in r.stdout, f"gateway never came up pre-restart: {r.stdout!r}"
|
||||
|
||||
# Persist state so the reconciler will treat the slot as 'running'
|
||||
# post-restart. The gateway process itself writes gateway_state.json
|
||||
# via gateway/status.py — but we don't want to wait for or assert
|
||||
# against the live process here; just stamp the file directly to
|
||||
# exercise the reconciler's contract.
|
||||
write_state = (
|
||||
"import json, pathlib; "
|
||||
"p = pathlib.Path('/opt/data/profiles/coder/gateway_state.json'); "
|
||||
"p.write_text(json.dumps({'gateway_state': 'running', 'timestamp': 1}))"
|
||||
)
|
||||
_exec(container, "python3", "-c", write_state, timeout=10).check_returncode()
|
||||
|
||||
# Restart. After this, /run/service/ is empty until cont-init.d
|
||||
# runs the reconciler.
|
||||
_docker("restart", container, timeout=60).check_returncode()
|
||||
time.sleep(8) # stage2 + reconcile + svscan rescan
|
||||
|
||||
# Reconciler logged the action.
|
||||
r = _sh(container, "cat /opt/data/logs/container-boot.log")
|
||||
assert r.returncode == 0, f"reconcile log missing: {r.stderr}"
|
||||
assert "profile=coder" in r.stdout
|
||||
assert "action=started" in r.stdout
|
||||
|
||||
# Service slot exists.
|
||||
r = _sh(container, "test -d /run/service/gateway-coder")
|
||||
assert r.returncode == 0, "slot not recreated after restart"
|
||||
|
||||
# No `down` marker — we asked for auto-start.
|
||||
r = _sh(container, "test -f /run/service/gateway-coder/down")
|
||||
assert r.returncode != 0, "down marker present despite prior_state=running"
|
||||
|
||||
|
||||
def test_stopped_gateway_stays_stopped_after_restart(restart_container: str) -> None:
|
||||
container = restart_container
|
||||
|
||||
_exec(container, "hermes", "profile", "create", "writer").check_returncode()
|
||||
|
||||
# Write 'stopped' directly so we don't have to race against the
|
||||
# gateway's own state writes.
|
||||
write_state = (
|
||||
"import json, pathlib; "
|
||||
"p = pathlib.Path('/opt/data/profiles/writer/gateway_state.json'); "
|
||||
"p.write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1}))"
|
||||
)
|
||||
_exec(container, "python3", "-c", write_state, timeout=10).check_returncode()
|
||||
|
||||
_docker("restart", container, timeout=60).check_returncode()
|
||||
time.sleep(8)
|
||||
|
||||
# Slot exists.
|
||||
r = _sh(container, "test -d /run/service/gateway-writer")
|
||||
assert r.returncode == 0
|
||||
|
||||
# Down marker present.
|
||||
r = _sh(container, "test -f /run/service/gateway-writer/down")
|
||||
assert r.returncode == 0, "down marker missing despite prior_state=stopped"
|
||||
|
||||
|
||||
def test_stale_gateway_pid_cleaned_up_on_restart(restart_container: str) -> None:
|
||||
"""A dead container's gateway.pid + processes.json must NOT
|
||||
survive the restart — a numerically-equal live PID in the new
|
||||
container is a different process and would confuse the gateway
|
||||
process-mismatch checks."""
|
||||
container = restart_container
|
||||
|
||||
_exec(container, "hermes", "profile", "create", "ghost").check_returncode()
|
||||
|
||||
# Stamp stale runtime files alongside a 'running' state so the
|
||||
# reconciler walks this profile.
|
||||
stamp = (
|
||||
"import json, pathlib; "
|
||||
"p = pathlib.Path('/opt/data/profiles/ghost'); "
|
||||
"(p / 'gateway_state.json').write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1})); "
|
||||
"(p / 'gateway.pid').write_text(json.dumps({'pid': 99999, 'host': 'old'})); "
|
||||
"(p / 'processes.json').write_text('[]')"
|
||||
)
|
||||
_exec(container, "python3", "-c", stamp, timeout=10).check_returncode()
|
||||
|
||||
_docker("restart", container, timeout=60).check_returncode()
|
||||
time.sleep(8)
|
||||
|
||||
# Stale runtime files swept.
|
||||
r = _sh(container, "test -f /opt/data/profiles/ghost/gateway.pid")
|
||||
assert r.returncode != 0, "stale gateway.pid survived restart"
|
||||
r = _sh(container, "test -f /opt/data/profiles/ghost/processes.json")
|
||||
assert r.returncode != 0, "stale processes.json survived restart"
|
||||
|
|
@ -1,31 +1,26 @@
|
|||
"""Harness: per-profile gateway start/stop inside the container.
|
||||
|
||||
Phase 4 will change the *implementation* of these commands inside the
|
||||
container — they'll talk to s6 instead of refusing. The user-visible
|
||||
surface that should result is locked here.
|
||||
Phase 4 wires `hermes -p <profile> gateway start/stop` through the s6
|
||||
ServiceManager dispatch path inside the container — so the lifecycle
|
||||
commands now bring up an s6-supervised gateway rather than refusing
|
||||
with the pre-Phase-4 informational message.
|
||||
|
||||
NOTE: These tests are marked ``xfail(strict=True)`` until Phase 4 lands.
|
||||
The current tini image deliberately refuses gateway start/stop inside
|
||||
containers — ``pgrep`` finds nothing and the tests fail. After Phase 4
|
||||
they should flip to passing automatically; ``strict=True`` means an
|
||||
unexpected pass also fails the test, protecting against side-channel
|
||||
fixes outside the planned Phase 4 mechanism.
|
||||
These tests were marked ``xfail(strict=True)`` through Phase 0–3 and
|
||||
flip to plain ``test_…`` once Phase 4 lands (now).
|
||||
|
||||
NB: The harness profile created here has no model/auth configured,
|
||||
so the gateway process itself will exit with code 1 on every start
|
||||
attempt (s6 will keep restarting it). We assert against s6's
|
||||
``want up`` / ``want down`` state — which reflects the lifecycle
|
||||
command's intent, not the supervised process's health.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
PROFILE = "test-harness-profile"
|
||||
|
||||
_PHASE4_REASON = (
|
||||
"Phase 4 not yet landed: container-side `hermes gateway start` "
|
||||
"currently exits 0 with an informational message instead of "
|
||||
"spawning/supervising a gateway. Remove this marker after Task 4.3."
|
||||
)
|
||||
|
||||
|
||||
def _sh(
|
||||
container: str, command: str, timeout: int = 30,
|
||||
|
|
@ -36,7 +31,14 @@ def _sh(
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True)
|
||||
def _svstat(container: str) -> str:
|
||||
"""Returns the raw s6-svstat output for the test profile's slot.
|
||||
/command/s6-svstat is called by absolute path because /command/
|
||||
isn't on PATH for docker-exec sessions."""
|
||||
r = _sh(container, f"/command/s6-svstat /run/service/gateway-{PROFILE}")
|
||||
return r.stdout if r.returncode == 0 else ""
|
||||
|
||||
|
||||
def test_profile_create_then_gateway_start(
|
||||
built_image: str, container_name: str,
|
||||
) -> None:
|
||||
|
|
@ -50,30 +52,35 @@ def test_profile_create_then_gateway_start(
|
|||
r = _sh(container_name, f"hermes profile create {PROFILE}")
|
||||
assert r.returncode == 0, f"profile create failed: {r.stderr}"
|
||||
|
||||
# Profile create's s6-register hook should have produced a service slot.
|
||||
r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}")
|
||||
assert r.returncode == 0, "s6 service slot not created on profile create"
|
||||
|
||||
r = _sh(container_name, f"hermes -p {PROFILE} gateway start", timeout=60)
|
||||
assert r.returncode == 0, (
|
||||
f"gateway start failed: stderr={r.stderr!r} stdout={r.stdout!r}"
|
||||
)
|
||||
|
||||
time.sleep(3)
|
||||
|
||||
r = _sh(container_name, f"pgrep -f 'gateway.*{PROFILE}'")
|
||||
assert r.returncode == 0, "gateway process not running"
|
||||
# After start, s6's intent is "up" — even if the supervised gateway
|
||||
# process spin-fails (no model/auth in the test profile), the
|
||||
# supervision-state contract holds.
|
||||
time.sleep(2)
|
||||
state = _svstat(container_name)
|
||||
assert "want up" in state, f"want up not in svstat: {state!r}"
|
||||
|
||||
r = _sh(container_name, f"hermes -p {PROFILE} gateway stop", timeout=30)
|
||||
assert r.returncode == 0
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
r = _sh(container_name, f"pgrep -f 'gateway.*{PROFILE}'")
|
||||
assert r.returncode != 0, "gateway process still running after stop"
|
||||
state = _svstat(container_name)
|
||||
assert "want up" not in state, f"want up still in svstat: {state!r}"
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason=_PHASE4_REASON, strict=True)
|
||||
def test_profile_delete_stops_gateway(
|
||||
built_image: str, container_name: str,
|
||||
) -> None:
|
||||
"""Deleting a profile should stop its gateway if running."""
|
||||
"""Deleting a profile should stop its gateway and remove the s6
|
||||
service slot."""
|
||||
subprocess.run(
|
||||
["docker", "run", "-d", "--name", container_name, built_image,
|
||||
"sleep", "120"],
|
||||
|
|
@ -90,8 +97,9 @@ def test_profile_delete_stops_gateway(
|
|||
f"hermes profile delete {PROFILE} --yes",
|
||||
timeout=30,
|
||||
)
|
||||
assert r.returncode == 0
|
||||
assert r.returncode == 0, f"profile delete failed: {r.stderr}"
|
||||
|
||||
time.sleep(2)
|
||||
r = _sh(container_name, f"pgrep -f 'gateway.*{PROFILE}'")
|
||||
assert r.returncode != 0, "gateway still running after profile delete"
|
||||
# Service slot should be gone.
|
||||
r = _sh(container_name, f"test -d /run/service/gateway-{PROFILE}")
|
||||
assert r.returncode != 0, "s6 service slot still present after profile delete"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue