mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
Phase 4 of the s6-overlay supervision plan. Activates the Phase 3
S6ServiceManager by hooking it into the profile lifecycle and the
`hermes gateway start/stop/restart` dispatcher, and adds a
cont-init.d-time reconciliation pass so that per-profile gateway
slots are recreated after every `docker restart`.
Task 4.0 — container-boot reconciliation:
/run/service/ is tmpfs, so every `docker restart` wipes every
per-profile gateway slot. /etc/cont-init.d/02-reconcile-profiles
invokes hermes_cli.container_boot.reconcile_profile_gateways() on
every boot, which walks $HERMES_HOME/profiles/<name>/, reads each
gateway_state.json, recreates the s6 service slot, and auto-starts
only those whose last state was 'running'. Other states
(stopped, starting, startup_failed, missing) register the slot
in the down state — avoiding crash-loops across restarts for a
gateway that was broken last boot. Per-profile outcome is recorded
to $HERMES_HOME/logs/container-boot.log.
Implementation: hermes_cli/container_boot.py + 12 unit tests.
Profile-marker is SOUL.md, not config.yaml, because `hermes profile
create` only seeds SOUL.md by default (config.yaml comes from
`hermes setup`).
Task 4.1 / 4.2 — profile create/delete hooks:
hermes_cli/profiles.py::create_profile now calls
_maybe_register_gateway_service(<canon>) at the end, which routes
through ServiceManager.register_profile_gateway when running on s6
and no-ops on host backends. delete_profile mirrors with
_maybe_unregister_gateway_service. _allocate_gateway_port produces
a deterministic SHA-256-derived port in [9200, 9800).
Task 4.3 — gateway dispatch + remove rejection arms:
_dispatch_via_service_manager_if_s6(action) intercepts
start/stop/restart at the top of each subcommand and routes them
through S6ServiceManager.{start,stop,restart}. The pre-Phase-4
`elif is_container():` rejection arms are kept as fallback for
pre-s6 containers / unsupported runtimes, but only ever fire when
detect_service_manager() != 's6'. install/uninstall under s6
print informational guidance pointing users at profile create/delete.
Removed the two xfail(strict=True) markers from
tests/docker/test_profile_gateway.py — both tests now pass strictly.
Task 4.4 — status reporting:
get_gateway_runtime_snapshot() reports
Manager: 's6 (container supervisor)' inside an s6 container instead
of 'docker (foreground)'.
Plan-vs-reality drift fixed in this commit:
- Plan's S6ServiceManager._render_run_script used
`gateway start --foreground --port {port}` — invented args; the
real CLI is `gateway run`. Switched accordingly. port arg
retained for API parity but now documented as 'currently ignored'.
- Plan's reconciler keyed on config.yaml; switched to SOUL.md
(config.yaml is created by hermes setup, not by hermes profile
create, so the original gate caught nothing).
- The plan's _dispatch helper used _profile_arg() which returns
'--profile <name>' (i.e. with the flag prefix). Switched to
_profile_suffix() which returns the bare name.
- Architecture B's docker exec doesn't get /command on PATH or
the venv on PATH; Dockerfile's runtime PATH now includes
/opt/hermes/.venv/bin so 'docker exec <c> hermes ...' works
without sourcing the venv.
- stage2-hook now chowns $HERMES_HOME/profiles to hermes on every
boot, not just on the UID-remap path. Without this, files created
by docker-exec-as-root accumulate and the next reconciler run
fails with PermissionError reading SOUL.md.
Test harness:
19 passed, 0 xfailed (the two pre-Phase-4 xfail targets flip to
passing). 78 unit tests across service_manager + container_boot +
profiles_s6_hooks + gateway_s6_dispatch. Hadolint + shellcheck
pass cleanly.
Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md
218 lines
8 KiB
Python
218 lines
8 KiB
Python
"""Container-boot reconciliation of per-profile gateway s6 services.
|
|
|
|
Service directories under /run/service/ live on **tmpfs** and are wiped
|
|
on every container restart. Profile directories under
|
|
``$HERMES_HOME/profiles/<name>/`` live on the persistent VOLUME, and
|
|
each one records its gateway's last state in ``gateway_state.json``.
|
|
This module bridges the two: on every container boot, walk the
|
|
persistent profiles, recreate the s6 service slots, and auto-start
|
|
only those whose last recorded state was ``running``.
|
|
|
|
Wired into the image as /etc/cont-init.d/02-reconcile-profiles by the
|
|
Dockerfile (Phase 4 Task 4.0). Runs as root after 01-hermes-setup
|
|
(the stage2 hook) has chowned the volume and seeded $HERMES_HOME, but
|
|
before s6-rc starts user services.
|
|
|
|
Without this module, every ``docker restart`` would silently wipe
|
|
every per-profile gateway, even though the user's profiles still
|
|
exist on disk.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Literal
|
|
|
|
log = logging.getLogger(__name__)

# Prior gateway states that qualify a profile for automatic start at
# boot. Only ``running`` is listed: any other recorded state
# (startup_failed, starting, stopped) or a missing state leaves the slot
# registered but down until the user acts explicitly, so a gateway that
# was already broken cannot crash-loop across `docker restart` cycles.
_AUTOSTART_STATES = frozenset(("running",))

# Per-profile runtime files swept before service slots are recreated.
# They hold container-namespaced state (PIDs, process tables) that is
# garbage after a restart: a numerically-equal PID in the new container
# is a different process. See the Risk Register in the plan.
_STALE_RUNTIME_FILES = (
    "gateway.pid",
    "processes.json",
)
|
|
|
|
# Closed set of outcome labels a reconciliation pass can report.
ReconcileActionLabel = Literal[
    "started",
    "registered",
    "skipped",
]


@dataclass(frozen=True)
class ReconcileAction:
    """Immutable record of what one reconciliation pass did for a profile."""

    # Name of the profile this record describes.
    profile: str
    # Last recorded gateway state for the profile, or None when there
    # was none.
    prior_state: str | None
    # Outcome label for this profile.
    action: ReconcileActionLabel
|
|
|
|
|
|
def reconcile_profile_gateways(
    *,
    hermes_home: Path,
    scandir: Path,
    dry_run: bool = False,
) -> list[ReconcileAction]:
    """Recreate s6 service registrations for every persistent profile.

    A directory under ``<hermes_home>/profiles/`` is treated as a profile
    only when it contains ``SOUL.md``. Each profile's slot is recreated
    and started only if its last recorded state is in
    ``_AUTOSTART_STATES``; otherwise the slot is registered down.

    A failure while processing one profile is logged and recorded as
    ``"skipped"``; the remaining profiles are still processed and the
    boot log is still written.

    Args:
        hermes_home: The container's HERMES_HOME (typically /opt/data).
            Profiles live under ``<hermes_home>/profiles/<name>/``.
        scandir: The s6 dynamic scandir (typically /run/service). Service
            directories are created at ``<scandir>/gateway-<profile>/``.
        dry_run: When True, walk and return the action list without
            touching the filesystem. For tests and `--dry-run` debug.

    Returns:
        One :class:`ReconcileAction` per profile, sorted by profile
        directory path. Empty when the profiles directory is missing.
    """
    actions: list[ReconcileAction] = []
    profiles_root = hermes_home / "profiles"
    if not profiles_root.is_dir():
        return actions

    for entry in sorted(profiles_root.iterdir()):
        if not entry.is_dir():
            continue
        # SOUL.md is always seeded by `hermes profile create` (config.yaml
        # is not — that comes later via `hermes setup`). Use it as the
        # "real profile" marker so stray dirs (backups, manual mkdir)
        # aren't picked up.
        if not (entry / "SOUL.md").exists():
            continue

        prior_state = _read_prior_state(entry)
        # The isinstance guard keeps a malformed (possibly unhashable)
        # state value from raising TypeError in the set-membership test.
        should_start = (
            isinstance(prior_state, str) and prior_state in _AUTOSTART_STATES
        )
        label: ReconcileActionLabel = (
            "started" if should_start else "registered"
        )

        if not dry_run:
            try:
                _cleanup_stale_runtime_files(entry)
                _register_service(scandir, entry.name, start=should_start)
            except Exception:
                # Per-profile isolation boundary: one bad profile (invalid
                # directory name, permission error, ...) must not prevent
                # every other gateway from being registered at boot.
                log.exception(
                    "could not reconcile profile %s; skipping", entry.name,
                )
                label = "skipped"

        actions.append(ReconcileAction(
            profile=entry.name,
            prior_state=prior_state,
            action=label,
        ))

    if not dry_run:
        _write_reconcile_log(hermes_home, actions)
    return actions
|
|
|
|
|
|
def _read_prior_state(profile_dir: Path) -> str | None:
    """Return the ``gateway_state`` recorded in gateway_state.json.

    Anything other than a readable JSON object with a string
    ``gateway_state`` counts as "no prior state", so one corrupt or
    oddly shaped file cannot break the whole reconciliation.

    Args:
        profile_dir: The profile directory expected to hold
            ``gateway_state.json``.

    Returns:
        The recorded state string, or None when the file is missing,
        unreadable, not valid UTF-8 JSON, not a JSON object, or its
        ``gateway_state`` value is absent or not a string.
    """
    state_file = profile_dir / "gateway_state.json"
    try:
        payload = json.loads(state_file.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # No state file: nothing to warn about.
        return None
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for non-UTF-8 bytes.
        log.warning(
            "could not read %s; treating as no prior state", state_file,
        )
        return None

    # Valid JSON need not be an object (e.g. a list or a bare number).
    if not isinstance(payload, dict):
        return None
    state = payload.get("gateway_state")
    return state if isinstance(state, str) else None
|
|
|
|
|
|
def _cleanup_stale_runtime_files(profile_dir: Path) -> None:
    """Delete the files named in ``_STALE_RUNTIME_FILES`` from a profile.

    Those files (gateway.pid, processes.json) reference PIDs in the dead
    container's process namespace and would otherwise confuse the
    newly-started gateway's process-mismatch checks.

    Args:
        profile_dir: The profile directory to sweep.
    """
    stale_paths = [profile_dir / filename for filename in _STALE_RUNTIME_FILES]
    for stale in stale_paths:
        # missing_ok: an absent file is not an error.
        stale.unlink(missing_ok=True)
|
|
|
|
|
|
def _register_service(scandir: Path, profile: str, *, start: bool) -> None:
    """Recreate the s6 service slot for one profile.

    Builds ``<scandir>/gateway-<profile>/`` with a ``type`` file, an
    executable ``run`` script, an executable ``log/run`` script, and a
    ``down`` marker that is present exactly when ``start`` is False.

    Mirrors the rendering in :func:`S6ServiceManager.register_profile_gateway`,
    but controls the start state directly via the ``down`` marker file
    (s6-svscan honors it on rescan). The manager cannot be used directly
    because the cont-init.d phase runs as root before s6-svscan starts
    scanning the dynamic scandir — the manager's ``s6-svscanctl -a`` call
    would fail with no control socket.

    Args:
        scandir: The s6 dynamic scandir that holds the service slots.
        profile: Profile name; checked with ``validate_profile_name``
            before anything is written.
        start: True to leave the slot eligible for start, False to
            register it down.
    """
    from hermes_cli.service_manager import (
        S6ServiceManager,
        validate_profile_name,
    )

    def _install_script(target: Path, body: str) -> None:
        # Write the script text, then mark it rwxr-xr-x.
        target.write_text(body)
        target.chmod(0o755)

    validate_profile_name(profile)
    slot = scandir / f"gateway-{profile}"
    slot.mkdir(parents=True, exist_ok=True)

    slot.joinpath("type").write_text("longrun\n")

    # The run script comes from the manager's own renderer so that
    # register_profile_gateway and reconcile_profile_gateways stay
    # consistent. extra_env is empty here; users who need per-profile
    # env can set it via the profile's config.yaml (which the gateway
    # itself loads).
    _install_script(
        slot / "run",
        S6ServiceManager._render_run_script(profile, port=0, extra_env={}),
    )

    # Persistent log rotation (OQ8-C).
    log_dir = slot / "log"
    log_dir.mkdir(exist_ok=True)
    _install_script(log_dir / "run", S6ServiceManager._render_log_run(profile))

    # A `down` file tells s6-supervise NOT to start the service when
    # s6-svscan picks it up. The user then brings it up explicitly with
    # `hermes -p <profile> gateway start` (which routes through the
    # Phase 4 _dispatch_via_service_manager_if_s6 helper to `s6-svc -u`).
    down_file = slot / "down"
    if not start:
        down_file.touch()
    else:
        down_file.unlink(missing_ok=True)
|
|
|
|
|
|
def _write_reconcile_log(
    hermes_home: Path, actions: list[ReconcileAction],
) -> None:
    """Append one line per profile to $HERMES_HOME/logs/container-boot.log.

    The ``logs`` directory is created if needed, and every line written
    by one call carries the same local timestamp.

    Operators read this file to debug "why didn't my profile come back
    up". It is kept separate from agent.log so troubleshooters can grep
    for "profile=foo" without wading through unrelated activity.

    Args:
        hermes_home: The HERMES_HOME directory that holds ``logs/``.
        actions: Outcomes to record, written in the given order.
    """
    import time

    log_path = hermes_home / "logs" / "container-boot.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)
    stamp = time.strftime("%Y-%m-%dT%H:%M:%S%z")
    with log_path.open("a", encoding="utf-8") as sink:
        sink.writelines(
            f"{stamp} profile={act.profile} prior_state={act.prior_state} "
            f"action={act.action}\n"
            for act in actions
        )
|
|
|
|
|
|
def main() -> int:
    """Run one reconciliation pass and print a line per profile.

    Entry point invoked from /etc/cont-init.d/02-reconcile-profiles.
    HERMES_HOME (default /opt/data) and S6_PROFILE_GATEWAY_SCANDIR
    (default /run/service) are read from the environment.

    Returns:
        Always 0.
    """
    env = os.environ
    outcome = reconcile_profile_gateways(
        hermes_home=Path(env.get("HERMES_HOME", "/opt/data")),
        scandir=Path(env.get("S6_PROFILE_GATEWAY_SCANDIR", "/run/service")),
    )
    for item in outcome:
        summary = (
            f"reconcile: profile={item.profile} "
            f"prior_state={item.prior_state} action={item.action}"
        )
        print(summary)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|