hermes-agent/tests/docker/test_s6_profile_gateway_integration.py
Ben ad5fdab092 feat(service_manager): add S6ServiceManager for runtime gateway supervision
Phase 3 of the s6-overlay supervision plan. Implements the runtime-
registration surface from D4 — only the s6 backend supports
register_profile_gateway / unregister_profile_gateway /
list_profile_gateways; host backends continue to raise
NotImplementedError. No caller yet (Phase 4 wires in the profile
create/delete hooks).

Key implementation notes:

  - Service directory shape: /run/service/gateway-<profile>/{type,run,log/run}.
    Atomic register: write to gateway-<profile>.tmp, fsync via
    os.rename. Cleanup on rescan failure.

  - Run script uses #!/command/with-contenv sh so HERMES_HOME and any
    extra_env arrive at exec time. The hermes -p <profile> gateway
    start --foreground --port <port> command is wrapped in
    s6-setuidgid hermes for the per-service privilege drop (OQ2-A).

  - Log script (OQ8-C): persists via s6-log to
    ${HERMES_HOME}/logs/gateways/<profile>/. CRITICAL — HERMES_HOME is
    a runtime env-var expansion in the rendered script, NOT a Python
    f-string substitution. Negative-asserted in
    test_s6_register_creates_service_dir_and_triggers_scan so
    regressions are caught.

  - PATH gotcha: /command/ is only on PATH for processes spawned by
    the supervision tree (services, cont-init.d). `docker exec` and
    profile-create hooks don't get it. S6ServiceManager calls all
    s6-* binaries via absolute path through the new _S6_BIN_DIR
    constant so callers don't have to fix up env vars.

  - validate_profile_name rejects path-traversal, leading-dash (s6
    would parse as a flag), uppercase, whitespace, and names >251
    chars (s6-svscan default name_max).

Test coverage:
  - 13 new unit tests in tests/hermes_cli/test_service_manager.py
    (kind detection, run-script content, env quoting, register
    rollback on rescan failure, unregister idempotence, list filter,
    lifecycle dispatch, svstat parsing). Total: 36 passing.
  - 2 new in-container integration tests in
    tests/docker/test_s6_profile_gateway_integration.py validating
    end-to-end registration against a real s6 supervision tree.

Docker harness: 14 passed, 2 xfailed (Phase 4 target unchanged).

Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md
2026-05-22 11:47:41 +10:00

124 lines
4.7 KiB
Python

"""Harness: in-container integration tests for S6ServiceManager.
The unit tests in tests/hermes_cli/test_service_manager.py exercise the
class against a tmp-path scandir with a stubbed ``subprocess.run``.
These tests run the real class inside a real container against the
real s6-svc / s6-svscanctl binaries, validating end-to-end.
Phase 3 only registers the service slot — it doesn't depend on the
gateway actually starting (the binary will refuse to start without a
valid profile config). The full register → start → supervised-restart
→ unregister cycle is covered by Phase 4 once profile create/delete
hooks land.
"""
from __future__ import annotations
import subprocess
import time
# Python program executed inside the container via ``python3 -c``. It puts
# /opt/hermes first on sys.path, registers a gateway slot for the
# "phase3test" profile on port 9301, and prints the "REGISTERED" sentinel
# that the tests look for on stdout.
_REGISTER_SCRIPT = """
import sys
sys.path.insert(0, "/opt/hermes")
from hermes_cli.service_manager import S6ServiceManager
S6ServiceManager().register_profile_gateway("phase3test", port=9301)
# Don't worry about whether the gateway actually starts — we only care
# that the supervision slot was created. The gateway run script will
# likely error out (no profile config exists) but that's expected.
print("REGISTERED")
"""
# Counterpart to the register script, also executed inside the container via
# ``python3 -c``: unregisters the "phase3test" gateway and prints the
# "UNREGISTERED" sentinel that the tests look for on stdout.
_UNREGISTER_SCRIPT = """
import sys
sys.path.insert(0, "/opt/hermes")
from hermes_cli.service_manager import S6ServiceManager
S6ServiceManager().unregister_profile_gateway("phase3test")
print("UNREGISTERED")
"""
def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess:
    """Run ``docker exec <container> <args...>`` and return the result.

    stdout and stderr are captured and decoded as text. The exit status is
    NOT checked here; callers inspect ``returncode`` / ``stdout`` themselves.

    Args:
        container: Name of the running container to execute in.
        *args: Command and arguments to run inside the container.
        timeout: Seconds to wait before ``subprocess.run`` gives up.

    Returns:
        The ``CompletedProcess`` for the ``docker exec`` invocation.
    """
    command = ["docker", "exec", container]
    command.extend(args)
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    return completed
def test_s6_register_creates_service_dir_in_live_container(
    built_image: str, container_name: str,
) -> None:
    """Registering a profile gateway creates a supervised service slot.

    Starts a container from ``built_image``, runs ``_REGISTER_SCRIPT`` inside
    it, and then checks that:

    * ``/run/service/gateway-phase3test/`` exists with ``run`` and
      ``log/run`` files,
    * ``s6-svstat`` succeeds against that directory, and
    * ``list_profile_gateways()`` reports the profile.

    Args:
        built_image: Image reference to start the container from (pytest
            fixture).
        container_name: Name to give the container (pytest fixture).
            NOTE(review): presumably the fixture removes the container on
            teardown; this test does not clean it up itself. Confirm.
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "120"],
        check=True, capture_output=True, timeout=30,
    )
    # Give the supervision tree a moment to come up.
    time.sleep(3)

    r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
    assert "REGISTERED" in r.stdout, (
        f"register failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Service directory exists with the expected structure.
    expected_paths = (
        ("-d", "/run/service/gateway-phase3test",
         "service directory not created"),
        ("-f", "/run/service/gateway-phase3test/run",
         "run script not created"),
        ("-f", "/run/service/gateway-phase3test/log/run",
         "log/run script not created"),
    )
    for flag, path, failure in expected_paths:
        r = _exec(container_name, "test", flag, path)
        assert r.returncode == 0, failure

    # s6-svscan picked it up — s6-svstat works against the dir.
    # `docker exec` doesn't put /command/ on PATH (only the supervision
    # tree does), so call s6-svstat by absolute path.
    r = _exec(container_name, "/command/s6-svstat",
              "/run/service/gateway-phase3test")
    assert r.returncode == 0, f"s6-svstat failed: {r.stderr or r.stdout}"

    # list_profile_gateways picks it up. Use the same sys.path bootstrap as
    # the register script so both steps import the same hermes_cli package.
    list_script = (
        "import sys;"
        "sys.path.insert(0, '/opt/hermes');"
        "from hermes_cli.service_manager import S6ServiceManager;"
        "print(S6ServiceManager().list_profile_gateways())"
    )
    r = _exec(container_name, "python3", "-c", list_script)
    assert r.returncode == 0, (
        f"list_profile_gateways failed: stderr={r.stderr!r} "
        f"stdout={r.stdout!r}"
    )
    assert "phase3test" in r.stdout, (
        f"list output: {r.stdout!r} stderr={r.stderr!r}"
    )
def test_s6_unregister_removes_service_dir_in_live_container(
    built_image: str, container_name: str,
) -> None:
    """Unregistering a profile gateway removes its service slot.

    Starts a container from ``built_image``, registers the ``phase3test``
    gateway, unregisters it again, and then checks that the service
    directory is gone and ``list_profile_gateways()`` no longer reports it.

    Args:
        built_image: Image reference to start the container from (pytest
            fixture).
        container_name: Name to give the container (pytest fixture).
            NOTE(review): presumably the fixture removes the container on
            teardown; this test does not clean it up itself. Confirm.
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "120"],
        check=True, capture_output=True, timeout=30,
    )
    # Give the supervision tree a moment to come up.
    time.sleep(3)

    # First register so we have something to unregister.
    r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
    assert "REGISTERED" in r.stdout, (
        f"register failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Then unregister.
    r = _exec(container_name, "python3", "-c", _UNREGISTER_SCRIPT, timeout=30)
    assert "UNREGISTERED" in r.stdout, (
        f"unregister failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Directory is gone.
    r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
    assert r.returncode != 0, "service directory still exists after unregister"

    # list_profile_gateways no longer includes it. Use the same sys.path
    # bootstrap as the register/unregister scripts so all steps import the
    # same hermes_cli package.
    list_script = (
        "import sys;"
        "sys.path.insert(0, '/opt/hermes');"
        "from hermes_cli.service_manager import S6ServiceManager;"
        "print(S6ServiceManager().list_profile_gateways())"
    )
    r = _exec(container_name, "python3", "-c", list_script)
    # Check the exit status first: if the listing crashed, stdout is empty
    # and the negative assertion below would pass without proving anything.
    assert r.returncode == 0, (
        f"list_profile_gateways failed: stderr={r.stderr!r} "
        f"stdout={r.stdout!r}"
    )
    assert "phase3test" not in r.stdout, (
        f"profile still listed after unregister: {r.stdout!r}"
    )