mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-08 08:11:38 +00:00
feat(service_manager): add S6ServiceManager for runtime gateway supervision
Phase 3 of the s6-overlay supervision plan. Implements the runtime-
registration surface from D4 — only the s6 backend supports
register_profile_gateway / unregister_profile_gateway /
list_profile_gateways; host backends continue to raise
NotImplementedError. No caller yet (Phase 4 wires in the profile
create/delete hooks).
Key implementation notes:
- Service directory shape: /run/service/gateway-<profile>/{type,run,log/run}.
Atomic register: write to gateway-<profile>.tmp, fsync via
os.rename. Cleanup on rescan failure.
- Run script uses #!/command/with-contenv sh so HERMES_HOME and any
extra_env arrive at exec time. The hermes -p <profile> gateway
start --foreground --port <port> command is wrapped in
s6-setuidgid hermes for the per-service privilege drop (OQ2-A).
- Log script (OQ8-C): persists via s6-log to
${HERMES_HOME}/logs/gateways/<profile>/. CRITICAL — HERMES_HOME is
a runtime env-var expansion in the rendered script, NOT a Python
f-string substitution. Negative-asserted in
test_s6_register_creates_service_dir_and_triggers_scan so
regressions are caught.
- PATH gotcha: /command/ is only on PATH for processes spawned by
the supervision tree (services, cont-init.d). `docker exec` and
profile-create hooks don't get it. S6ServiceManager calls all
s6-* binaries via absolute path through the new _S6_BIN_DIR
constant so callers don't have to fix up env vars.
- validate_profile_name rejects path-traversal, leading-dash (s6
would parse as a flag), uppercase, whitespace, and names >251
chars (s6-svscan default name_max).
Test coverage:
- 13 new unit tests in tests/hermes_cli/test_service_manager.py
(kind detection, run-script content, env quoting, register
rollback on rescan failure, unregister idempotence, list filter,
lifecycle dispatch, svstat parsing). Total: 36 passing.
- 2 new in-container integration tests in
tests/docker/test_s6_profile_gateway_integration.py validating
end-to-end registration against a real s6 supervision tree.
Docker harness: 14 passed, 2 xfailed (Phase 4 target unchanged).
Refs: docs/plans/2026-05-07-s6-overlay-dynamic-subagent-gateways.md
This commit is contained in:
parent
4826ea7b41
commit
ad5fdab092
3 changed files with 622 additions and 11 deletions
124
tests/docker/test_s6_profile_gateway_integration.py
Normal file
124
tests/docker/test_s6_profile_gateway_integration.py
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
"""Harness: in-container integration tests for S6ServiceManager.
|
||||
|
||||
The unit tests in tests/hermes_cli/test_service_manager.py exercise the
|
||||
class against a tmp-path scandir with a stubbed ``subprocess.run``.
|
||||
These tests run the real class inside a real container against the
|
||||
real s6-svc / s6-svscanctl binaries, validating end-to-end.
|
||||
|
||||
Phase 3 only registers the service slot — it doesn't depend on the
|
||||
gateway actually starting (the binary will refuse to start without a
|
||||
valid profile config). The full register → start → supervised-restart
|
||||
→ unregister cycle is covered by Phase 4 once profile create/delete
|
||||
hooks land.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
|
||||
_REGISTER_SCRIPT = """
|
||||
import sys
|
||||
sys.path.insert(0, "/opt/hermes")
|
||||
from hermes_cli.service_manager import S6ServiceManager
|
||||
S6ServiceManager().register_profile_gateway("phase3test", port=9301)
|
||||
# Don't worry about whether the gateway actually starts — we only care
|
||||
# that the supervision slot was created. The gateway run script will
|
||||
# likely error out (no profile config exists) but that's expected.
|
||||
print("REGISTERED")
|
||||
"""
|
||||
|
||||
_UNREGISTER_SCRIPT = """
|
||||
import sys
|
||||
sys.path.insert(0, "/opt/hermes")
|
||||
from hermes_cli.service_manager import S6ServiceManager
|
||||
S6ServiceManager().unregister_profile_gateway("phase3test")
|
||||
print("UNREGISTERED")
|
||||
"""
|
||||
|
||||
|
||||
def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess:
|
||||
return subprocess.run(
|
||||
["docker", "exec", container, *args],
|
||||
capture_output=True, text=True, timeout=timeout,
|
||||
)
|
||||
|
||||
|
||||
def test_s6_register_creates_service_dir_in_live_container(
|
||||
built_image: str, container_name: str,
|
||||
) -> None:
|
||||
"""S6ServiceManager.register_profile_gateway must create
|
||||
``/run/service/gateway-<profile>/`` and trigger s6-svscan rescan
|
||||
against the real s6 supervision tree."""
|
||||
subprocess.run(
|
||||
["docker", "run", "-d", "--name", container_name, built_image,
|
||||
"sleep", "120"],
|
||||
check=True, capture_output=True, timeout=30,
|
||||
)
|
||||
# Give the supervision tree a moment to come up.
|
||||
time.sleep(3)
|
||||
|
||||
r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
|
||||
assert "REGISTERED" in r.stdout, (
|
||||
f"register failed: stderr={r.stderr!r} stdout={r.stdout!r}"
|
||||
)
|
||||
|
||||
# Service directory exists with the expected structure.
|
||||
r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
|
||||
assert r.returncode == 0, "service directory not created"
|
||||
|
||||
r = _exec(container_name, "test", "-f", "/run/service/gateway-phase3test/run")
|
||||
assert r.returncode == 0, "run script not created"
|
||||
|
||||
r = _exec(container_name, "test", "-f",
|
||||
"/run/service/gateway-phase3test/log/run")
|
||||
assert r.returncode == 0, "log/run script not created"
|
||||
|
||||
# s6-svscan picked it up — s6-svstat works against the dir.
|
||||
# `docker exec` doesn't put /command/ on PATH (only the supervision
|
||||
# tree does), so call s6-svstat by absolute path.
|
||||
r = _exec(container_name, "/command/s6-svstat",
|
||||
"/run/service/gateway-phase3test")
|
||||
assert r.returncode == 0, f"s6-svstat failed: {r.stderr or r.stdout}"
|
||||
|
||||
# list_profile_gateways picks it up.
|
||||
r = _exec(container_name, "python3", "-c", (
|
||||
"from hermes_cli.service_manager import S6ServiceManager;"
|
||||
"print(S6ServiceManager().list_profile_gateways())"
|
||||
))
|
||||
assert "phase3test" in r.stdout, f"list output: {r.stdout!r}"
|
||||
|
||||
|
||||
def test_s6_unregister_removes_service_dir_in_live_container(
|
||||
built_image: str, container_name: str,
|
||||
) -> None:
|
||||
"""unregister_profile_gateway must stop the service, remove the
|
||||
directory, and trigger s6-svscan rescan so the supervise process
|
||||
is dropped."""
|
||||
subprocess.run(
|
||||
["docker", "run", "-d", "--name", container_name, built_image,
|
||||
"sleep", "120"],
|
||||
check=True, capture_output=True, timeout=30,
|
||||
)
|
||||
time.sleep(3)
|
||||
|
||||
# First register so we have something to unregister.
|
||||
r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
|
||||
assert "REGISTERED" in r.stdout
|
||||
|
||||
# Then unregister.
|
||||
r = _exec(container_name, "python3", "-c", _UNREGISTER_SCRIPT, timeout=30)
|
||||
assert "UNREGISTERED" in r.stdout, (
|
||||
f"unregister failed: stderr={r.stderr!r} stdout={r.stdout!r}"
|
||||
)
|
||||
|
||||
# Directory is gone.
|
||||
r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
|
||||
assert r.returncode != 0, "service directory still exists after unregister"
|
||||
|
||||
# list_profile_gateways no longer includes it.
|
||||
r = _exec(container_name, "python3", "-c", (
|
||||
"from hermes_cli.service_manager import S6ServiceManager;"
|
||||
"print(S6ServiceManager().list_profile_gateways())"
|
||||
))
|
||||
assert "phase3test" not in r.stdout
|
||||
Loading…
Add table
Add a link
Reference in a new issue