mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-01 07:01:41 +00:00
PR #30136 review surfaced two issues, both rooted in the same audit gap: docker integration tests were running as root, not the unprivileged `hermes` user (UID 10000) that the runtime actually uses via `s6-setuidgid hermes`. Anything that probed PID-1 state or wrote to the s6 control surface worked as root in the tests but was inert in production. Fixes: 1. `_s6_running()` previously called `Path("/proc/1/exe").resolve()`, which is root-only readable. For UID 10000 the symlink yields PermissionError, `resolve()` silently returns the unresolved path, and `exe.name == "exe"` — so detection always returned False, the service-manager runtime-registration path was inert, and every `hermes profile create` / `hermes -p X gateway start` silently skipped the s6 hook. Replace with `/proc/1/comm` (world-readable) + `/run/s6/basedir` (s6-overlay-specific) — both required, fail closed. 2. `02-reconcile-profiles` now also chowns `/run/service/.s6-svscan/` {control,lock} to hermes so `s6-svscanctl -a/-an` works without root. Previously the directory chown stopped at `/run/service` and the FIFO inside stayed root-owned, so `register_profile_gateway` from hermes failed at the rescan-trigger step with EACCES — the wrapper in profiles.py caught the exception and printed a swallowed warning, so profile creation appeared to succeed while the slot was rolled back. Audit changes to flush this class of bug next time: - Add `docker_exec` / `docker_exec_sh` helpers to `tests/docker/conftest.py` that default to `-u hermes`. The module docstring explains why and flags `user="root"` as opt-in only for tests that explicitly need root (none currently do). - Refactor every `docker exec` call in tests/docker/ through the new helpers (test_dashboard.py, test_zombie_reaping.py, test_profile_gateway.py, test_container_restart.py, test_s6_profile_gateway_integration.py). 
- Add 5 unit tests covering `_s6_running` under various probe states (both signals present; comm wrong; basedir missing; PermissionError on /proc/1/comm; missing /proc — non-Linux). The PermissionError test is the explicit regression guard for the original bug. Known follow-up: the per-service `supervise/control` FIFO inside each `/run/service/gateway-<profile>/supervise/` is created root-owned by s6-supervise (which runs as root because s6-svscan is PID 1). `s6-svc -u/-d/-t` from the hermes user will get EACCES on those. The audit under `-u hermes` will reveal this in lifecycle tests — surfacing the issue cleanly so it can be fixed in a focused follow-up (likely via a small SUID helper or a polling chown loop in cont-init.d). The detection + svscanctl fixes here are independent and complete on their own.
129 lines
4.9 KiB
Python
129 lines
4.9 KiB
Python
"""Harness: in-container integration tests for S6ServiceManager.
|
|
|
|
The unit tests in tests/hermes_cli/test_service_manager.py exercise the
|
|
class against a tmp-path scandir with a stubbed ``subprocess.run``.
|
|
These tests run the real class inside a real container against the
|
|
real s6-svc / s6-svscanctl binaries, validating end-to-end.
|
|
|
|
Phase 3 only registers the service slot — it doesn't depend on the
|
|
gateway actually starting (the binary will refuse to start without a
|
|
valid profile config). The full register → start → supervised-restart
|
|
→ unregister cycle is covered by Phase 4 once profile create/delete
|
|
hooks land.
|
|
|
|
Every ``docker exec`` here runs as the unprivileged ``hermes`` user
|
|
(via :func:`docker_exec` in conftest); see the conftest module
|
|
docstring. ``/run/service`` is chowned hermes-writable by the
|
|
``02-reconcile-profiles`` cont-init.d script, so register/unregister
|
|
operations work correctly under UID 10000.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import subprocess
|
|
import time
|
|
|
|
from tests.docker.conftest import docker_exec
|
|
|
|
|
|
# Payload handed to ``python3 -c`` inside the container. It registers a
# gateway slot for the throwaway profile "phase3test" on port 9301 and
# prints the REGISTERED sentinel that the tests look for on stdout.
# The literal must stay valid Python line-for-line: it is executed
# verbatim by the in-container interpreter.
_REGISTER_SCRIPT = """
import sys
sys.path.insert(0, "/opt/hermes")
from hermes_cli.service_manager import S6ServiceManager
S6ServiceManager().register_profile_gateway("phase3test", port=9301)
# Don't worry about whether the gateway actually starts — we only care
# that the supervision slot was created. The gateway run script will
# likely error out (no profile config exists) but that's expected.
print("REGISTERED")
"""
|
|
|
|
# Payload handed to ``python3 -c`` inside the container. It unregisters
# the "phase3test" gateway slot and prints the UNREGISTERED sentinel that
# the tests look for on stdout. The literal must stay valid Python
# line-for-line: it is executed verbatim by the in-container interpreter.
_UNREGISTER_SCRIPT = """
import sys
sys.path.insert(0, "/opt/hermes")
from hermes_cli.service_manager import S6ServiceManager
S6ServiceManager().unregister_profile_gateway("phase3test")
print("UNREGISTERED")
"""
|
|
|
|
|
|
def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess:
    """Run a command inside *container* via the conftest ``docker_exec`` helper.

    Thin local alias so the tests below read compactly; all behaviour
    (including which user the command runs as) is decided by
    ``docker_exec`` itself.

    Args:
        container: Name of the container to exec into.
        *args: Command and its arguments, forwarded positionally.
        timeout: Forwarded unchanged as ``docker_exec``'s ``timeout``
            keyword (presumably seconds — confirm against conftest).

    Returns:
        The object returned by ``docker_exec`` (annotated here as a
        ``subprocess.CompletedProcess``).
    """
    completed = docker_exec(container, *args, timeout=timeout)
    return completed
|
|
|
|
|
|
def test_s6_register_creates_service_dir_in_live_container(
    built_image: str, container_name: str,
) -> None:
    """S6ServiceManager.register_profile_gateway must create
    ``/run/service/gateway-<profile>/`` and trigger s6-svscan rescan
    against the real s6 supervision tree.

    Steps: start a detached container from ``built_image``, run the
    register payload, then check that the service directory, its ``run``
    and ``log/run`` files exist, that ``s6-svstat`` succeeds against the
    directory, and that ``list_profile_gateways`` reports the profile.

    Args:
        built_image: Image reference to start the container from
            (pytest fixture).
        container_name: Name to give the container (pytest fixture;
            cleanup is presumably handled by the fixture — confirm in
            conftest).
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "120"],
        check=True, capture_output=True, timeout=30,
    )
    # Give the supervision tree a moment to come up.
    time.sleep(3)

    r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
    assert "REGISTERED" in r.stdout, (
        f"register failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Service directory exists with the expected structure.
    r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
    assert r.returncode == 0, "service directory not created"

    r = _exec(container_name, "test", "-f", "/run/service/gateway-phase3test/run")
    assert r.returncode == 0, "run script not created"

    r = _exec(container_name, "test", "-f",
              "/run/service/gateway-phase3test/log/run")
    assert r.returncode == 0, "log/run script not created"

    # s6-svscan picked it up — s6-svstat works against the dir.
    # `docker exec` doesn't put /command/ on PATH (only the supervision
    # tree does), so call s6-svstat by absolute path.
    r = _exec(container_name, "/command/s6-svstat",
              "/run/service/gateway-phase3test")
    assert r.returncode == 0, f"s6-svstat failed: {r.stderr or r.stdout}"

    # list_profile_gateways picks it up.
    r = _exec(container_name, "python3", "-c", (
        "from hermes_cli.service_manager import S6ServiceManager;"
        "print(S6ServiceManager().list_profile_gateways())"
    ))
    # Include stderr so a traceback from the one-liner is visible in the
    # failure report instead of just an empty stdout.
    assert "phase3test" in r.stdout, (
        f"list output: {r.stdout!r} stderr={r.stderr!r}"
    )
|
|
|
|
|
|
def test_s6_unregister_removes_service_dir_in_live_container(
    built_image: str, container_name: str,
) -> None:
    """unregister_profile_gateway must stop the service, remove the
    directory, and trigger s6-svscan rescan so the supervise process
    is dropped.

    What this test actually observes: after a register followed by an
    unregister, ``/run/service/gateway-phase3test`` no longer exists and
    ``list_profile_gateways`` no longer mentions the profile.

    Args:
        built_image: Image reference to start the container from
            (pytest fixture).
        container_name: Name to give the container (pytest fixture;
            cleanup is presumably handled by the fixture — confirm in
            conftest).
    """
    subprocess.run(
        ["docker", "run", "-d", "--name", container_name, built_image,
         "sleep", "120"],
        check=True, capture_output=True, timeout=30,
    )
    # Give the supervision tree a moment to come up.
    time.sleep(3)

    # First register so we have something to unregister.
    r = _exec(container_name, "python3", "-c", _REGISTER_SCRIPT, timeout=30)
    assert "REGISTERED" in r.stdout, (
        f"register (precondition) failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Then unregister.
    r = _exec(container_name, "python3", "-c", _UNREGISTER_SCRIPT, timeout=30)
    assert "UNREGISTERED" in r.stdout, (
        f"unregister failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )

    # Directory is gone.
    r = _exec(container_name, "test", "-d", "/run/service/gateway-phase3test")
    assert r.returncode != 0, "service directory still exists after unregister"

    # list_profile_gateways no longer includes it.
    r = _exec(container_name, "python3", "-c", (
        "from hermes_cli.service_manager import S6ServiceManager;"
        "print(S6ServiceManager().list_profile_gateways())"
    ))
    # Guard the negative check: if the one-liner crashed, stdout is empty
    # and "not in" would pass without having verified anything.
    assert r.returncode == 0, (
        f"list command failed: stderr={r.stderr!r} stdout={r.stdout!r}"
    )
    assert "phase3test" not in r.stdout, f"list output: {r.stdout!r}"
|