mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
PR #30136 review surfaced two issues, both rooted in the same audit gap: docker integration tests were running as root, not the unprivileged `hermes` user (UID 10000) that the runtime actually uses via `s6-setuidgid hermes`. Anything that probed PID-1 state or wrote to the s6 control surface worked as root in the tests but was inert in production. Fixes: 1. `_s6_running()` previously called `Path("/proc/1/exe").resolve()`, which is root-only readable. For UID 10000 the symlink yields PermissionError, `resolve()` silently returns the unresolved path, and `exe.name == "exe"` — so detection always returned False, the service-manager runtime-registration path was inert, and every `hermes profile create` / `hermes -p X gateway start` silently skipped the s6 hook. Replace with `/proc/1/comm` (world-readable) + `/run/s6/basedir` (s6-overlay-specific) — both required, fail closed. 2. `02-reconcile-profiles` now also chowns `/run/service/.s6-svscan/` {control,lock} to hermes so `s6-svscanctl -a/-an` works without root. Previously the directory chown stopped at `/run/service` and the FIFO inside stayed root-owned, so `register_profile_gateway` from hermes failed at the rescan-trigger step with EACCES — the wrapper in profiles.py caught the exception and printed a swallowed warning, so profile creation appeared to succeed while the slot was rolled back. Audit changes to flush this class of bug next time: - Add `docker_exec` / `docker_exec_sh` helpers to `tests/docker/conftest.py` that default to `-u hermes`. The module docstring explains why and flags `user="root"` as opt-in only for tests that explicitly need root (none currently do). - Refactor every `docker exec` call in tests/docker/ through the new helpers (test_dashboard.py, test_zombie_reaping.py, test_profile_gateway.py, test_container_restart.py, test_s6_profile_gateway_integration.py). - Add 5 unit tests covering `_s6_running` under various probe states (both signals present; comm wrong; basedir missing; PermissionError on /proc/1/comm; missing /proc — non-Linux). The PermissionError test is the explicit regression guard for the original bug. Known follow-up: the per-service `supervise/control` FIFO inside each `/run/service/gateway-<profile>/supervise/` is created root-owned by s6-supervise (which runs as root because s6-svscan is PID 1). `s6-svc -u/-d/-t` from the hermes user will get EACCES on those. The audit under `-u hermes` will reveal this in lifecycle tests — surfacing the issue cleanly so it can be fixed in a focused follow-up (likely via a small SUID helper or a polling chown loop in cont-init.d). The detection + svscanctl fixes here are independent and complete on their own.
174 lines
6.8 KiB
Python
174 lines
6.8 KiB
Python
"""Container-restart survives per-profile gateway registrations.
|
|
|
|
The s6 dynamic scandir at /run/service/ lives on tmpfs and is wiped
|
|
on every container restart. Phase 4 Task 4.0's container_boot module
|
|
+ cont-init.d/02-reconcile-profiles regenerate the service slots from
|
|
$HERMES_HOME/profiles/<name>/gateway_state.json on every boot and
|
|
auto-start only those whose last state was `running`.
|
|
|
|
These tests stand up a container with a named volume, create profiles
|
|
inside it in various gateway states, restart the container, and
|
|
assert the reconciler did the right thing.
|
|
|
|
Every ``docker exec`` here runs as the unprivileged ``hermes`` user
|
|
(via :func:`docker_exec` / :func:`docker_exec_sh` in conftest); see
|
|
the conftest module docstring.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import subprocess
|
|
import time
|
|
|
|
import pytest
|
|
|
|
from tests.docker.conftest import docker_exec, docker_exec_sh
|
|
|
|
|
|
def _docker(*args: str, **kw) -> subprocess.CompletedProcess[str]:
|
|
return subprocess.run(
|
|
["docker", *args],
|
|
capture_output=True, text=True, timeout=kw.pop("timeout", 60),
|
|
**kw,
|
|
)
|
|
|
|
|
|
def _exec(container: str, *args: str, timeout: int = 30) -> subprocess.CompletedProcess[str]:
|
|
return docker_exec(container, *args, timeout=timeout)
|
|
|
|
|
|
def _sh(container: str, cmd: str, timeout: int = 30) -> subprocess.CompletedProcess[str]:
|
|
return docker_exec_sh(container, cmd, timeout=timeout)
|
|
|
|
|
|
@pytest.fixture
|
|
def restart_container(request, built_image: str):
|
|
"""A long-running container with a named volume so docker restart
|
|
preserves $HERMES_HOME/profiles/."""
|
|
safe = request.node.name.replace("[", "_").replace("]", "_")
|
|
name = f"hermes-restart-{safe}"
|
|
volume = f"hermes-restart-vol-{safe}"
|
|
_docker("rm", "-f", name)
|
|
_docker("volume", "rm", "-f", volume)
|
|
_docker("volume", "create", volume, timeout=10).check_returncode()
|
|
r = _docker(
|
|
"run", "-d", "--name", name,
|
|
"-v", f"{volume}:/opt/data",
|
|
built_image, "sleep", "infinity",
|
|
timeout=30,
|
|
)
|
|
r.check_returncode()
|
|
# Give s6 + stage2 + 02-reconcile a moment to come up cleanly on
|
|
# the fresh volume.
|
|
time.sleep(5)
|
|
yield name
|
|
_docker("rm", "-f", name)
|
|
_docker("volume", "rm", "-f", volume)
|
|
|
|
|
|
def test_running_gateway_survives_container_restart(restart_container: str) -> None:
|
|
container = restart_container
|
|
|
|
# Create the profile + start its gateway. The Phase 4 hooks
|
|
# register the s6 service slot during create and the dispatch
|
|
# path brings it up via s6-svc -u.
|
|
r = _exec(container, "hermes", "profile", "create", "coder")
|
|
assert r.returncode == 0, f"profile create failed: {r.stderr}"
|
|
|
|
r = _exec(container, "hermes", "-p", "coder", "gateway", "start", timeout=60)
|
|
assert r.returncode == 0, f"gateway start failed: {r.stderr}"
|
|
|
|
# Give the service time to actually come up under supervision.
|
|
deadline = time.monotonic() + 15.0
|
|
while time.monotonic() < deadline:
|
|
r = _sh(container, "/command/s6-svstat /run/service/gateway-coder")
|
|
if r.returncode == 0 and "up " in r.stdout:
|
|
break
|
|
time.sleep(0.5)
|
|
assert "up " in r.stdout, f"gateway never came up pre-restart: {r.stdout!r}"
|
|
|
|
# Persist state so the reconciler will treat the slot as 'running'
|
|
# post-restart. The gateway process itself writes gateway_state.json
|
|
# via gateway/status.py — but we don't want to wait for or assert
|
|
# against the live process here; just stamp the file directly to
|
|
# exercise the reconciler's contract.
|
|
write_state = (
|
|
"import json, pathlib; "
|
|
"p = pathlib.Path('/opt/data/profiles/coder/gateway_state.json'); "
|
|
"p.write_text(json.dumps({'gateway_state': 'running', 'timestamp': 1}))"
|
|
)
|
|
_exec(container, "python3", "-c", write_state, timeout=10).check_returncode()
|
|
|
|
# Restart. After this, /run/service/ is empty until cont-init.d
|
|
# runs the reconciler.
|
|
_docker("restart", container, timeout=60).check_returncode()
|
|
time.sleep(8) # stage2 + reconcile + svscan rescan
|
|
|
|
# Reconciler logged the action.
|
|
r = _sh(container, "cat /opt/data/logs/container-boot.log")
|
|
assert r.returncode == 0, f"reconcile log missing: {r.stderr}"
|
|
assert "profile=coder" in r.stdout
|
|
assert "action=started" in r.stdout
|
|
|
|
# Service slot exists.
|
|
r = _sh(container, "test -d /run/service/gateway-coder")
|
|
assert r.returncode == 0, "slot not recreated after restart"
|
|
|
|
# No `down` marker — we asked for auto-start.
|
|
r = _sh(container, "test -f /run/service/gateway-coder/down")
|
|
assert r.returncode != 0, "down marker present despite prior_state=running"
|
|
|
|
|
|
def test_stopped_gateway_stays_stopped_after_restart(restart_container: str) -> None:
|
|
container = restart_container
|
|
|
|
_exec(container, "hermes", "profile", "create", "writer").check_returncode()
|
|
|
|
# Write 'stopped' directly so we don't have to race against the
|
|
# gateway's own state writes.
|
|
write_state = (
|
|
"import json, pathlib; "
|
|
"p = pathlib.Path('/opt/data/profiles/writer/gateway_state.json'); "
|
|
"p.write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1}))"
|
|
)
|
|
_exec(container, "python3", "-c", write_state, timeout=10).check_returncode()
|
|
|
|
_docker("restart", container, timeout=60).check_returncode()
|
|
time.sleep(8)
|
|
|
|
# Slot exists.
|
|
r = _sh(container, "test -d /run/service/gateway-writer")
|
|
assert r.returncode == 0
|
|
|
|
# Down marker present.
|
|
r = _sh(container, "test -f /run/service/gateway-writer/down")
|
|
assert r.returncode == 0, "down marker missing despite prior_state=stopped"
|
|
|
|
|
|
def test_stale_gateway_pid_cleaned_up_on_restart(restart_container: str) -> None:
|
|
"""A dead container's gateway.pid + processes.json must NOT
|
|
survive the restart — a numerically-equal live PID in the new
|
|
container is a different process and would confuse the gateway
|
|
process-mismatch checks."""
|
|
container = restart_container
|
|
|
|
_exec(container, "hermes", "profile", "create", "ghost").check_returncode()
|
|
|
|
# Stamp stale runtime files alongside a 'running' state so the
|
|
# reconciler walks this profile.
|
|
stamp = (
|
|
"import json, pathlib; "
|
|
"p = pathlib.Path('/opt/data/profiles/ghost'); "
|
|
"(p / 'gateway_state.json').write_text(json.dumps({'gateway_state': 'stopped', 'timestamp': 1})); "
|
|
"(p / 'gateway.pid').write_text(json.dumps({'pid': 99999, 'host': 'old'})); "
|
|
"(p / 'processes.json').write_text('[]')"
|
|
)
|
|
_exec(container, "python3", "-c", stamp, timeout=10).check_returncode()
|
|
|
|
_docker("restart", container, timeout=60).check_returncode()
|
|
time.sleep(8)
|
|
|
|
# Stale runtime files swept.
|
|
r = _sh(container, "test -f /opt/data/profiles/ghost/gateway.pid")
|
|
assert r.returncode != 0, "stale gateway.pid survived restart"
|
|
r = _sh(container, "test -f /opt/data/profiles/ghost/processes.json")
|
|
assert r.returncode != 0, "stale processes.json survived restart"
|