mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
PR #30136 review surfaced two issues, both rooted in the same audit gap: docker integration tests were running as root, not as the unprivileged `hermes` user (UID 10000) that the runtime actually uses via `s6-setuidgid hermes`. Anything that probed PID-1 state or wrote to the s6 control surface worked as root in the tests but was inert in production.

Fixes:

1. `_s6_running()` previously called `Path("/proc/1/exe").resolve()`, but `/proc/1/exe` is readable only by root. For UID 10000 reading the symlink yields PermissionError, `resolve()` silently returns the unresolved path, and `exe.name == "exe"` — so detection always returned False, the service-manager runtime-registration path was inert, and every `hermes profile create` / `hermes -p X gateway start` silently skipped the s6 hook. Replace with `/proc/1/comm` (world-readable) + `/run/s6/basedir` (s6-overlay-specific) — both required, fail closed.

2. `02-reconcile-profiles` now also chowns `/run/service/.s6-svscan/{control,lock}` to hermes so `s6-svscanctl -a/-an` works without root. Previously the directory chown stopped at `/run/service` and the FIFO inside stayed root-owned, so `register_profile_gateway` from hermes failed at the rescan-trigger step with EACCES — the wrapper in profiles.py caught the exception and only printed a warning, so profile creation appeared to succeed while the slot was rolled back.

Audit changes to flush this class of bug next time:

- Add `docker_exec` / `docker_exec_sh` helpers to `tests/docker/conftest.py` that default to `-u hermes`. The comment block above the helpers explains why and flags `user="root"` as opt-in only for tests that explicitly need root (none currently do).
- Refactor every `docker exec` call in tests/docker/ through the new helpers (test_dashboard.py, test_zombie_reaping.py, test_profile_gateway.py, test_container_restart.py, test_s6_profile_gateway_integration.py).
- Add 5 unit tests covering `_s6_running` under various probe states (both signals present; comm wrong; basedir missing; PermissionError on /proc/1/comm; missing /proc — non-Linux). The PermissionError test is the explicit regression guard for the original bug.

Known follow-up: the per-service `supervise/control` FIFO inside each `/run/service/gateway-<profile>/supervise/` is created root-owned by s6-supervise (which runs as root because s6-svscan is PID 1). `s6-svc -u/-d/-t` from the hermes user will get EACCES on those. The audit under `-u hermes` will reveal this in lifecycle tests — surfacing the issue cleanly so it can be fixed in a focused follow-up (likely via a small SUID helper or a polling chown loop in cont-init.d). The detection + svscanctl fixes here are independent and complete on their own.
139 lines
4.8 KiB
Python
139 lines
4.8 KiB
Python
"""Shared fixtures for docker-image integration tests.
|
|
|
|
Tests in this directory build the image with the current ``Dockerfile``
|
|
and exercise it via ``docker run``. They skip when Docker is unavailable
|
|
(e.g. on developer laptops without a daemon).
|
|
|
|
Override the image with ``HERMES_TEST_IMAGE`` env var to point at a pre-built
|
|
image (faster local iteration); otherwise the ``built_image`` fixture builds
|
|
the repo's Dockerfile once per session.
|
|
|
|
Docker tests need longer timeouts than the suite default (30s), so every
|
|
test under this directory is granted a 180s default via
|
|
``pytest.mark.timeout`` applied at collection time.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
from collections.abc import Iterator
|
|
|
|
import pytest
|
|
|
|
# Tag of the image under test. ``HERMES_TEST_IMAGE`` overrides it; an unset
# *or empty* variable falls back to the locally built default, so the tag
# handed to ``docker build -t`` / ``docker run`` is never the empty string.
IMAGE_TAG = (
    os.environ.get("HERMES_TEST_IMAGE") or "hermes-agent-harness:latest"
)
|
|
|
|
|
|
def _docker_available() -> bool:
|
|
"""Return True iff a docker CLI is on PATH and the daemon answers."""
|
|
if shutil.which("docker") is None:
|
|
return False
|
|
try:
|
|
r = subprocess.run(
|
|
["docker", "info"], capture_output=True, timeout=5,
|
|
)
|
|
return r.returncode == 0
|
|
except (subprocess.TimeoutExpired, OSError):
|
|
return False
|
|
|
|
|
|
def pytest_collection_modifyitems(config, items):  # noqa: D401 - pytest hook
    """Tag every collected docker test with the suite's policy markers.

    Items whose file path contains ``tests/docker/`` receive a 180-second
    ``timeout`` marker; when Docker is not usable they additionally receive
    a ``skip`` marker. Items from any other location are left untouched.
    """
    docker_missing = not _docker_available()
    timeout_marker = pytest.mark.timeout(180)
    skip_marker = pytest.mark.skip(
        reason="Docker not available or daemon not running",
    )

    for test_item in items:
        # Normalise separators so the substring match also works on Windows.
        normalized_path = str(test_item.fspath).replace(os.sep, "/")
        if "tests/docker/" in normalized_path:
            test_item.add_marker(timeout_marker)
            if docker_missing:
                test_item.add_marker(skip_marker)
|
|
|
|
|
|
@pytest.fixture(scope="session")
def built_image() -> str:
    """Provide the tag of the image under test, building it when needed.

    When the ``HERMES_TEST_IMAGE`` environment variable is set to a
    non-empty value, the image is assumed to exist already and no build is
    run (faster local iteration). Otherwise the Dockerfile in the
    repository root (two directories above this file) is built once per
    session, with a 20-minute limit; a failed build aborts with the last
    2000 characters of the build's stderr.
    """
    if not os.environ.get("HERMES_TEST_IMAGE"):
        here = os.path.dirname(__file__)
        repo_root = os.path.abspath(os.path.join(here, "..", ".."))
        build = subprocess.run(
            ["docker", "build", "-t", IMAGE_TAG, repo_root],
            capture_output=True,
            text=True,
            timeout=1200,
        )
        assert build.returncode == 0, (
            f"docker build failed:\n{build.stderr[-2000:]}"
        )
    return IMAGE_TAG
|
|
|
|
|
|
@pytest.fixture
def container_name(request) -> Iterator[str]:
    """Yield a Docker-safe container name for this test, then remove it.

    The name is ``hermes-test-<node name>``, where every character of the
    test's node name other than an ASCII letter, a digit, ``_``, ``.`` or
    ``-`` is replaced by ``_``. Parametrized ids can contain brackets,
    spaces, slashes, colons and similar characters, none of which Docker
    accepts in a container name.

    After the test, ``docker rm -f <name>`` is run with its output captured
    and its exit status ignored, so a container that was never created is
    not an error.
    """
    safe = "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch in "_.-" else "_"
        for ch in request.node.name
    )
    name = f"hermes-test-{safe}"
    yield name
    subprocess.run(
        ["docker", "rm", "-f", name],
        capture_output=True,
        timeout=10,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# docker_exec — default to the unprivileged hermes user
|
|
# ---------------------------------------------------------------------------
|
|
#
|
|
# Background: every Hermes runtime path inside the container drops to UID
|
|
# 10000 (the ``hermes`` user) via ``s6-setuidgid hermes``. ``docker exec``
|
|
# without ``-u`` runs as root, which is **not** representative of how
|
|
# production code executes. PR #30136 review caught a real regression
|
|
# this way — ``Path('/proc/1/exe').resolve()`` works as root and silently
|
|
# fails (PermissionError swallowed) for hermes, so a test that ran as root
|
|
# couldn't catch a feature that was inert for the actual runtime user.
|
|
#
|
|
# Tests in this directory MUST exercise the realistic user context. The
|
|
# helpers below run every probe under ``-u hermes`` unless a specific
|
|
# test explicitly opts into ``user="root"`` (rare — e.g. inspecting
|
|
# /proc/1/exe itself, chowning a volume).
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def docker_exec(
    container: str,
    *args: str,
    user: str = "hermes",
    timeout: int = 30,
    extra_docker_args: tuple[str, ...] = (),
) -> subprocess.CompletedProcess[str]:
    """Execute ``args`` in ``container`` via ``docker exec -u <user>``.

    The invocation is
    ``docker exec -u <user> <extra_docker_args...> <container> <args...>``.
    Output is captured and decoded as text. The exit status is not checked
    here; callers inspect ``returncode`` on the returned
    ``CompletedProcess``.

    ``user`` defaults to ``hermes``. Override it with ``user="root"`` only
    for a test that genuinely requires root (e.g. reading /proc/1/exe or
    changing ownership); everything else should keep the default.
    """
    argv = ["docker", "exec", "-u", user]
    argv.extend(extra_docker_args)
    argv.append(container)
    argv.extend(args)
    return subprocess.run(
        argv,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
|
|
|
|
|
|
def docker_exec_sh(
    container: str,
    command: str,
    *,
    user: str = "hermes",
    timeout: int = 30,
    extra_docker_args: tuple[str, ...] = (),
) -> subprocess.CompletedProcess[str]:
    """Run ``sh -c <command>`` inside ``container`` as ``user``.

    Thin wrapper over ``docker_exec``: ``command`` is passed to ``sh -c`` as
    one string, so pipes, redirects and ``&&`` chains work.

    ``user``, ``timeout`` and ``extra_docker_args`` are forwarded unchanged
    to ``docker_exec``. ``extra_docker_args`` mirrors that function's
    parameter of the same name, so shell probes can also pass
    ``docker exec`` options such as ``-e VAR=value`` or ``-w /path``.
    """
    return docker_exec(
        container,
        "sh",
        "-c",
        command,
        user=user,
        timeout=timeout,
        extra_docker_args=extra_docker_args,
    )
|