hermes-agent/tests/docker/test_gateway_bootstrap_state.py

314 lines
No EOL
11 KiB
Python

"""Runtime smoke tests for Docker gateway_state.json bootstrap seeding.
Build the real image and verify the actual runtime behavior:
1. HERMES_GATEWAY_BOOTSTRAP_STATE=running on a fresh volume seeds
   gateway_state.json with running state
2. An existing gateway_state.json is never clobbered (first-boot-only)
3. No env var = no seed (default down-on-first-boot preserved)
4. Only literal "running" is honored; other values are ignored
5. Symlinked gateway_state.json / auth.json are never written through
   (path_has_symlink_component guard)
"""
from __future__ import annotations
import json
import subprocess
import tempfile
from pathlib import Path
from tests.docker.conftest import docker_exec_sh, wait_for_container_ready
def _start_container(
    built_image: str, name: str, *env: str,
) -> str:
    """Launch a detached ``sleep infinity`` container and wait until it is ready.

    Args:
        built_image: Image reference handed to ``docker run``.
        name: Value for ``--name``; also the return value.
        *env: ``KEY=VALUE`` strings, each forwarded as its own ``-e`` flag.

    Returns:
        ``name``, unchanged, so the call can be used inline.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        subprocess.TimeoutExpired: ``docker run`` exceeded 60 seconds.
    """
    # Flatten ("A=1", "B=2") into ["-e", "A=1", "-e", "B=2"].
    env_flags = [token for pair in env for token in ("-e", pair)]
    command = [
        "docker", "run", "-d", "--name", name,
        *env_flags,
        built_image, "sleep", "infinity",
    ]
    subprocess.run(command, check=True, capture_output=True, timeout=60)
    # The readiness criterion is defined by the helper in tests.docker.conftest.
    wait_for_container_ready(name)
    return name
def test_seeds_running_state_on_blank_volume(
    built_image: str, container_name: str,
) -> None:
    """Booting with HERMES_GATEWAY_BOOTSTRAP_STATE=running and no
    pre-populated data must leave a gateway_state.json whose
    ``gateway_state`` field is ``"running"``."""
    _start_container(
        built_image, container_name,
        "HERMES_GATEWAY_BOOTSTRAP_STATE=running",
    )

    # A missing file is mapped to the sentinel NONE so the failure message
    # is explicit rather than a JSON decode error.
    result = docker_exec_sh(
        container_name,
        "cat /opt/data/gateway_state.json 2>/dev/null || echo NONE",
        timeout=10,
    )
    payload = result.stdout.strip()
    assert payload != "NONE", (
        f"gateway_state.json not seeded on fresh volume: {result.stdout}"
    )

    state = json.loads(payload)
    assert state.get("gateway_state") == "running", (
        f"expected gateway_state=running, got: {state}"
    )
def test_does_not_clobber_existing_state(
    built_image: str, container_name: str,
) -> None:
    """An existing gateway_state.json must never be overwritten by the
    seed, even when the bootstrap env var says running.

    We use a named volume so we can pre-create the state file before
    the container boots. The [ ! -f ] guard in stage2 must skip seeding
    because the file already exists. We check the file immediately after
    boot — before the gateway service has a chance to write its own
    state — by reading it as fast as possible after container start.

    The container and the named volume are removed in a ``finally`` block
    so that a failing step or assertion does not leak Docker resources.
    """
    volume = f"{container_name}-vol"
    subprocess.run(
        ["docker", "volume", "create", volume],
        check=True, capture_output=True, timeout=10,
    )
    try:
        # Pre-create the state file via a throwaway container
        existing = json.dumps({"gateway_state": "stopped", "pid": 123})
        subprocess.run(
            ["docker", "run", "--rm", "-v", f"{volume}:/opt/data",
             "--entrypoint", "sh", built_image,
             "-c", f"printf '{existing}\\n' > /opt/data/gateway_state.json"],
            check=True, capture_output=True, timeout=30,
        )

        # Boot with the env var set — stage2 must NOT clobber the existing file
        subprocess.run(
            ["docker", "run", "-d", "--name", container_name,
             "-v", f"{volume}:/opt/data",
             "-e", "HERMES_GATEWAY_BOOTSTRAP_STATE=running",
             built_image, "sleep", "infinity"],
            check=True, capture_output=True, timeout=60,
        )

        # Read the file as quickly as possible — the gateway service may
        # start and write its own state, but the stage2 [ ! -f ] guard runs
        # during cont-init (before any service starts), so the file must
        # still be our "stopped" state at this point.
        wait_for_container_ready(container_name)
        r = docker_exec_sh(
            container_name, "cat /opt/data/gateway_state.json", timeout=10,
        )
        state = json.loads(r.stdout.strip())
        assert state.get("gateway_state") == "stopped", (
            f"existing state was clobbered by bootstrap seed: {state}"
        )
    finally:
        # Best-effort cleanup: exit codes are deliberately ignored. The
        # container must go first because it holds the volume; the nested
        # finally still attempts the volume removal if the first call raises.
        try:
            subprocess.run(
                ["docker", "rm", "-f", container_name],
                capture_output=True, timeout=10,
            )
        finally:
            subprocess.run(
                ["docker", "volume", "rm", "-f", volume],
                capture_output=True, timeout=10,
            )
def test_no_seed_when_env_unset(
    built_image: str, container_name: str,
) -> None:
    """Without HERMES_GATEWAY_BOOTSTRAP_STATE, no gateway_state.json may
    appear under /opt/data."""
    _start_container(built_image, container_name)

    # The shell prints exactly one of the two sentinels.
    probe = (
        "test -f /opt/data/gateway_state.json && "
        "echo EXISTS || echo ABSENT"
    )
    result = docker_exec_sh(container_name, probe, timeout=10)

    assert "ABSENT" in result.stdout, (
        f"gateway_state.json was seeded without the env var: {result.stdout}"
    )
def test_non_running_value_ignored(
    built_image: str, container_name: str,
) -> None:
    """Only literal 'running' is honored; any other value is ignored.

    Each bogus value gets its own container, named
    ``<container_name>-<value>``. The container is removed in a ``finally``
    block so that a failed start or assertion does not leave it behind.
    """
    for bogus in ("stopped", "Running", "1", "true", "starting"):
        # Need a fresh container per iteration
        name = f"{container_name}-{bogus}"
        try:
            _start_container(
                built_image, name,
                f"HERMES_GATEWAY_BOOTSTRAP_STATE={bogus}",
            )
            r = docker_exec_sh(
                name,
                "test -f /opt/data/gateway_state.json && "
                "echo EXISTS || echo ABSENT",
                timeout=10,
            )
            assert "ABSENT" in r.stdout, (
                f"bogus value {bogus!r} should not seed a state file: {r.stdout}"
            )
        finally:
            # Best-effort: the exit code is ignored, so this is harmless
            # when the container was never created.
            subprocess.run(
                ["docker", "rm", "-f", name],
                capture_output=True, timeout=10,
            )
def _boot_with_bind_mount(
    built_image: str, name: str, host_dir: Path, *env: str,
) -> None:
    """Start a detached container with ``host_dir`` mounted at /opt/data.

    Args:
        built_image: Image reference handed to ``docker run``.
        name: Value for ``--name``.
        host_dir: Host directory bind-mounted to ``/opt/data``.
        *env: ``KEY=VALUE`` strings, each forwarded as its own ``-e`` flag.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        subprocess.TimeoutExpired: ``docker run`` exceeded 60 seconds.
    """
    env_flags = [token for pair in env for token in ("-e", pair)]
    command = [
        "docker", "run", "-d", "--name", name,
        "-v", f"{host_dir}:/opt/data",
        *env_flags,
        built_image, "sleep", "infinity",
    ]
    subprocess.run(command, check=True, capture_output=True, timeout=60)
    wait_for_container_ready(name)
def _cleanup_bind_mount(built_image: str, container_name: str, host_dir: Path) -> None:
    """Force-remove the container, then empty ``host_dir`` from inside Docker.

    Per the original author's note, the stage2 hook chowns /opt/data (and
    its contents) to UID 10000 (hermes), which the host test user cannot
    delete. A throwaway container therefore chowns the tree to 0:0 and
    removes its contents before the caller deletes the temp directory.

    Both commands are best-effort: neither uses ``check=True``, so non-zero
    exit codes are ignored.
    """
    subprocess.run(
        ["docker", "rm", "-f", container_name],
        capture_output=True, timeout=10,
    )

    # Errors are silenced and the script ends in `true`, so the shell itself
    # always exits 0.
    wipe_script = (
        "chown -R 0:0 /clean 2>/dev/null; "
        "rm -rf /clean/* /clean/.* 2>/dev/null; "
        "chown 0:0 /clean; true"
    )
    wipe_command = [
        "docker", "run", "--rm",
        "-v", f"{host_dir}:/clean",
        "--entrypoint", "sh", built_image,
        "-c", wipe_script,
    ]
    subprocess.run(wipe_command, capture_output=True, timeout=15)
def test_does_not_seed_gateway_state_through_symlink(
    built_image: str, container_name: str,
) -> None:
    """A symlinked gateway_state.json must not become a host write.

    The path_has_symlink_component guard in stage2-hook.sh must detect
    the symlink and refuse to seed, printing a warning instead. The
    symlink target file (outside /opt/data) must NOT be created.

    What this test actually checks: the symlink is still a symlink after
    boot, and a refusal message naming gateway_state.json appears in
    ``docker logs``. It does not inspect the target path itself.
    """
    tmp_path = Path(tempfile.mkdtemp())
    host_data = tmp_path / "data"
    try:
        host_data.mkdir()

        # Pre-create the symlink as root via a throwaway container
        subprocess.run(
            ["docker", "run", "--rm",
             "-v", f"{host_data}:/opt/data",
             "--entrypoint", "sh", built_image,
             "-c", "ln -s /tmp/outside-gateway-state.json /opt/data/gateway_state.json"],
            check=True, capture_output=True, timeout=30,
        )

        _boot_with_bind_mount(
            built_image, container_name, host_data,
            "HERMES_GATEWAY_BOOTSTRAP_STATE=running",
        )

        # The symlink itself must still exist (not replaced by a file).
        # Compare exactly: the failure sentinel "NOT_SYMLINK" contains
        # "SYMLINK", so a substring check could never fail.
        r = docker_exec_sh(
            container_name,
            "test -L /opt/data/gateway_state.json && echo SYMLINK || echo NOT_SYMLINK",
            timeout=5,
        )
        assert r.stdout.strip() == "SYMLINK", (
            f"gateway_state.json symlink was replaced by a regular file: {r.stdout}"
        )

        # The refusal warning goes to stdout (docker logs), not
        # container-boot.log (which is written by container_boot.py).
        # NOTE(review): `test -L` stays true after a write *through* the
        # symlink, so this log check is the only one that detects that case.
        r = subprocess.run(
            ["docker", "logs", container_name],
            capture_output=True, text=True, timeout=10,
        )
        combined = r.stdout + r.stderr
        assert "refusing" in combined and "gateway_state.json" in combined, (
            f"expected symlink refusal warning in docker logs, got: {combined}"
        )
    finally:
        _cleanup_bind_mount(built_image, container_name, host_data)
        # Best-effort removal of the now-empty temp dirs; if the in-container
        # wipe left files behind, rmdir fails and the directories are kept.
        try:
            host_data.rmdir()
            tmp_path.rmdir()
        except OSError:
            pass
def test_does_not_seed_auth_json_through_symlink(
    built_image: str, container_name: str,
) -> None:
    """A symlinked auth.json must not become a host write.

    Same guard as gateway_state.json — the auth.json seed must also
    respect path_has_symlink_component and refuse to write through
    the symlink.

    What this test actually checks: the symlink is still a symlink after
    boot, and a refusal message naming auth.json appears in
    ``docker logs``. It does not inspect the target path itself.
    """
    tmp_path = Path(tempfile.mkdtemp())
    host_data = tmp_path / "data"
    try:
        host_data.mkdir()

        # Pre-create the symlink as root via a throwaway container
        subprocess.run(
            ["docker", "run", "--rm",
             "-v", f"{host_data}:/opt/data",
             "--entrypoint", "sh", built_image,
             "-c", "ln -s /tmp/outside-auth.json /opt/data/auth.json"],
            check=True, capture_output=True, timeout=30,
        )

        _boot_with_bind_mount(
            built_image, container_name, host_data,
            'HERMES_AUTH_JSON_BOOTSTRAP={"api_key":"test"}',
        )

        # The symlink must still exist. Compare exactly: the failure
        # sentinel "NOT_SYMLINK" contains "SYMLINK", so a substring check
        # could never fail.
        r = docker_exec_sh(
            container_name,
            "test -L /opt/data/auth.json && echo SYMLINK || echo NOT_SYMLINK",
            timeout=5,
        )
        assert r.stdout.strip() == "SYMLINK", (
            f"auth.json symlink was replaced by a regular file: {r.stdout}"
        )

        # The refusal warning goes to stdout (docker logs), not
        # container-boot.log (which is written by container_boot.py).
        # NOTE(review): `test -L` stays true after a write *through* the
        # symlink, so this log check is the only one that detects that case.
        r = subprocess.run(
            ["docker", "logs", container_name],
            capture_output=True, text=True, timeout=10,
        )
        combined = r.stdout + r.stderr
        assert "refusing" in combined and "auth.json" in combined, (
            f"expected symlink refusal warning for auth.json in docker logs: {combined}"
        )
    finally:
        _cleanup_bind_mount(built_image, container_name, host_data)
        # Best-effort removal of the now-empty temp dirs; if the in-container
        # wipe left files behind, rmdir fails and the directories are kept.
        try:
            host_data.rmdir()
            tmp_path.rmdir()
        except OSError:
            pass