hermes-agent/tests/tools/test_dockerfile_pid1_reaping.py
Teknium acd78a457e
fix(docker): reap orphaned subprocesses via tini as PID 1 (#15116)
Install tini in the container image and route ENTRYPOINT through
`/usr/bin/tini -g -- /opt/hermes/docker/entrypoint.sh`.

Without a PID-1 init, orphans reparented to hermes (MCP stdio servers,
git, bun, browser daemons) never get waited() on and accumulate as
zombies. Long-running gateway containers eventually exhaust the PID
table and hit "fork: cannot allocate memory".

tini is the standard container init (same pattern Docker's --init flag
and Kubernetes pause container use). It handles SIGCHLD, reaps orphans,
and forwards SIGTERM/SIGINT to the entrypoint so hermes's existing
graceful-shutdown handlers still run. The -g flag sends signals to the
whole process group so `docker stop` cleanly terminates hermes and its
descendants, not just direct children.

Closes #15012.

E2E-verified with a minimal reproducer image: spawning 5 orphans that
reparent to PID 1 leaves 5 zombies without tini and 0 with tini.
2026-04-24 05:22:34 -07:00

78 lines
3.1 KiB
Python

"""Contract tests for the container Dockerfile.
These tests assert invariants about how the Dockerfile composes its runtime —
they deliberately avoid snapshotting specific package versions, line numbers,
or exact flag choices. What they DO assert is that the Dockerfile maintains
the properties required for correct production behaviour:
- A PID-1 init (tini) is installed and wraps the entrypoint, so that orphaned
subprocesses (MCP stdio servers, git, bun, browser daemons) get reaped
instead of accumulating as zombies (#15012).
- Signal forwarding runs through the init so ``docker stop`` triggers
hermes's own graceful-shutdown path.
"""
from __future__ import annotations
from pathlib import Path
import pytest
REPO_ROOT = Path(__file__).resolve().parents[2]
DOCKERFILE = REPO_ROOT / "Dockerfile"
@pytest.fixture(scope="module")
def dockerfile_text() -> str:
if not DOCKERFILE.exists():
pytest.skip("Dockerfile not present in this checkout")
return DOCKERFILE.read_text()
def test_dockerfile_installs_an_init_for_zombie_reaping(dockerfile_text):
"""Some init (tini, dumb-init, catatonit) must be installed.
Without a PID-1 init that handles SIGCHLD, hermes accumulates zombie
processes from MCP stdio subprocesses, git operations, browser
daemons, etc. In long-running Docker deployments this eventually
exhausts the PID table.
"""
# Accept any of the common reapers. The contract is behavioural:
# something must be installed that reaps orphans.
known_inits = ("tini", "dumb-init", "catatonit")
installed = any(name in dockerfile_text for name in known_inits)
assert installed, (
"No PID-1 init detected in Dockerfile (looked for: "
f"{', '.join(known_inits)}). Without an init process to reap "
"orphaned subprocesses, hermes accumulates zombies in Docker "
"deployments. See issue #15012."
)
def test_dockerfile_entrypoint_routes_through_the_init(dockerfile_text):
"""The ENTRYPOINT must invoke the init, not the entrypoint script directly.
Installing tini is only half the fix — the container must actually run
with tini as PID 1. If the ENTRYPOINT executes the shell script
directly, the shell becomes PID 1 and will ``exec`` into hermes,
which then runs as PID 1 without any zombie reaping.
"""
# Find the last uncommented ENTRYPOINT line — Docker honours the final one.
entrypoint_line = None
for raw_line in dockerfile_text.splitlines():
line = raw_line.strip()
if line.startswith("#"):
continue
if line.startswith("ENTRYPOINT"):
entrypoint_line = line
assert entrypoint_line is not None, "Dockerfile is missing an ENTRYPOINT directive"
known_inits = ("tini", "dumb-init", "catatonit")
routes_through_init = any(name in entrypoint_line for name in known_inits)
assert routes_through_init, (
f"ENTRYPOINT does not route through an init: {entrypoint_line!r}. "
"If tini is only installed but not wired into ENTRYPOINT, hermes "
"still runs as PID 1 and zombies will accumulate (#15012)."
)