From acd78a457e4b6c0c09b8feccc41b76b61d006460 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 24 Apr 2026 05:22:34 -0700 Subject: [PATCH] fix(docker): reap orphaned subprocesses via tini as PID 1 (#15116) Install tini in the container image and route ENTRYPOINT through `/usr/bin/tini -g -- /opt/hermes/docker/entrypoint.sh`. Without a PID-1 init, orphans reparented to hermes (MCP stdio servers, git, bun, browser daemons) never get waited() on and accumulate as zombies. Long-running gateway containers eventually exhaust the PID table and hit "fork: cannot allocate memory". tini is the standard container init (same pattern Docker's --init flag and Kubernetes pause container use). It handles SIGCHLD, reaps orphans, and forwards SIGTERM/SIGINT to the entrypoint so hermes's existing graceful-shutdown handlers still run. The -g flag sends signals to the whole process group so `docker stop` cleanly terminates hermes and its descendants, not just direct children. Closes #15012. E2E-verified with a minimal reproducer image: spawning 5 orphans that reparent to PID 1 leaves 5 zombies without tini and 0 with tini. --- Dockerfile | 6 +- tests/tools/test_dockerfile_pid1_reaping.py | 78 +++++++++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 tests/tools/test_dockerfile_pid1_reaping.py diff --git a/Dockerfile b/Dockerfile index fd5f18b4b..4ab1d3804 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,9 +10,11 @@ ENV PYTHONUNBUFFERED=1 ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright # Install system dependencies in one layer, clear APT cache +# tini reaps orphaned zombie processes (MCP stdio subprocesses, git, bun, etc.) +# that would otherwise accumulate when hermes runs as PID 1. See #15012. RUN apt-get update && \ apt-get install -y --no-install-recommends \ - build-essential nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli && \ + build-essential nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli tini && \ rm -rf /var/lib/apt/lists/* # Non-root user for runtime; UID can be overridden via HERMES_UID at runtime @@ -58,4 +60,4 @@ ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist ENV HERMES_HOME=/opt/data ENV PATH="/opt/data/.local/bin:${PATH}" VOLUME [ "/opt/data" ] -ENTRYPOINT [ "/opt/hermes/docker/entrypoint.sh" ] +ENTRYPOINT [ "/usr/bin/tini", "-g", "--", "/opt/hermes/docker/entrypoint.sh" ] diff --git a/tests/tools/test_dockerfile_pid1_reaping.py b/tests/tools/test_dockerfile_pid1_reaping.py new file mode 100644 index 000000000..55bd5e069 --- /dev/null +++ b/tests/tools/test_dockerfile_pid1_reaping.py @@ -0,0 +1,78 @@ +"""Contract tests for the container Dockerfile. + +These tests assert invariants about how the Dockerfile composes its runtime — +they deliberately avoid snapshotting specific package versions, line numbers, +or exact flag choices. What they DO assert is that the Dockerfile maintains +the properties required for correct production behaviour: + +- A PID-1 init (tini) is installed and wraps the entrypoint, so that orphaned + subprocesses (MCP stdio servers, git, bun, browser daemons) get reaped + instead of accumulating as zombies (#15012). +- Signal forwarding runs through the init so ``docker stop`` triggers + hermes's own graceful-shutdown path. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + + +REPO_ROOT = Path(__file__).resolve().parents[2] +DOCKERFILE = REPO_ROOT / "Dockerfile" + + +@pytest.fixture(scope="module") +def dockerfile_text() -> str: + if not DOCKERFILE.exists(): + pytest.skip("Dockerfile not present in this checkout") + return DOCKERFILE.read_text() + + +def test_dockerfile_installs_an_init_for_zombie_reaping(dockerfile_text): + """Some init (tini, dumb-init, catatonit) must be installed. + + Without a PID-1 init that handles SIGCHLD, hermes accumulates zombie + processes from MCP stdio subprocesses, git operations, browser + daemons, etc. In long-running Docker deployments this eventually + exhausts the PID table. + """ + # Accept any of the common reapers. The contract is behavioural: + # something must be installed that reaps orphans. + known_inits = ("tini", "dumb-init", "catatonit") + installed = any(name in dockerfile_text for name in known_inits) + assert installed, ( + "No PID-1 init detected in Dockerfile (looked for: " + f"{', '.join(known_inits)}). Without an init process to reap " + "orphaned subprocesses, hermes accumulates zombies in Docker " + "deployments. See issue #15012." + ) + + +def test_dockerfile_entrypoint_routes_through_the_init(dockerfile_text): + """The ENTRYPOINT must invoke the init, not the entrypoint script directly. + + Installing tini is only half the fix — the container must actually run + with tini as PID 1. If the ENTRYPOINT executes the shell script + directly, the shell becomes PID 1 and will ``exec`` into hermes, + which then runs as PID 1 without any zombie reaping. + """ + # Find the last uncommented ENTRYPOINT line — Docker honours the final one. + entrypoint_line = None + for raw_line in dockerfile_text.splitlines(): + line = raw_line.strip() + if line.startswith("#"): + continue + if line.startswith("ENTRYPOINT"): + entrypoint_line = line + + assert entrypoint_line is not None, "Dockerfile is missing an ENTRYPOINT directive" + + known_inits = ("tini", "dumb-init", "catatonit") + routes_through_init = any(name in entrypoint_line for name in known_inits) + assert routes_through_init, ( + f"ENTRYPOINT does not route through an init: {entrypoint_line!r}. " + "If tini is only installed but not wired into ENTRYPOINT, hermes " + "still runs as PID 1 and zombies will accumulate (#15012)." + )