From fcf6cb3d7304c644c3f52031598d0441ce1ff270 Mon Sep 17 00:00:00 2001 From: Ben Barclay Date: Thu, 18 Jun 2026 10:49:02 +1000 Subject: [PATCH] fix(docker): supervised gateway uses --replace to take over stale holder (NS-505) (#47555) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(docker): supervised gateway uses --replace to take over stale holder Inside the s6 container image the per-profile gateway service rendered a bare `hermes gateway run` (no --replace). When a gateway is started OUTSIDE s6 — a stray shell `hermes gateway run`, an agent action, or the Open WebUI helper (scripts/setup_open_webui.sh) — it grabs the per-HERMES_HOME PID lock first. The supervised slot then execs the bare `gateway run`, hits the "Another gateway instance is already running" guard, exits non-zero, and s6 restarts it: a restart loop that floods the log every ~12s and never binds. The container looks up but the gateway is permanently down, and dashboard-only users (no shell) cannot recover. Render the supervised run script as `gateway run --replace` so s6 is authoritative for its slot: it reaps the stale holder via the hardened takeover path (takeover marker + SIGTERM->SIGKILL-with-confirmation + scoped-lock cleanup in gateway/run.py) and binds. This matches the systemd service path, which already builds its argv with --replace (_build_gateway_argv / 'nohup hermes gateway run --replace'), and the intent already documented in _maybe_redirect_run_to_s6_supervision. The existing HERMES_S6_SUPERVISED_CHILD sentinel still prevents the run->start->run redirect recursion. Each profile is scoped to its own HERMES_HOME and s6 guarantees one supervised instance per slot, so there is no legitimate supervised sibling for --replace to clobber. Reported via beta (NS-505): gateway.log showed PID 17907 'running (manual process)' with the guard error repeating every ~12s on v2026.6.5. Adds a regression test asserting every gateway-run exec line in the rendered script (default + named profile, both privilege branches) carries --replace, and updates the existing render-script assertion. * fix(ci): remove stray .venv symlink committed into repo The PR's commit accidentally tracked a .venv symlink pointing at the developer's local venv (mode 120000 -> /home/ben/nous/hermes-agent/.venv). The CI test/e2e/build jobs run `uv venv` to create .venv and failed with `failed to create directory .venv: File exists (os error 17)` because the checkout already contained the symlink. All test shards aborted in <15s during setup, before any test ran. Untrack the symlink and add a bare `.venv` entry to .gitignore (the existing `.venv/` rule only matches a directory, so a symlink slipped through). --- .gitignore | 1 + hermes_cli/service_manager.py | 19 +++++++++-- tests/hermes_cli/test_service_manager.py | 41 +++++++++++++++++++++++- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 6d87318e35e..489453d79bf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.pyc* __pycache__/ .venv/ +.venv .vscode/ .env .env.local diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 8a12a82d819..f5254107b8b 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -684,10 +684,25 @@ class S6ServiceManager: # start`, etc. See `_gateway_command_inner` for the matching # guard. lines.append("export HERMES_S6_SUPERVISED_CHILD=1") + # ``--replace`` makes the supervised gateway authoritative for its + # profile's HERMES_HOME. Without it, a gateway started OUTSIDE s6 + # (a stray ``hermes gateway run`` from a shell, an agent action, or + # the Open WebUI helper) grabs the per-HERMES_HOME PID lock first; + # the supervised slot then execs a bare ``gateway run``, hits the + # "Another gateway instance is already running" guard, exits + # non-zero, and s6 restarts it — a restart loop that floods the + # log and never binds (NS-505). ``--replace`` + # instead reaps the stale holder (hardened takeover path: marker + + # SIGTERM→SIGKILL-with-confirmation + scoped-lock cleanup, see + # gateway/run.py) so s6 always wins. The HERMES_S6_SUPERVISED_CHILD + # sentinel above prevents the run→start→run redirect recursion. + # Each profile is scoped to its own HERMES_HOME and s6 guarantees a + # single supervised instance per slot, so there is no legitimate + # supervised sibling for ``--replace`` to clobber. if profile == "default": - gateway_cmd = "hermes gateway run" + gateway_cmd = "hermes gateway run --replace" else: - gateway_cmd = f"hermes -p {shlex.quote(profile)} gateway run" + gateway_cmd = f"hermes -p {shlex.quote(profile)} gateway run --replace" # Skip the drop when already non-root (setgroups() lacks CAP_SETGID → # s6 boot-loop). lines.append(f'[ "$(id -u)" = 0 ] || exec {gateway_cmd}') diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index 018a60031d3..80c7432fd1e 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -631,7 +631,46 @@ def test_render_run_script_resets_home_before_exec() -> None: run_text = S6ServiceManager._render_run_script("coder", {}) assert "export HOME=/opt/data" in run_text - assert "exec s6-setuidgid hermes hermes -p coder gateway run" in run_text + assert "exec s6-setuidgid hermes hermes -p coder gateway run --replace" in run_text + + +def test_render_run_script_uses_replace_to_take_over_stale_holder() -> None: + """NS-505: the supervised gateway must exec ``gateway run --replace``. + + Without ``--replace`` a gateway started OUTSIDE s6 (a stray shell + ``hermes gateway run``, an agent action, the Open WebUI helper) holds + the per-HERMES_HOME PID lock; the supervised slot then execs a bare + ``gateway run``, hits the "Another gateway instance is already + running" guard, exits non-zero, and s6 restarts it — a restart loop + that never binds. ``--replace`` makes the supervised gateway reap the + stale holder and win, so s6 is authoritative for the slot. + + Covers both the default (root HERMES_HOME, no ``-p``) and named-profile + render paths. + """ + default_text = S6ServiceManager._render_run_script("default", {}) + # Root profile: bare `hermes gateway run --replace` (no -p flag). + assert "hermes gateway run --replace" in default_text + assert "hermes -p default" not in default_text + # Every exec line that launches the gateway must carry --replace, so + # neither the non-root nor the privilege-drop branch can spin. + gateway_execs = [ + line for line in default_text.splitlines() + if "gateway run" in line + ] + assert gateway_execs, "no gateway run exec line rendered" + assert all("--replace" in line for line in gateway_execs), ( + f"a gateway run line is missing --replace: {gateway_execs}" + ) + + named_text = S6ServiceManager._render_run_script("coder", {}) + named_execs = [ + line for line in named_text.splitlines() if "gateway run" in line + ] + assert named_execs + assert all("--replace" in line for line in named_execs), ( + f"a named-profile gateway run line is missing --replace: {named_execs}" + ) def test_s6_register_rejects_invalid_profile_name(s6_scandir) -> None: