diff --git a/.gitignore b/.gitignore index 6d87318e35e..489453d79bf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.pyc* __pycache__/ .venv/ +.venv .vscode/ .env .env.local diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 8a12a82d819..f5254107b8b 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -684,10 +684,25 @@ class S6ServiceManager: # start`, etc. See `_gateway_command_inner` for the matching # guard. lines.append("export HERMES_S6_SUPERVISED_CHILD=1") + # ``--replace`` makes the supervised gateway authoritative for its + # profile's HERMES_HOME. Without it, a gateway started OUTSIDE s6 + # (a stray ``hermes gateway run`` from a shell, an agent action, or + # the Open WebUI helper) grabs the per-HERMES_HOME PID lock first; + # the supervised slot then execs a bare ``gateway run``, hits the + # "Another gateway instance is already running" guard, exits + # non-zero, and s6 restarts it — a restart loop that floods the + # log and never binds (NS-505). ``--replace`` + # instead reaps the stale holder (hardened takeover path: marker + + # SIGTERM→SIGKILL-with-confirmation + scoped-lock cleanup, see + # gateway/run.py) so s6 always wins. The HERMES_S6_SUPERVISED_CHILD + # sentinel above prevents the run→start→run redirect recursion. + # Each profile is scoped to its own HERMES_HOME and s6 guarantees a + # single supervised instance per slot, so there is no legitimate + # supervised sibling for ``--replace`` to clobber. if profile == "default": - gateway_cmd = "hermes gateway run" + gateway_cmd = "hermes gateway run --replace" else: - gateway_cmd = f"hermes -p {shlex.quote(profile)} gateway run" + gateway_cmd = f"hermes -p {shlex.quote(profile)} gateway run --replace" # Skip the drop when already non-root (setgroups() lacks CAP_SETGID → # s6 boot-loop). lines.append(f'[ "$(id -u)" = 0 ] || exec {gateway_cmd}') diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index 018a60031d3..80c7432fd1e 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -631,7 +631,46 @@ def test_render_run_script_resets_home_before_exec() -> None: run_text = S6ServiceManager._render_run_script("coder", {}) assert "export HOME=/opt/data" in run_text - assert "exec s6-setuidgid hermes hermes -p coder gateway run" in run_text + assert "exec s6-setuidgid hermes hermes -p coder gateway run --replace" in run_text + + +def test_render_run_script_uses_replace_to_take_over_stale_holder() -> None: + """NS-505: the supervised gateway must exec ``gateway run --replace``. + + Without ``--replace`` a gateway started OUTSIDE s6 (a stray shell + ``hermes gateway run``, an agent action, the Open WebUI helper) holds + the per-HERMES_HOME PID lock; the supervised slot then execs a bare + ``gateway run``, hits the "Another gateway instance is already + running" guard, exits non-zero, and s6 restarts it — a restart loop + that never binds. ``--replace`` makes the supervised gateway reap the + stale holder and win, so s6 is authoritative for the slot. + + Covers both the default (root HERMES_HOME, no ``-p``) and named-profile + render paths. + """ + default_text = S6ServiceManager._render_run_script("default", {}) + # Root profile: bare `hermes gateway run --replace` (no -p flag). + assert "hermes gateway run --replace" in default_text + assert "hermes -p default" not in default_text + # Every exec line that launches the gateway must carry --replace, so + # neither the non-root nor the privilege-drop branch can spin. + gateway_execs = [ + line for line in default_text.splitlines() + if "gateway run" in line + ] + assert gateway_execs, "no gateway run exec line rendered" + assert all("--replace" in line for line in gateway_execs), ( + f"a gateway run line is missing --replace: {gateway_execs}" + ) + + named_text = S6ServiceManager._render_run_script("coder", {}) + named_execs = [ + line for line in named_text.splitlines() if "gateway run" in line + ] + assert named_execs + assert all("--replace" in line for line in named_execs), ( + f"a named-profile gateway run line is missing --replace: {named_execs}" + ) def test_s6_register_rejects_invalid_profile_name(s6_scandir) -> None: