mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-18 09:51:59 +00:00
fix(docker): supervised gateway uses --replace to take over stale holder (NS-505) (#47555)
* fix(docker): supervised gateway uses --replace to take over stale holder Inside the s6 container image the per-profile gateway service rendered a bare `hermes gateway run` (no --replace). When a gateway is started OUTSIDE s6 — a stray shell `hermes gateway run`, an agent action, or the Open WebUI helper (scripts/setup_open_webui.sh) — it grabs the per-HERMES_HOME PID lock first. The supervised slot then execs the bare `gateway run`, hits the "Another gateway instance is already running" guard, exits non-zero, and s6 restarts it: a restart loop that floods the log every ~12s and never binds. The container looks up but the gateway is permanently down, and dashboard-only users (no shell) cannot recover. Render the supervised run script as `gateway run --replace` so s6 is authoritative for its slot: it reaps the stale holder via the hardened takeover path (takeover marker + SIGTERM->SIGKILL-with-confirmation + scoped-lock cleanup in gateway/run.py) and binds. This matches the systemd service path, which already builds its argv with --replace (_build_gateway_argv / 'nohup hermes gateway run --replace'), and the intent already documented in _maybe_redirect_run_to_s6_supervision. The existing HERMES_S6_SUPERVISED_CHILD sentinel still prevents the run->start->run redirect recursion. Each profile is scoped to its own HERMES_HOME and s6 guarantees one supervised instance per slot, so there is no legitimate supervised sibling for --replace to clobber. Reported via beta (NS-505): gateway.log showed PID 17907 'running (manual process)' with the guard error repeating every ~12s on v2026.6.5. Adds a regression test asserting every gateway-run exec line in the rendered script (default + named profile, both privilege branches) carries --replace, and updates the existing render-script assertion. * fix(ci): remove stray .venv symlink committed into repo The PR's commit accidentally tracked a .venv symlink pointing at the developer's local venv (mode 120000 -> /home/ben/nous/hermes-agent/.venv). The CI test/e2e/build jobs run `uv venv` to create .venv and failed with `failed to create directory .venv: File exists (os error 17)` because the checkout already contained the symlink. All test shards aborted in <15s during setup, before any test ran. Untrack the symlink and add a bare `.venv` entry to .gitignore (the existing `.venv/` rule only matches a directory, so a symlink slipped through).
This commit is contained in:
parent
c5eb64b9f7
commit
fcf6cb3d73
3 changed files with 58 additions and 3 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -5,6 +5,7 @@
|
|||
*.pyc*
|
||||
__pycache__/
|
||||
.venv/
|
||||
.venv
|
||||
.vscode/
|
||||
.env
|
||||
.env.local
|
||||
|
|
|
|||
|
|
@ -684,10 +684,25 @@ class S6ServiceManager:
|
|||
# start`, etc. See `_gateway_command_inner` for the matching
|
||||
# guard.
|
||||
lines.append("export HERMES_S6_SUPERVISED_CHILD=1")
|
||||
# ``--replace`` makes the supervised gateway authoritative for its
|
||||
# profile's HERMES_HOME. Without it, a gateway started OUTSIDE s6
|
||||
# (a stray ``hermes gateway run`` from a shell, an agent action, or
|
||||
# the Open WebUI helper) grabs the per-HERMES_HOME PID lock first;
|
||||
# the supervised slot then execs a bare ``gateway run``, hits the
|
||||
# "Another gateway instance is already running" guard, exits
|
||||
# non-zero, and s6 restarts it — a restart loop that floods the
|
||||
# log and never binds (NS-505). ``--replace``
|
||||
# instead reaps the stale holder (hardened takeover path: marker +
|
||||
# SIGTERM→SIGKILL-with-confirmation + scoped-lock cleanup, see
|
||||
# gateway/run.py) so s6 always wins. The HERMES_S6_SUPERVISED_CHILD
|
||||
# sentinel above prevents the run→start→run redirect recursion.
|
||||
# Each profile is scoped to its own HERMES_HOME and s6 guarantees a
|
||||
# single supervised instance per slot, so there is no legitimate
|
||||
# supervised sibling for ``--replace`` to clobber.
|
||||
if profile == "default":
|
||||
gateway_cmd = "hermes gateway run"
|
||||
gateway_cmd = "hermes gateway run --replace"
|
||||
else:
|
||||
gateway_cmd = f"hermes -p {shlex.quote(profile)} gateway run"
|
||||
gateway_cmd = f"hermes -p {shlex.quote(profile)} gateway run --replace"
|
||||
# Skip the drop when already non-root (setgroups() lacks CAP_SETGID →
|
||||
# s6 boot-loop).
|
||||
lines.append(f'[ "$(id -u)" = 0 ] || exec {gateway_cmd}')
|
||||
|
|
|
|||
|
|
@ -631,7 +631,46 @@ def test_render_run_script_resets_home_before_exec() -> None:
|
|||
run_text = S6ServiceManager._render_run_script("coder", {})
|
||||
|
||||
assert "export HOME=/opt/data" in run_text
|
||||
assert "exec s6-setuidgid hermes hermes -p coder gateway run" in run_text
|
||||
assert "exec s6-setuidgid hermes hermes -p coder gateway run --replace" in run_text
|
||||
|
||||
|
||||
def test_render_run_script_uses_replace_to_take_over_stale_holder() -> None:
|
||||
"""NS-505: the supervised gateway must exec ``gateway run --replace``.
|
||||
|
||||
Without ``--replace`` a gateway started OUTSIDE s6 (a stray shell
|
||||
``hermes gateway run``, an agent action, the Open WebUI helper) holds
|
||||
the per-HERMES_HOME PID lock; the supervised slot then execs a bare
|
||||
``gateway run``, hits the "Another gateway instance is already
|
||||
running" guard, exits non-zero, and s6 restarts it — a restart loop
|
||||
that never binds. ``--replace`` makes the supervised gateway reap the
|
||||
stale holder and win, so s6 is authoritative for the slot.
|
||||
|
||||
Covers both the default (root HERMES_HOME, no ``-p``) and named-profile
|
||||
render paths.
|
||||
"""
|
||||
default_text = S6ServiceManager._render_run_script("default", {})
|
||||
# Root profile: bare `hermes gateway run --replace` (no -p flag).
|
||||
assert "hermes gateway run --replace" in default_text
|
||||
assert "hermes -p default" not in default_text
|
||||
# Every exec line that launches the gateway must carry --replace, so
|
||||
# neither the non-root nor the privilege-drop branch can spin.
|
||||
gateway_execs = [
|
||||
line for line in default_text.splitlines()
|
||||
if "gateway run" in line
|
||||
]
|
||||
assert gateway_execs, "no gateway run exec line rendered"
|
||||
assert all("--replace" in line for line in gateway_execs), (
|
||||
f"a gateway run line is missing --replace: {gateway_execs}"
|
||||
)
|
||||
|
||||
named_text = S6ServiceManager._render_run_script("coder", {})
|
||||
named_execs = [
|
||||
line for line in named_text.splitlines() if "gateway run" in line
|
||||
]
|
||||
assert named_execs
|
||||
assert all("--replace" in line for line in named_execs), (
|
||||
f"a named-profile gateway run line is missing --replace: {named_execs}"
|
||||
)
|
||||
|
||||
|
||||
def test_s6_register_rejects_invalid_profile_name(s6_scandir) -> None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue