diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 86731957480..8a9a5e802d8 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -5150,11 +5150,83 @@ def gateway_command(args): sys.exit(1) +def _maybe_redirect_run_to_s6_supervision(args) -> bool: + """Inside an s6 container, redirect bare ``gateway run`` to the + supervised path. + + Background. Before the s6 image landed, ``docker run gateway + run`` was the standard way to start a containerized gateway: the + gateway was the container's main process, tini reaped zombies, and + container exit code == gateway exit code. With s6-overlay as PID 1, + we'd much rather have the gateway run as a supervised s6 longrun + (auto-restart on crash, dashboard supervised alongside, multiple + profile gateways under the same /init). This redirect upgrades the + old invocation transparently — the user gets the new behavior + without changing their docker run command. + + Three gates make this a no-op outside the intended scope: + + 1. ``_dispatch_via_service_manager_if_s6`` returns False unless + we're in a container with s6 as PID 1. Host runs of + ``hermes gateway run`` are unaffected. + 2. ``HERMES_S6_SUPERVISED_CHILD`` is exported by + ``S6ServiceManager._render_run_script`` for the supervised + process itself — i.e. when s6-supervise execs ``hermes gateway + run --replace`` as a longrun, this guard short-circuits the + redirect so the supervised gateway actually runs in + foreground (otherwise we'd recurse: run → start → run → start + → ...). + 3. ``--no-supervise`` (or ``HERMES_GATEWAY_NO_SUPERVISE=1``) opts + out for users who genuinely want pre-s6 semantics — CI smoke + tests, debugging the foreground startup path, etc. + + Returns True iff dispatched (caller should ``return``). + """ + no_supervise = getattr(args, "no_supervise", False) or \ + os.environ.get("HERMES_GATEWAY_NO_SUPERVISE", "").lower() in ("1", "true", "yes") + if no_supervise: + return False + if os.environ.get("HERMES_S6_SUPERVISED_CHILD"): + # We ARE the supervised child s6-supervise is running. Fall + # through to the foreground code path so the gateway actually + # starts. + return False + if not _dispatch_via_service_manager_if_s6("start"): + return False + # Loud breadcrumb: explain the upgrade and how to opt out. Print to + # stderr so it doesn't pollute stdout-parsing scripts. The + # supervised gateway's own logs are routed by s6-log to both + # `docker logs` and ${HERMES_HOME}/logs/gateways//current, + # so the user sees a clear sequence: this banner first, then the + # gateway's own stdout/stderr from the supervisor. + print( + "→ gateway is now running under s6 supervision (auto-restart on crash,\n" + " dashboard supervised alongside if HERMES_DASHBOARD is set).\n" + " This is the recommended setup for the s6 container image — the\n" + " gateway will keep running even if it crashes.\n" + " Use `--no-supervise` (or HERMES_GATEWAY_NO_SUPERVISE=1) to opt out\n" + " and get the pre-s6 foreground behavior instead.", + file=sys.stderr, + flush=True, + ) + # Block until the container is signalled. The supervised gateway's + # lifetime is independent of this process — s6-supervise restarts + # it on crash, and we don't want the container to exit when the + # gateway flaps. `sleep infinity` matches the static main-hermes + # service's pattern (see docker/s6-rc.d/main-hermes/run): the CMD + # process is a no-op heartbeat that keeps /init alive until + # `docker stop` sends SIGTERM, at which point /init runs stage 3 + # shutdown (which tears down the supervised gateway cleanly). + os.execvp("sleep", ["sleep", "infinity"]) + + def _gateway_command_inner(args): subcmd = getattr(args, 'gateway_command', None) # Default to run if no subcommand if subcmd is None or subcmd == "run": + if _maybe_redirect_run_to_s6_supervision(args): + return # unreachable; execvp doesn't return verbose = getattr(args, 'verbose', 0) quiet = getattr(args, 'quiet', False) replace = getattr(args, 'replace', False) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index e6cb23b7a83..6f205a1039f 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -11326,6 +11326,19 @@ def main(): action="store_true", help="Replace any existing gateway instance (useful for systemd)", ) + gateway_run.add_argument( + "--no-supervise", + action="store_true", + help=( + "Inside the s6-overlay Docker image, normally `gateway run` is " + "automatically redirected to the supervised s6 service (so the " + "gateway gets auto-restart on crash, plus a supervised dashboard " + "if HERMES_DASHBOARD is set). Pass --no-supervise to opt out and " + "get the historical pre-s6 foreground behavior: the gateway is " + "the container's main process and the container exits with the " + "gateway's exit code. No effect outside an s6 container." + ), + ) _add_accept_hooks_flag(gateway_run) _add_accept_hooks_flag(gateway_parser) diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 417ec4ec982..b894ca764a8 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -602,6 +602,14 @@ class S6ServiceManager: ] for k, v in sorted(extra_env.items()): lines.append(f"export {k}={shlex.quote(v)}") + # Sentinel for the supervised-child path. Prevents recursive + # redirect when the supervised gateway re-enters + # `_gateway_command_inner` with subcmd == "run" — without it the + # supervisor would dispatch `gateway start` which would re-exec + # `gateway run --replace` which would re-dispatch `gateway + # start`, etc. See `_gateway_command_inner` for the matching + # guard. + lines.append("export HERMES_S6_SUPERVISED_CHILD=1") if profile == "default": lines.append("exec s6-setuidgid hermes hermes gateway run") else: diff --git a/tests/docker/test_gateway_run_supervised.py b/tests/docker/test_gateway_run_supervised.py new file mode 100644 index 00000000000..aec2257b0e2 --- /dev/null +++ b/tests/docker/test_gateway_run_supervised.py @@ -0,0 +1,329 @@ +"""Harness: `docker run gateway run` redirects to supervised mode. + +Before the s6 migration, ``docker run nousresearch/hermes-agent gateway +run`` was the standard pattern — the gateway ran as the container's +main process, container exit code matched gateway exit code, no +supervision. With s6 as PID 1, the same invocation now auto-redirects +to the supervised path (`gateway start`) so users get auto-restart on +crash and a supervised dashboard alongside (when ``HERMES_DASHBOARD=1``). + +These tests verify the three load-bearing properties of that redirect: + + 1. The default invocation **does** redirect (container stays up via + ``sleep infinity`` while s6 supervises ``gateway-default``). + 2. ``--no-supervise`` / ``HERMES_GATEWAY_NO_SUPERVISE=1`` opts out. + 3. The supervised process itself does NOT recurse — the + ``HERMES_S6_SUPERVISED_CHILD`` sentinel breaks the loop. + +Every ``docker exec`` runs as ``hermes`` per the conftest module +docstring; see ``tests/docker/conftest.py`` for rationale. +""" +from __future__ import annotations + +import subprocess +import time + +from tests.docker.conftest import docker_exec_sh + + +def _sh(container: str, command: str, timeout: int = 30): + return docker_exec_sh(container, command, timeout=timeout) + + +def _svstat(container: str, slot: str = "gateway-default") -> str: + r = _sh(container, f"/command/s6-svstat /run/service/{slot}") + return r.stdout if r.returncode == 0 else "" + + +def _svstat_wants_up(container: str, slot: str = "gateway-default") -> bool: + """See test_profile_gateway._svstat_wants_up for the format rules.""" + state = _svstat(container, slot) + if not state: + return False + head = state.split()[0] if state.split() else "" + if head == "up": + return "want down" not in state + return "want up" in state + + +def test_gateway_run_redirects_to_supervised( + built_image: str, container_name: str, +) -> None: + """``docker run gateway run`` (the historical invocation) + should now register and start the ``gateway-default`` s6 slot. + + The CMD process itself shouldn't be the gateway — it should be + blocked on ``sleep infinity``, leaving s6 to supervise the actual + gateway process. We verify by: + + * Confirming the CMD process is sleeping (not python/gateway). + * Confirming ``s6-svstat gateway-default`` reports want-up. + """ + # Start the container detached using the historical gateway-run + # pattern. The redirect should fire and the container should NOT + # exit immediately (which is what would happen pre-this-PR on the + # s6 image — the foreground gateway would crash without config, + # the CMD would exit, /init would shut down). + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "gateway", "run"], + check=True, capture_output=True, timeout=30, + ) + + # Give /init time to run cont-init.d, the wrapper time to dispatch + # the redirect, and s6-supervise time to spin up the slot. + time.sleep(5) + + # Container should still be running. If the redirect didn't fire, + # the foreground gateway would have crashed and the container + # would be in `Exited` state by now. + r = subprocess.run( + ["docker", "inspect", "-f", "{{.State.Status}}", container_name], + capture_output=True, text=True, timeout=10, + ) + assert r.returncode == 0 and r.stdout.strip() == "running", ( + f"container exited prematurely: {r.stdout!r}; " + f"docker logs:\n{subprocess.run(['docker', 'logs', container_name], capture_output=True, text=True).stdout}" + ) + + # s6's intent for the default-profile gateway slot should be up. + # Same accept-either rule as test_profile_gateway: the supervised + # gateway may or may not be currently up depending on whether the + # harness profile has a configured model, but the want-intent + # contract holds either way. + assert _svstat_wants_up(container_name), ( + f"gateway-default slot want-state not up: {_svstat(container_name)!r}" + ) + + # The CMD process (PID under /init that the wrapper exec'd into) + # should be sleeping, not the gateway. We grep `ps` for the + # `sleep infinity` heartbeat. + r = _sh(container_name, "ps -eo pid,cmd | grep -v grep | grep 'sleep infinity'") + assert r.returncode == 0 and "sleep infinity" in r.stdout, ( + f"expected `sleep infinity` heartbeat process; got ps:\n{r.stdout}\n" + f"stderr: {r.stderr}" + ) + + # And the loud breadcrumb should be in `docker logs` so users see + # the upgrade explanation. + r = subprocess.run( + ["docker", "logs", container_name], + capture_output=True, text=True, timeout=10, + ) + logs = r.stdout + r.stderr + assert "s6 supervision" in logs, ( + f"expected loud breadcrumb in docker logs; got:\n{logs}" + ) + assert "--no-supervise" in logs, ( + f"breadcrumb missing opt-out hint; got:\n{logs}" + ) + + +def test_gateway_run_no_supervise_flag_preserves_legacy_behavior( + built_image: str, container_name: str, +) -> None: + """``docker run gateway run --no-supervise`` opts out of + the redirect and runs the gateway as the foreground CMD process + (pre-s6 semantics). + + With the redirect in place, the container's CMD process would be + ``sleep infinity`` and the supervised gateway would be a separate + process under ``s6-supervise gateway-default``. WITHOUT the + redirect (opt-out path), there's no supervised gateway slot at + all — the gateway IS the CMD process. + + Three positive assertions confirm we took the pre-s6 path: + + * The CMD process is a python ``hermes gateway run`` invocation + (not ``sleep infinity``). + * The ``gateway-default`` s6 service slot is NOT created. + * No supervision-redirect breadcrumb appears in docker logs. + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "gateway", "run", "--no-supervise"], + check=True, capture_output=True, timeout=30, + ) + # Give startup time. The unconfigured-profile case used to fail + # fast; with a config bind-mounted profile (and a real volume on + # most realistic deployments) the gateway just runs. + time.sleep(6) + + # Container should still be running OR have exited cleanly with + # the gateway's status code. Either is correct for pre-s6 + # semantics — what's NOT correct is the supervised behavior + # (sleep infinity heartbeat + supervised gateway slot). + inspect = subprocess.run( + ["docker", "inspect", "-f", "{{.State.Status}}", container_name], + capture_output=True, text=True, timeout=10, + ) + status = inspect.stdout.strip() + + # No redirect breadcrumb anywhere. + logs = subprocess.run( + ["docker", "logs", container_name], + capture_output=True, text=True, timeout=10, + ).stdout + subprocess.run( + ["docker", "logs", container_name], + capture_output=True, text=True, timeout=10, + ).stderr + assert "s6 supervision" not in logs, ( + f"--no-supervise should have skipped the redirect; " + f"breadcrumb in logs:\n{logs}" + ) + + if status == "running": + # Gateway running in foreground — the CMD process should be + # the gateway itself, NOT a sleep-infinity heartbeat. + r = _sh( + container_name, + "ps -eo pid,ppid,cmd | grep -v grep | awk '/main-wrapper.sh|rc.init top/ { wrapper_pid=$1 } " + "$3==\"sleep\" && $4==\"infinity\" && $2==wrapper_pid { c++ } END { print c+0 }'", + ) + assert r.returncode == 0 + redirected_sleeps = int(r.stdout.strip() or 0) + assert redirected_sleeps == 0, ( + f"--no-supervise: expected NO `sleep infinity` parented to " + f"the CMD wrapper (foreground gateway should be the CMD), " + f"found {redirected_sleeps}. " + f"ps:\n{_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}" + ) + + # The gateway-default s6 slot exists (the cont-init.d + # reconciler creates it on every boot regardless of opt-out) + # but should NOT have its want-state set to "up" — the + # opt-out path doesn't dispatch `start` to s6. + assert not _svstat_wants_up(container_name, "gateway-default"), ( + "--no-supervise: gateway-default slot has want-state up, " + "implying the redirect dispatched `start` despite the " + f"opt-out. svstat:\n{_svstat(container_name)!r}" + ) + # If status == "exited" instead, the gateway exited (also valid + # pre-s6 semantics). The breadcrumb-absence check above is + # already enough to confirm the redirect didn't fire. + + +def test_gateway_run_no_supervise_env_var( + built_image: str, container_name: str, +) -> None: + """Env-var opt-out works identically to the CLI flag. + + Useful when users can't easily change their `docker run` args + (orchestration templates, K8s manifests) but can set env vars. + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_GATEWAY_NO_SUPERVISE=1", + built_image, "gateway", "run"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(6) + + logs = subprocess.run( + ["docker", "logs", container_name], + capture_output=True, text=True, timeout=10, + ) + combined = logs.stdout + logs.stderr + assert "s6 supervision" not in combined, ( + f"env-var opt-out should have skipped the redirect; " + f"breadcrumb in logs:\n{combined}" + ) + + # Same as the CLI-flag test: the slot exists (reconciler creates + # it) but should not have want-state up. + inspect = subprocess.run( + ["docker", "inspect", "-f", "{{.State.Status}}", container_name], + capture_output=True, text=True, timeout=10, + ) + if inspect.stdout.strip() == "running": + assert not _svstat_wants_up(container_name, "gateway-default"), ( + "HERMES_GATEWAY_NO_SUPERVISE=1: gateway-default has " + "want-state up, implying the redirect dispatched `start` " + f"despite the env-var opt-out. svstat:\n{_svstat(container_name)!r}" + ) + + +def test_supervised_gateway_does_not_recurse( + built_image: str, container_name: str, +) -> None: + """The HERMES_S6_SUPERVISED_CHILD sentinel must prevent the + supervised ``hermes gateway run`` from re-entering the redirect. + + If recursion happened, every supervised gateway start would itself + re-dispatch to s6 and exec ``sleep infinity`` — so the supervised + gateway slot would never actually run a python ``hermes gateway + run`` process. The slot would oscillate or settle into a state + with no python in the supervise tree at all. + + We verify by counting python processes whose argv contains + ``gateway run``: there should be at most one (the legitimately + supervised gateway). Two or more would imply recursive spawning + via the redirect → start → run → redirect → ... loop. + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, built_image, + "gateway", "run"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(6) + + # Count python processes running `hermes gateway run`. If the + # recursion guard fails, s6 would respawn fresh `gateway run` + # processes on every cycle, leaving multiple Python-process + # descendants under the gateway-default supervise tree. + r = _sh(container_name, "ps -eo pid,cmd | grep -v grep | grep -E 'python.*hermes.*gateway run' | wc -l") + assert r.returncode == 0 + n = int(r.stdout.strip() or 0) + assert n <= 1, ( + f"expected at most one supervised python `hermes gateway run` " + f"process (the legitimately-supervised gateway); found {n}. " + f"Recursion guard may have failed. " + f"ps:\n{_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}" + ) + + # Stronger positive assertion: there should be exactly one + # `sleep infinity` process whose parent is the main-wrapper.sh + # CMD process (PID 17 typically). The static `main-hermes` + # service has its own `sleep infinity` child; THAT one is fine + # and unrelated to our redirect. + r = _sh( + container_name, + # Find PID of the CMD process (main-wrapper.sh or its sh + # parent), then count `sleep infinity` children. + "ps -eo pid,ppid,cmd | grep -v grep | awk '/main-wrapper.sh|rc.init top/ { wrapper_pid=$1 } " + "$3==\"sleep\" && $4==\"infinity\" && $2==wrapper_pid { c++ } END { print c+0 }'", + ) + assert r.returncode == 0 + redirected = int(r.stdout.strip() or 0) + assert redirected == 1, ( + f"expected exactly one `sleep infinity` parented to the CMD " + f"wrapper (the redirect heartbeat); found {redirected}. " + f"ps:\n{_sh(container_name, 'ps -eo pid,ppid,cmd').stdout}" + ) + + +def test_dashboard_supervised_when_env_set( + built_image: str, container_name: str, +) -> None: + """When ``HERMES_DASHBOARD=1`` is set, ``docker run gateway + run`` should result in BOTH the gateway and the dashboard being + supervised by s6 — the dashboard slot was always there but only + activates with the env var. This is the headline benefit of the + redirect: one container = supervised gateway + supervised + dashboard, with zero extra user effort. + """ + subprocess.run( + ["docker", "run", "-d", "--name", container_name, + "-e", "HERMES_DASHBOARD=1", + built_image, "gateway", "run"], + check=True, capture_output=True, timeout=30, + ) + time.sleep(5) + + # Both slots should report want-up. + assert _svstat_wants_up(container_name, "gateway-default"), ( + f"gateway-default slot not up: {_svstat(container_name)!r}" + ) + assert _svstat_wants_up(container_name, "dashboard"), ( + f"dashboard slot not up: {_svstat(container_name, 'dashboard')!r}" + ) diff --git a/tests/hermes_cli/test_gateway_s6_dispatch.py b/tests/hermes_cli/test_gateway_s6_dispatch.py index ba83c1a1187..d7146b2a397 100644 --- a/tests/hermes_cli/test_gateway_s6_dispatch.py +++ b/tests/hermes_cli/test_gateway_s6_dispatch.py @@ -333,3 +333,194 @@ def test_dispatch_renders_s6_command_error_friendly( assert "rc=111" in out assert "Permission denied" in out assert "Traceback" not in out + + +# ============================================================================= +# `_maybe_redirect_run_to_s6_supervision`: the "upgrade old `gateway run` +# invocation to supervised semantics inside an s6 container" helper. +# ============================================================================= + + +class _Args: + """Lightweight argparse-like namespace for the helper.""" + + def __init__(self, no_supervise: bool = False) -> None: + self.no_supervise = no_supervise + + +def _stub_s6(monkeypatch: pytest.MonkeyPatch, *, on_s6: bool) -> _CallRecorder: + """Wire up service-manager stubs so the underlying dispatcher will + fire (on_s6=True) or return False (on_s6=False).""" + rec = _CallRecorder() + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", + lambda: "s6" if on_s6 else "systemd", + ) + monkeypatch.setattr( + "hermes_cli.service_manager.get_service_manager", lambda: rec, + ) + return rec + + +class _ExecvpCalled(BaseException): + """Sentinel raised by the os.execvp stub so tests can assert on it + without actually replacing the test runner process. Inherits from + BaseException so it bypasses generic ``except Exception`` blocks in + the code under test (just like a real exec would).""" + + def __init__(self, argv: list[str]) -> None: + self.argv = argv + + +def _stub_execvp(monkeypatch: pytest.MonkeyPatch) -> list[list[str]]: + """Replace os.execvp with a recorder that raises _ExecvpCalled.""" + calls: list[list[str]] = [] + + def fake_execvp(file: str, args: list[str]) -> None: # noqa: ANN401 + calls.append([file, *args]) + raise _ExecvpCalled([file, *args]) + + monkeypatch.setattr("hermes_cli.gateway.os.execvp", fake_execvp) + return calls + + +def test_redirect_noop_on_host(monkeypatch: pytest.MonkeyPatch) -> None: + """Host runs (non-s6) must not redirect. Returns False; caller + continues to the foreground gateway code path unchanged.""" + from hermes_cli import gateway as gw + + _stub_s6(monkeypatch, on_s6=False) + # If execvp got called we'd raise — keep it bound so test fails loudly. + monkeypatch.setattr( + "hermes_cli.gateway.os.execvp", + lambda *a, **kw: pytest.fail("execvp should not be called on host"), + ) + monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False) + monkeypatch.delenv("HERMES_GATEWAY_NO_SUPERVISE", raising=False) + + assert gw._maybe_redirect_run_to_s6_supervision(_Args()) is False + + +def test_redirect_fires_inside_s6_container( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str], +) -> None: + """Inside an s6 container, `gateway run` should: + + 1. Dispatch `start` to the service manager. + 2. Print the loud breadcrumb to stderr. + 3. exec `sleep infinity` to keep the CMD alive without binding + container lifetime to gateway PID lifetime. + """ + from hermes_cli import gateway as gw + + rec = _stub_s6(monkeypatch, on_s6=True) + monkeypatch.setattr("hermes_cli.gateway._profile_suffix", lambda: "") + execvp_calls = _stub_execvp(monkeypatch) + monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False) + monkeypatch.delenv("HERMES_GATEWAY_NO_SUPERVISE", raising=False) + + with pytest.raises(_ExecvpCalled) as excinfo: + gw._maybe_redirect_run_to_s6_supervision(_Args()) + + # 1. Dispatcher fired. + assert rec.calls == [("start", "gateway-default")] + # 2. Breadcrumb went to stderr and mentions the opt-out path. + err = capsys.readouterr().err + assert "s6 supervision" in err + assert "--no-supervise" in err + assert "HERMES_GATEWAY_NO_SUPERVISE" in err + # 3. exec'd `sleep infinity`. + assert execvp_calls == [["sleep", "sleep", "infinity"]] + assert excinfo.value.argv == ["sleep", "sleep", "infinity"] + + +def test_redirect_short_circuits_supervised_child( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """The recursion guard: when the supervised gateway s6-supervise is + running execs `hermes gateway run --replace`, the + HERMES_S6_SUPERVISED_CHILD sentinel must short-circuit the redirect + so the gateway actually starts foreground. Without this guard the + supervised process would re-dispatch `start` → re-exec `run` → ... + in an infinite loop. + """ + from hermes_cli import gateway as gw + + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", + lambda: pytest.fail("dispatcher should not run when sentinel is set"), + ) + monkeypatch.setattr( + "hermes_cli.gateway.os.execvp", + lambda *a, **kw: pytest.fail("execvp should not run when sentinel is set"), + ) + monkeypatch.setenv("HERMES_S6_SUPERVISED_CHILD", "1") + monkeypatch.delenv("HERMES_GATEWAY_NO_SUPERVISE", raising=False) + + assert gw._maybe_redirect_run_to_s6_supervision(_Args()) is False + + +def test_redirect_respects_no_supervise_flag( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """`--no-supervise` (CLI flag) must skip the redirect even inside + an s6 container, restoring pre-s6 foreground semantics.""" + from hermes_cli import gateway as gw + + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", + lambda: pytest.fail("dispatcher should not run when --no-supervise is set"), + ) + monkeypatch.setattr( + "hermes_cli.gateway.os.execvp", + lambda *a, **kw: pytest.fail("execvp should not run when --no-supervise is set"), + ) + monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False) + monkeypatch.delenv("HERMES_GATEWAY_NO_SUPERVISE", raising=False) + + assert gw._maybe_redirect_run_to_s6_supervision(_Args(no_supervise=True)) is False + + +@pytest.mark.parametrize("value", ["1", "true", "TRUE", "yes", "Yes"]) +def test_redirect_respects_no_supervise_env( + monkeypatch: pytest.MonkeyPatch, value: str, +) -> None: + """`HERMES_GATEWAY_NO_SUPERVISE=1` (env var) must skip the redirect. + + Truthiness mirrors the dashboard service's own env var parsing — + 1/true/yes are all accepted, case-insensitively. + """ + from hermes_cli import gateway as gw + + monkeypatch.setattr( + "hermes_cli.service_manager.detect_service_manager", + lambda: pytest.fail("dispatcher should not run when env opt-out is set"), + ) + monkeypatch.setattr( + "hermes_cli.gateway.os.execvp", + lambda *a, **kw: pytest.fail("execvp should not run when env opt-out is set"), + ) + monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False) + monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", value) + + assert gw._maybe_redirect_run_to_s6_supervision(_Args()) is False + + +def test_redirect_no_supervise_env_falsy_values_dont_opt_out( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Falsy / unrecognized values of HERMES_GATEWAY_NO_SUPERVISE must + NOT opt out. We're strict about what counts as "yes" so a typo + like `HERMES_GATEWAY_NO_SUPERVISE=0` doesn't silently enable the + historical foreground behavior.""" + from hermes_cli import gateway as gw + + _stub_s6(monkeypatch, on_s6=True) + monkeypatch.setattr("hermes_cli.gateway._profile_suffix", lambda: "") + _stub_execvp(monkeypatch) + monkeypatch.delenv("HERMES_S6_SUPERVISED_CHILD", raising=False) + + for falsy in ("", "0", "false", "no", "off", "garbage"): + monkeypatch.setenv("HERMES_GATEWAY_NO_SUPERVISE", falsy) + with pytest.raises(_ExecvpCalled): + gw._maybe_redirect_run_to_s6_supervision(_Args()) diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index cd5761bb049..c627a9044fa 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -538,6 +538,11 @@ def test_s6_register_creates_service_dir_and_triggers_scan( run_text = run_path.read_text() assert "hermes -p coder gateway run" in run_text assert "s6-setuidgid hermes" in run_text + # Sentinel marking this as the supervised-child invocation. Without + # it, the supervised `gateway run` would re-enter the s6 redirect + # in `_gateway_command_inner` and recurse. See the matching guard + # in hermes_cli/gateway.py::_gateway_command_inner. + assert "export HERMES_S6_SUPERVISED_CHILD=1" in run_text log_run = svc_dir / "log" / "run" assert log_run.is_file() diff --git a/website/docs/user-guide/docker.md b/website/docs/user-guide/docker.md index e92ab685dfe..fefde13a5b6 100644 --- a/website/docs/user-guide/docker.md +++ b/website/docs/user-guide/docker.md @@ -41,6 +41,14 @@ docker run -d \ Port 8642 exposes the gateway's [OpenAI-compatible API server](./features/api-server.md) and health endpoint. It's optional if you only use chat platforms (Telegram, Discord, etc.), but required if you want the dashboard or external tools to reach the gateway. +:::tip Gateway runs supervised +Inside the official Docker image, `gateway run` is **automatically supervised by s6-overlay**: if the gateway process crashes it's restarted within a couple of seconds without losing the container, and the dashboard (when `HERMES_DASHBOARD=1` is set) is supervised alongside it. The `gateway run` CMD process itself is a `sleep infinity` heartbeat that keeps the container alive while s6 manages the actual gateway process — so `docker stop` still shuts everything down cleanly, but `docker logs` shows the supervised gateway's output. + +You'll see a one-line breadcrumb in `docker logs` confirming the upgrade. To opt out — and get the historical "gateway is the container's main process, container exit = gateway exit" semantics — pass `--no-supervise` or set `HERMES_GATEWAY_NO_SUPERVISE=1`. The opt-out is useful for CI smoke tests that want the container to exit with the gateway's status code; for production deployments the supervised default is strictly better. + +This behavior applies to the s6-based image only. Earlier (tini-based) images still run `gateway run` as the foreground main process. +::: + Note: the API server is gated on `API_SERVER_ENABLED=true`. To expose it beyond `127.0.0.1` inside the container, also set `API_SERVER_HOST=0.0.0.0` and an `API_SERVER_KEY` (minimum 8 characters — generate one with `openssl rand -hex 32`). Example: ```sh