diff --git a/gateway/run.py b/gateway/run.py index 57f86d7ab31..5d04c450aa7 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1953,6 +1953,16 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._exit_code: Optional[int] = None self._draining = False self._restart_requested = False + # Set by shutdown_signal_handler when a SIGTERM/SIGINT arrived + # WITHOUT a planned-stop / takeover marker — i.e. an unexpected + # external signal (container/s6 SIGTERM on `docker restart` or + # image upgrade, OOM-killer, bare `kill`). Distinct from an + # operator-requested stop, which writes a marker first. Used by + # _stop_impl to decide whether to persist gateway_state=stopped + # (see issue #42675): an unexpected signal must NOT persist + # "stopped", or container_boot refuses to auto-start the gateway + # on the next boot. + self._signal_initiated_shutdown = False self._restart_task_started = False self._restart_detached = False self._restart_via_service = False @@ -5952,7 +5962,36 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._exit_reason = self._exit_reason or "Gateway restart requested" self._draining = False - self._update_runtime_status("stopped", self._exit_reason) + # Persist the terminal gateway_state. The default is "stopped", + # but when this teardown was triggered by an UNEXPECTED external + # signal (container/s6 SIGTERM on `docker restart` or image + # upgrade, OOM-killer, bare `kill`) we instead persist "running" + # to preserve the operator's run-intent across the restart. + # + # On Docker (s6-overlay), container_boot.py reads gateway_state + # on the next boot and only auto-starts gateways whose last + # state was "running" (_AUTOSTART_STATES). Persisting "stopped" + # — or leaving the mid-shutdown "draining" marker in place — for + # a routine `docker compose up --force-recreate` permanently + # suppresses auto-start, so the messaging channels silently stay + # dark until the operator manually restarts (issue #42675). + # + # An operator-initiated stop (`hermes gateway stop`, + # systemd/launchd ExecStop, the s6 stop path, Ctrl+C) writes a + # planned-stop marker BEFORE signalling, so it is classified as + # a planned stop (not signal-initiated) and correctly persists + # "stopped" — respecting the explicit intent. A restart also + # persists "stopped" here; the restarting process brings the + # gateway back up itself. + if getattr(self, "_signal_initiated_shutdown", False) and not self._restart_requested: + logger.info( + "Gateway stopped by an unexpected signal — persisting " + "gateway_state=running so container_boot auto-starts on " + "the next boot (issue #42675)" + ) + self._update_runtime_status("running", self._exit_reason) + else: + self._update_runtime_status("stopped", self._exit_reason) logger.info("Gateway stopped (total teardown %.2fs)", _phase_elapsed()) self._stop_task = asyncio.create_task(_stop_impl()) @@ -15711,6 +15750,13 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = ) else: _signal_initiated_shutdown = True + # Mirror onto the runner so _stop_impl can suppress the + # gateway_state=stopped persist for unexpected signals + # (container/s6 SIGTERM on restart, OOM, bare kill) — see + # issue #42675. Operator-initiated stops set a planned-stop + # marker first, land in the `planned_stop` branch above, and + # leave this flag False so they DO persist "stopped". + runner._signal_initiated_shutdown = True logger.info( "Received %s — initiating shutdown", _shutdown_ctx["signal"] if _shutdown_ctx else "SIGTERM/SIGINT", diff --git a/hermes_cli/service_manager.py b/hermes_cli/service_manager.py index 731c37cd5af..254c34fc17f 100644 --- a/hermes_cli/service_manager.py +++ b/hermes_cli/service_manager.py @@ -739,13 +739,55 @@ class S6ServiceManager: """ self._run_svc("-u", "start", name) + def _supervised_pid(self, name: str) -> int | None: + """Return the PID of the supervised gateway process, or None. + + Parses ``s6-svstat`` output (``up (pid NNNN) ...``). Used to + mark an operator-initiated stop with the planned-stop marker so + the gateway's shutdown handler classifies the incoming SIGTERM + as intentional rather than an unexpected kill (issue #42675). + Best-effort: any parse/exec failure returns None. + """ + import subprocess + + try: + result = subprocess.run( + [f"{_S6_BIN_DIR}/s6-svstat", str(self.scandir / name)], + capture_output=True, text=True, timeout=5, + ) + except (OSError, subprocess.SubprocessError): + return None + if result.returncode != 0: + return None + m = re.search(r"\(pid (\d+)\)", result.stdout) + return int(m.group(1)) if m else None + def stop(self, name: str) -> None: """Bring down a registered service (``s6-svc -d``). + Writes a planned-stop marker naming the supervised gateway PID + BEFORE sending the down command, so the gateway's shutdown + handler recognises this SIGTERM as an operator-initiated stop + and persists ``gateway_state=stopped`` (respecting the explicit + intent). Without the marker, an intentional ``hermes gateway + stop`` is indistinguishable from the container/s6 SIGTERM sent on + ``docker restart``; the latter must NOT persist ``stopped`` or + container_boot refuses to auto-start on the next boot (#42675). + The marker write is best-effort — a failure only means the stop + is treated as signal-initiated, which is the safe fallback. + Raises: GatewayNotRegisteredError: no service directory for ``name``. S6CommandError: s6-svc exited non-zero for any other reason. """ + pid = self._supervised_pid(name) + if pid is not None: + try: + from gateway.status import write_planned_stop_marker + + write_planned_stop_marker(pid) + except Exception: + pass self._run_svc("-d", "stop", name) def restart(self, name: str) -> None: diff --git a/tests/docker/test_container_restart.py b/tests/docker/test_container_restart.py index c8615898375..2ad00ef294a 100644 --- a/tests/docker/test_container_restart.py +++ b/tests/docker/test_container_restart.py @@ -250,3 +250,79 @@ def test_stale_gateway_pid_cleaned_up_on_restart(restart_container: str) -> None assert r.returncode != 0, "stale gateway.pid survived restart" r = _sh(container, "test -f /opt/data/profiles/ghost/processes.json") assert r.returncode != 0, "stale processes.json survived restart" + + +def test_live_gateway_autostarts_after_real_restart_without_manual_state_stamp( + restart_container: str, +) -> None: + """End-to-end guard for issue #42675. + + The other tests in this module stamp gateway_state.json directly to + exercise the reconciler's READ side. This one exercises the WRITE + side: a real, live gateway is killed by the container/s6 SIGTERM that + `docker restart` sends — no manual state stamp — and must come back up + on the next boot. + + Before the fix, the shutdown handler unconditionally persisted + gateway_state=stopped on that SIGTERM, so the reconciler saw 'stopped' + and registered the slot DOWN — the gateway silently stayed dark after + every container restart. The fix classifies an unmarked SIGTERM as + signal-initiated and persists 'running' instead, so auto-start works. + """ + container = restart_container + + _exec(container, "hermes", "profile", "create", "live").check_returncode() + r = _exec(container, "hermes", "-p", "live", "gateway", "start", timeout=60) + assert r.returncode == 0, f"gateway start failed: {r.stderr}" + + # Wait for the gateway to actually come up under supervision AND write + # its own gateway_state=running (we do NOT stamp it ourselves). + deadline = time.monotonic() + 20.0 + while time.monotonic() < deadline: + r = _sh(container, "/command/s6-svstat /run/service/gateway-live") + if r.returncode == 0 and "up " in r.stdout: + break + time.sleep(0.5) + assert "up " in r.stdout, f"gateway never came up pre-restart: {r.stdout!r}" + + # Confirm the gateway persisted its own 'running' state (sanity: we're + # testing the real write path, not a stamped fixture). + deadline = time.monotonic() + 15.0 + state = "" + while time.monotonic() < deadline: + r = _sh( + container, + "cat /opt/data/profiles/live/gateway_state.json 2>/dev/null", + ) + if r.returncode == 0 and '"gateway_state"' in r.stdout: + state = r.stdout + break + time.sleep(0.5) + assert '"running"' in state, ( + f"gateway never persisted running state pre-restart: {state!r}" + ) + + # Real restart — Docker sends SIGTERM to PID 1; s6 propagates it to the + # supervised gateway. No planned-stop marker is written (this is not an + # operator `hermes gateway stop`), so the shutdown is signal-initiated. + _docker("restart", container, timeout=60).check_returncode() + + log = _wait_for_reconcile_log_mention(container, "live", deadline_s=30.0) + assert "profile=live" in log, ( + f"reconciler never logged live after restart: {log!r}" + ) + # The crux: the reconciler must AUTO-START it, not register it down. + assert "action=started" in log, ( + f"gateway did NOT auto-start after a real restart (issue #42675 " + f"regression): {log!r}" + ) + + # Slot recreated, and NO down marker (we expect auto-start). + assert _wait_for_path( + container, "/run/service/gateway-live", kind="d", deadline_s=10.0, + ), "slot not recreated after restart" + r = _sh(container, "test -f /run/service/gateway-live/down") + assert r.returncode != 0, ( + "down marker present despite a live gateway being restarted — " + "the signal-initiated shutdown wrongly persisted 'stopped' (#42675)" + ) diff --git a/tests/gateway/restart_test_helpers.py b/tests/gateway/restart_test_helpers.py index 01be2b4cc39..77c56ec40eb 100644 --- a/tests/gateway/restart_test_helpers.py +++ b/tests/gateway/restart_test_helpers.py @@ -66,6 +66,7 @@ def make_restart_runner( runner._background_tasks = set() runner._draining = False runner._restart_requested = False + runner._signal_initiated_shutdown = False runner._restart_task_started = False runner._restart_detached = False runner._restart_via_service = False diff --git a/tests/gateway/test_gateway_shutdown.py b/tests/gateway/test_gateway_shutdown.py index eae7d0377a3..25f9c123557 100644 --- a/tests/gateway/test_gateway_shutdown.py +++ b/tests/gateway/test_gateway_shutdown.py @@ -358,3 +358,90 @@ async def test_gateway_stop_kills_tool_subprocesses_on_graceful_path(monkeypatch # Only the final catch-all fires on the graceful path. assert kill_count == 1 + + +# --------------------------------------------------------------------------- +# gateway_state persistence on shutdown (issue #42675) +# +# On Docker/s6, container_boot.py only auto-starts gateways whose last +# persisted gateway_state was "running". An unexpected external signal +# (the SIGTERM s6/Docker sends on `docker compose up --force-recreate`, +# OOM, bare kill) must NOT persist "stopped" — otherwise the gateway +# stays down after every container restart. An operator-initiated stop +# writes a planned-stop marker first, so it is NOT signal-initiated and +# DOES persist "stopped", respecting the explicit intent. +# --------------------------------------------------------------------------- + + +def _persisted_states(runner) -> list: + """All gateway_state values passed to _update_runtime_status, in order.""" + states = [] + for call in runner._update_runtime_status.call_args_list: + args, kwargs = call + state = kwargs.get("gateway_state", args[0] if args else None) + states.append(state) + return states + + +def _stopped_state_persisted(runner) -> bool: + """True iff _update_runtime_status was called with gateway_state='stopped'.""" + return "stopped" in _persisted_states(runner) + + +@pytest.mark.asyncio +async def test_signal_initiated_shutdown_persists_running_not_stopped(tmp_path, monkeypatch): + """Unexpected SIGTERM (container restart / OOM / kill) must persist + gateway_state=running — NOT stopped, and NOT leave the mid-shutdown + 'draining' marker — so container_boot auto-starts on next boot (#42675).""" + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + runner, adapter = make_restart_runner() + adapter.disconnect = AsyncMock() + runner._signal_initiated_shutdown = True # set by handler on unmarked signal + + with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"): + await runner.stop() + + assert not _stopped_state_persisted(runner), ( + "signal-initiated shutdown must NOT persist gateway_state=stopped" + ) + # The FINAL terminal write must be 'running' so container_boot's + # _AUTOSTART_STATES check passes (it only auto-starts 'running'). + assert _persisted_states(runner)[-1] == "running", ( + f"final state must be 'running', got: {_persisted_states(runner)}" + ) + + +@pytest.mark.asyncio +async def test_operator_initiated_stop_persists_stopped(tmp_path, monkeypatch): + """A planned stop (marker written → not signal-initiated) must persist + gateway_state=stopped so an explicit `hermes gateway stop` stays down.""" + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + runner, adapter = make_restart_runner() + adapter.disconnect = AsyncMock() + runner._signal_initiated_shutdown = False # planned stop classification + + with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"): + await runner.stop() + + assert _stopped_state_persisted(runner), ( + "operator-initiated stop must persist gateway_state=stopped" + ) + + +@pytest.mark.asyncio +async def test_signal_initiated_restart_still_persists_stopped(tmp_path, monkeypatch): + """A restart is not a 'stay down' — it persists normally (the new + process/container brings the gateway back up itself). The suppression + only applies to a terminal signal-initiated stop, not a restart.""" + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + runner, adapter = make_restart_runner() + adapter.disconnect = AsyncMock() + runner._signal_initiated_shutdown = True + runner._launch_systemd_restart_shortcut = MagicMock() + + with patch("gateway.status.remove_pid_file"), patch("gateway.status.write_runtime_status"): + await runner.stop(restart=True, service_restart=True) + + assert _stopped_state_persisted(runner), ( + "a restart must persist gateway_state=stopped via the normal path" + ) diff --git a/tests/hermes_cli/test_service_manager.py b/tests/hermes_cli/test_service_manager.py index 8c37c3878bc..e351ed284e4 100644 --- a/tests/hermes_cli/test_service_manager.py +++ b/tests/hermes_cli/test_service_manager.py @@ -799,3 +799,111 @@ def test_s6_is_running_parses_svstat( return _sp.CompletedProcess(cmd, 0, "", "") monkeypatch.setattr("subprocess.run", _svstat_down) assert S6ServiceManager(scandir=s6_scandir).is_running("gateway-coder") is False + + +# --------------------------------------------------------------------------- +# S6 stop writes a planned-stop marker (issue #42675) +# +# `hermes gateway stop` inside a container dispatches through +# S6ServiceManager.stop() -> `s6-svc -d`, which SIGTERMs the gateway. +# That SIGTERM is indistinguishable from the one s6/Docker sends on a +# container restart unless we mark the intentional stop first. Without +# the marker, the gateway's shutdown handler can't tell an operator +# stop from a restart kill, and the gateway_state=stopped suppression +# (run.py) would never engage for explicit stops. +# --------------------------------------------------------------------------- + + +def test_s6_supervised_pid_parses_svstat(monkeypatch, s6_scandir): + """_supervised_pid extracts the PID from `up (pid NNNN) ...`.""" + import subprocess as _sp + + def _fake(cmd, **kw): + return _sp.CompletedProcess(cmd, 0, "up (pid 4242) 17 seconds\n", "") + + monkeypatch.setattr("subprocess.run", _fake) + mgr = S6ServiceManager(scandir=s6_scandir) + assert mgr._supervised_pid("gateway-coder") == 4242 + + +def test_s6_supervised_pid_none_when_down(monkeypatch, s6_scandir): + """A down service (`s6-svstat` rc!=0 or no pid) yields None.""" + import subprocess as _sp + + def _fake(cmd, **kw): + return _sp.CompletedProcess(cmd, 0, "down (exitcode 0) 3 seconds\n", "") + + monkeypatch.setattr("subprocess.run", _fake) + mgr = S6ServiceManager(scandir=s6_scandir) + assert mgr._supervised_pid("gateway-coder") is None + + +def test_s6_stop_writes_planned_stop_marker(monkeypatch, s6_scandir): + """stop() must mark the supervised PID before `s6-svc -d` so the + gateway recognises the SIGTERM as an intentional stop (#42675).""" + import subprocess as _sp + + svc_dir = s6_scandir / "gateway-coder" + svc_dir.mkdir() # so _run_svc doesn't raise GatewayNotRegisteredError + + svc_calls: list[list[str]] = [] + + def _fake(cmd, **kw): + seq = list(cmd) if isinstance(cmd, (list, tuple)) else [str(cmd)] + if seq and seq[0].startswith("/command/"): + seq[0] = seq[0][len("/command/"):] + svc_calls.append(seq) + if seq and seq[0] == "s6-svstat": + return _sp.CompletedProcess(cmd, 0, "up (pid 9090) 5 seconds\n", "") + return _sp.CompletedProcess(cmd, 0, "", "") + + monkeypatch.setattr("subprocess.run", _fake) + + marked: list[int] = [] + monkeypatch.setattr( + "gateway.status.write_planned_stop_marker", + lambda pid: marked.append(pid) or True, + ) + + mgr = S6ServiceManager(scandir=s6_scandir) + mgr.stop("gateway-coder") + + assert marked == [9090], ( + f"stop() must write the planned-stop marker for the supervised PID; " + f"marked={marked}" + ) + # And it must still issue the down command. + assert any( + cmd[0] == "s6-svc" and "-d" in cmd for cmd in svc_calls + ), f"s6-svc -d not invoked; saw: {svc_calls}" + + +def test_s6_stop_tolerates_marker_write_failure(monkeypatch, s6_scandir): + """A marker-write failure must not block the stop (best-effort).""" + import subprocess as _sp + + svc_dir = s6_scandir / "gateway-coder" + svc_dir.mkdir() + + svc_calls: list[list[str]] = [] + + def _fake(cmd, **kw): + seq = list(cmd) if isinstance(cmd, (list, tuple)) else [str(cmd)] + if seq and seq[0].startswith("/command/"): + seq[0] = seq[0][len("/command/"):] + svc_calls.append(seq) + if seq and seq[0] == "s6-svstat": + return _sp.CompletedProcess(cmd, 0, "up (pid 9090) 5 seconds\n", "") + return _sp.CompletedProcess(cmd, 0, "", "") + + monkeypatch.setattr("subprocess.run", _fake) + + def _boom(pid): + raise OSError("disk full") + + monkeypatch.setattr("gateway.status.write_planned_stop_marker", _boom) + + mgr = S6ServiceManager(scandir=s6_scandir) + mgr.stop("gateway-coder") # must not raise + + assert any(cmd[0] == "s6-svc" and "-d" in cmd for cmd in svc_calls) diff --git a/website/docs/user-guide/docker.md b/website/docs/user-guide/docker.md index cebfbf397f5..f442a204265 100644 --- a/website/docs/user-guide/docker.md +++ b/website/docs/user-guide/docker.md @@ -183,7 +183,7 @@ Each profile created with `hermes profile create ` gets: - A dedicated s6 service slot at `/run/service/gateway-/`, registered dynamically by the runtime — no container rebuild required. - Auto-restart on crash, backoff-managed by `s6-supervise`. - Per-profile rotated logs at `${HERMES_HOME}/logs/gateways//current` (10 archives × 1 MB each). -- State persistence across container restarts: the boot-time reconciler reads `gateway_state.json` from each profile directory and brings the slot back up only for profiles whose last recorded state was `running`. Stopped profiles stay stopped. +- State persistence across container restarts: the boot-time reconciler reads `gateway_state.json` from each profile directory and brings the slot back up only for profiles whose last recorded state was `running`. Only a gateway you explicitly stopped (`hermes gateway stop`) stays down across a restart — a container restart, image upgrade, or unexpected exit leaves the recorded state as `running`, so the gateway auto-starts on the next boot. The lifecycle commands you'd run on the host work the same way from inside the container: @@ -473,7 +473,7 @@ Each profile created with `hermes profile create ` automatically gets an s - Gateway crashes are auto-restarted by `s6-supervise` after a ~1s backoff. - Dashboard, when enabled with `HERMES_DASHBOARD=1`, is supervised on the same supervision tree and gets the same auto-restart treatment. -- `docker restart` preserves running gateways: the cont-init reconciler reads `$HERMES_HOME/profiles//gateway_state.json` and brings the slot back up if the last recorded state was `running`. Stopped gateways stay stopped. +- `docker restart`, image upgrades (`docker compose up -d --force-recreate`), and unexpected exits preserve running gateways: the cont-init reconciler reads `$HERMES_HOME/profiles//gateway_state.json` and brings the slot back up if the last recorded state was `running`. Only an explicit `hermes gateway stop` records `stopped` and keeps the gateway down across the restart; the container/s6 SIGTERM sent on a restart or upgrade is treated as "still running" and auto-starts. - Per-profile gateway logs persist under `$HERMES_HOME/logs/gateways//current` (rotated by `s6-log`), and the reconciler's actions are appended to `$HERMES_HOME/logs/container-boot.log` per boot. See [Where the logs go](#where-the-logs-go) for the full routing map. `hermes status` inside the container reports `Manager: s6 (container supervisor)`. Use `/command/s6-svstat /run/service/gateway-` for the raw supervisor view (note `/command/` is on PATH for supervision-tree processes only; pass the absolute path when calling from `docker exec`).