diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py index 3d99681a715..fc85529bbce 100644 --- a/hermes_cli/container_boot.py +++ b/hermes_cli/container_boot.py @@ -38,6 +38,29 @@ log = logging.getLogger(__name__) # durable start/stop intent across pod/container recreation. _AUTOSTART_STATES = frozenset({"running"}) +# Transient runtime sub-states of a RUNNING gateway. A gateway only ever +# reaches these while it is up and serving — `draining` is written by the +# drain watcher / scale-to-zero go-dormant path when an in-flight quiesce +# begins (gateway/run.py). It is NOT an operator stop and NOT a failed boot. +# +# When a gateway is hard-killed *while draining* (a container/VM recreate +# SIGTERMs it before `_stop_impl` reaches its terminal-state persist), the +# last value left in gateway_state.json is `draining`. With no explicit +# `desired_state` to fall back to, treating that literal value as the +# autostart intent would leave the gateway DOWN on every subsequent boot — +# the gateway never comes back, the dashboard is up but messaging stays dark +# (observed on a relay-opted-in staging instance, 2026-06). Map these +# transient sub-states to `running` so a stranded drain marker reads as the +# run-intent it actually represents. This mirrors gateway/run.py's #42675 +# handling, which persists `running` (not the mid-shutdown `draining`) when an +# unexpected signal tears the gateway down — extended here to the case where +# the gateway died before it could persist anything at all. +# +# `starting` / `startup_failed` are deliberately NOT included: those mean the +# gateway died mid-boot or failed to come up, so auto-restarting them would +# reintroduce the crash-loop the down-marker guard exists to prevent. +_TRANSIENT_RUNNING_STATES = frozenset({"draining"}) + # Stale runtime files we sweep before recreating service slots. These # all hold container-namespaced state (PIDs, process tables) that's # garbage post-restart — a numerically-equal PID in the new container @@ -324,6 +347,15 @@ def _read_desired_state(profile_dir: Path) -> str | None: that as a compatibility fallback so existing running/stopped profiles preserve their behavior until the next explicit start/stop. + When falling back to ``gateway_state`` (no explicit ``desired_state``), + a transient running sub-state (``draining``) is normalised to ``running`` + — see ``_TRANSIENT_RUNNING_STATES``. A gateway hard-killed mid-drain + leaves ``draining`` as its last persisted value; without this it would be + treated as a non-autostart state and the gateway would stay DOWN forever. + An explicit ``desired_state`` is always honoured verbatim (it is the + operator's durable intent), so this normalisation only affects the + legacy/transient fallback path. + Missing or unparseable files count as "no desired state" so we don't bork the whole reconciliation on a corrupt file. """ @@ -335,7 +367,10 @@ def _read_desired_state(profile_dir: Path) -> str | None: desired_state = data.get("desired_state") if desired_state is not None: return desired_state - return data.get("gateway_state") + gateway_state = data.get("gateway_state") + if gateway_state in _TRANSIENT_RUNNING_STATES: + return "running" + return gateway_state except (OSError, json.JSONDecodeError): log.warning( "could not read %s; treating as no prior state", state_file, diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py index 9d9981cf6de..0eec6899f7a 100644 --- a/tests/hermes_cli/test_container_boot.py +++ b/tests/hermes_cli/test_container_boot.py @@ -229,6 +229,72 @@ def test_starting_state_does_not_autostart(tmp_path: Path) -> None: assert named[0].action == "registered" +def test_draining_runtime_state_autostarts(tmp_path: Path) -> None: + """A gateway hard-killed mid-drain leaves `gateway_state=draining` as its + last persisted value (the recreate SIGTERMs it before `_stop_impl` can + write a terminal `stopped`/`running`). `draining` is a transient sub-state + of RUNNING, not an operator stop, so with no explicit `desired_state` it + must normalise to running-intent and auto-start — otherwise the gateway + stays DOWN forever and messaging silently goes dark (the relay-opted-in + staging wedge, 2026-06).""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "drained", state="draining") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert _named_actions(actions) == [ReconcileAction( + profile="drained", prior_state="running", action="started", + )] + # Autostart means NO down-marker — the gateway comes back up. + assert not (scandir / "gateway-drained" / "down").exists() + + +def test_draining_default_root_autostarts(tmp_path: Path) -> None: + """The hosted-agent path: the default (root) profile, not a named one. + A managed Fly instance runs the root profile; a stranded `draining` there + is exactly what wedged the relay-opted-in staging instance. Mirror the + named-profile case for the default slot.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _seed_default_root(tmp_path, state="draining") + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + default_action = next(a for a in actions if a.profile == "default") + assert default_action.prior_state == "running" + assert default_action.action == "started" + assert not (scandir / "gateway-default" / "down").exists() + + +def test_desired_state_stopped_overrides_draining_runtime(tmp_path: Path) -> None: + """An explicit operator stop must survive even when the transient runtime + state is `draining`. The `desired_state` is the durable intent and is + honoured verbatim — the draining→running normalisation only applies to the + legacy/transient `gateway_state` fallback, never over an explicit + `desired_state`.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile( + tmp_path, + "stopped-while-draining", + state="draining", + desired_state="stopped", + ) + + actions = reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert _named_actions(actions) == [ReconcileAction( + profile="stopped-while-draining", + prior_state="stopped", + action="registered", + )] + assert (scandir / "gateway-stopped-while-draining" / "down").exists() + + def test_stale_runtime_files_are_removed(tmp_path: Path) -> None: scandir = tmp_path / "run-service"; scandir.mkdir() profile = _make_profile(tmp_path, "coder", state="running", with_pid=True)