diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py
index fc85529bbce..327e6a5ce7f 100644
--- a/hermes_cli/container_boot.py
+++ b/hermes_cli/container_boot.py
@@ -39,27 +39,34 @@ log = logging.getLogger(__name__)
 _AUTOSTART_STATES = frozenset({"running"})
 
 # Transient runtime sub-states of a RUNNING gateway. A gateway only ever
-# reaches these while it is up and serving — `draining` is written by the
-# drain watcher / scale-to-zero go-dormant path when an in-flight quiesce
-# begins (gateway/run.py). It is NOT an operator stop and NOT a failed boot.
+# reaches these while it is up and serving, so they are NOT an operator stop
+# and NOT a failed boot:
+#   - `draining` — written by the drain watcher / scale-to-zero go-dormant
+#                  path when an in-flight quiesce begins (gateway/run.py).
+#   - `degraded` — written when the gateway comes up with some platforms
+#                  queued for retry, then "falls through to the normal
+#                  running state" (gateway/run.py #5196): the process is up,
+#                  serving cron + whatever platforms connected, and the
+#                  reconnect watcher takes the rest from there.
 #
-# When a gateway is hard-killed *while draining* (a container/VM recreate
-# SIGTERMs it before `_stop_impl` reaches its terminal-state persist), the
-# last value left in gateway_state.json is `draining`. With no explicit
-# `desired_state` to fall back to, treating that literal value as the
-# autostart intent would leave the gateway DOWN on every subsequent boot —
-# the gateway never comes back, the dashboard is up but messaging stays dark
-# (observed on a relay-opted-in staging instance, 2026-06). Map these
-# transient sub-states to `running` so a stranded drain marker reads as the
-# run-intent it actually represents. This mirrors gateway/run.py's #42675
-# handling, which persists `running` (not the mid-shutdown `draining`) when an
-# unexpected signal tears the gateway down — extended here to the case where
-# the gateway died before it could persist anything at all.
+# When a gateway is hard-killed *while in one of these states* (a container/VM
+# recreate SIGTERMs it before `_stop_impl` reaches its terminal-state persist),
+# the last value left in gateway_state.json is the transient sub-state. With no
+# explicit `desired_state` to fall back to, treating that literal value as the
+# autostart intent would leave the gateway DOWN on every subsequent boot — the
+# gateway never comes back, the dashboard is up but messaging stays dark
+# (observed on a relay-opted-in staging instance stranded at `draining`,
+# 2026-06; `degraded` is the same wedge class). Map these transient sub-states
+# to `running` so a stranded marker reads as the run-intent it actually
+# represents. This mirrors gateway/run.py's #42675 handling, which persists
+# `running` (not the mid-shutdown `draining`) when an unexpected signal tears
+# the gateway down — extended here to the case where the gateway died before it
+# could persist anything at all.
 #
 # `starting` / `startup_failed` are deliberately NOT included: those mean the
 # gateway died mid-boot or failed to come up, so auto-restarting them would
 # reintroduce the crash-loop the down-marker guard exists to prevent.
-_TRANSIENT_RUNNING_STATES = frozenset({"draining"})
+_TRANSIENT_RUNNING_STATES = frozenset({"draining", "degraded"})
 
 # Stale runtime files we sweep before recreating service slots. These
 # all hold container-namespaced state (PIDs, process tables) that's
diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py
index 0eec6899f7a..2bfea1b0c5d 100644
--- a/tests/hermes_cli/test_container_boot.py
+++ b/tests/hermes_cli/test_container_boot.py
@@ -251,6 +251,27 @@ def test_draining_runtime_state_autostarts(tmp_path: Path) -> None:
     assert not (scandir / "gateway-drained" / "down").exists()
 
 
+def test_degraded_runtime_state_autostarts(tmp_path: Path) -> None:
+    """`degraded` is the same wedge class as `draining`: the gateway came up
+    with some platforms queued for retry, then fell through to the normal
+    running state (gateway/run.py #5196) and is serving cron + connected
+    platforms. A hard-kill there strands `gateway_state=degraded`, which is
+    NOT an operator stop and NOT a failed boot. With no explicit
+    `desired_state` it must normalise to running-intent and auto-start —
+    otherwise the gateway stays DOWN forever exactly like the draining wedge."""
+    scandir = tmp_path / "run-service"; scandir.mkdir()
+    _make_profile(tmp_path, "degraded-box", state="degraded")
+
+    actions = reconcile_profile_gateways(
+        hermes_home=tmp_path, scandir=scandir, dry_run=False,
+    )
+
+    assert _named_actions(actions) == [ReconcileAction(
+        profile="degraded-box", prior_state="running", action="started",
+    )]
+    assert not (scandir / "gateway-degraded-box" / "down").exists()
+
+
 def test_draining_default_root_autostarts(tmp_path: Path) -> None:
     """The hosted-agent path: the default (root) profile, not a named one. A
     managed Fly instance runs the root profile; a stranded `draining` there