fix(container-boot): autostart a gateway stranded in 'draining' state

A gateway hard-killed while draining (a container/VM recreate SIGTERMs it
before _stop_impl reaches its terminal-state persist) leaves
gateway_state.json frozen at 'draining'. With no explicit desired_state to
fall back to, container_boot read that transient value literally, found it
not in _AUTOSTART_STATES, and left the gateway DOWN on every subsequent
boot — dashboard up, messaging silently dark. Observed on a relay-opted-in
staging instance (2026-06): the s6 gateway-default slot kept its 'down'
marker across recreates and the gateway never came back.

'draining' is a transient sub-state of RUNNING (written by the drain
watcher / scale-to-zero go-dormant path), never an operator stop and never
a failed boot. Normalise it to 'running' in the gateway_state fallback so a
stranded drain marker reads as the run-intent it represents. This extends
gateway/run.py's #42675 handling (persist 'running' on an unexpected signal)
to the case where the gateway died before persisting anything at all.

'starting'/'startup_failed' are deliberately NOT normalised — those mean a
mid-boot death and must stay down to avoid the crash-loop the down-marker
guard prevents. An explicit desired_state still wins verbatim, so an
operator stop survives a transient 'draining' runtime value.

Tests: draining named-profile + default-root autostart (both fail without
the fix), plus a guard that an explicit desired_state=stopped still blocks a
draining runtime.
This commit is contained in:
Ben 2026-06-29 15:46:54 +10:00 committed by Teknium
parent d4c14011eb
commit d3f2931b8c
2 changed files with 102 additions and 1 deletions

View file

@ -38,6 +38,29 @@ log = logging.getLogger(__name__)
# durable start/stop intent across pod/container recreation.
_AUTOSTART_STATES = frozenset({"running"})
# Transient runtime sub-states of a RUNNING gateway. A gateway only ever
# reaches these while it is up and serving — `draining` is written by the
# drain watcher / scale-to-zero go-dormant path when an in-flight quiesce
# begins (gateway/run.py). It is NOT an operator stop and NOT a failed boot.
#
# When a gateway is hard-killed *while draining* (a container/VM recreate
# SIGTERMs it before `_stop_impl` reaches its terminal-state persist), the
# last value left in gateway_state.json is `draining`. With no explicit
# `desired_state` to fall back to, treating that literal value as the
# autostart intent would leave the gateway DOWN on every subsequent boot —
# the gateway never comes back, the dashboard is up but messaging stays dark
# (observed on a relay-opted-in staging instance, 2026-06). Map these
# transient sub-states to `running` so a stranded drain marker reads as the
# run-intent it actually represents. This mirrors gateway/run.py's #42675
# handling, which persists `running` (not the mid-shutdown `draining`) when an
# unexpected signal tears the gateway down — extended here to the case where
# the gateway died before it could persist anything at all.
#
# `starting` / `startup_failed` are deliberately NOT included: those mean the
# gateway died mid-boot or failed to come up, so auto-restarting them would
# reintroduce the crash-loop the down-marker guard exists to prevent.
_TRANSIENT_RUNNING_STATES = frozenset({"draining"})
# Stale runtime files we sweep before recreating service slots. These
# all hold container-namespaced state (PIDs, process tables) that's
# garbage post-restart — a numerically-equal PID in the new container
@ -324,6 +347,15 @@ def _read_desired_state(profile_dir: Path) -> str | None:
that as a compatibility fallback so existing running/stopped profiles
preserve their behavior until the next explicit start/stop.
When falling back to ``gateway_state`` (no explicit ``desired_state``),
a transient running sub-state (``draining``) is normalised to ``running``
see ``_TRANSIENT_RUNNING_STATES``. A gateway hard-killed mid-drain
leaves ``draining`` as its last persisted value; without this it would be
treated as a non-autostart state and the gateway would stay DOWN forever.
An explicit ``desired_state`` is always honoured verbatim (it is the
operator's durable intent), so this normalisation only affects the
legacy/transient fallback path.
Missing or unparseable files count as "no desired state" so we don't
bork the whole reconciliation on a corrupt file.
"""
@ -335,7 +367,10 @@ def _read_desired_state(profile_dir: Path) -> str | None:
desired_state = data.get("desired_state")
if desired_state is not None:
return desired_state
return data.get("gateway_state")
gateway_state = data.get("gateway_state")
if gateway_state in _TRANSIENT_RUNNING_STATES:
return "running"
return gateway_state
except (OSError, json.JSONDecodeError):
log.warning(
"could not read %s; treating as no prior state", state_file,

View file

@ -229,6 +229,72 @@ def test_starting_state_does_not_autostart(tmp_path: Path) -> None:
assert named[0].action == "registered"
def test_draining_runtime_state_autostarts(tmp_path: Path) -> None:
"""A gateway hard-killed mid-drain leaves `gateway_state=draining` as its
last persisted value (the recreate SIGTERMs it before `_stop_impl` can
write a terminal `stopped`/`running`). `draining` is a transient sub-state
of RUNNING, not an operator stop, so with no explicit `desired_state` it
must normalise to running-intent and auto-start otherwise the gateway
stays DOWN forever and messaging silently goes dark (the relay-opted-in
staging wedge, 2026-06)."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(tmp_path, "drained", state="draining")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert _named_actions(actions) == [ReconcileAction(
profile="drained", prior_state="running", action="started",
)]
# Autostart means NO down-marker — the gateway comes back up.
assert not (scandir / "gateway-drained" / "down").exists()
def test_draining_default_root_autostarts(tmp_path: Path) -> None:
"""The hosted-agent path: the default (root) profile, not a named one.
A managed Fly instance runs the root profile; a stranded `draining` there
is exactly what wedged the relay-opted-in staging instance. Mirror the
named-profile case for the default slot."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_seed_default_root(tmp_path, state="draining")
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
default_action = next(a for a in actions if a.profile == "default")
assert default_action.prior_state == "running"
assert default_action.action == "started"
assert not (scandir / "gateway-default" / "down").exists()
def test_desired_state_stopped_overrides_draining_runtime(tmp_path: Path) -> None:
"""An explicit operator stop must survive even when the transient runtime
state is `draining`. The `desired_state` is the durable intent and is
honoured verbatim the drainingrunning normalisation only applies to the
legacy/transient `gateway_state` fallback, never over an explicit
`desired_state`."""
scandir = tmp_path / "run-service"; scandir.mkdir()
_make_profile(
tmp_path,
"stopped-while-draining",
state="draining",
desired_state="stopped",
)
actions = reconcile_profile_gateways(
hermes_home=tmp_path, scandir=scandir, dry_run=False,
)
assert _named_actions(actions) == [ReconcileAction(
profile="stopped-while-draining",
prior_state="stopped",
action="registered",
)]
assert (scandir / "gateway-stopped-while-draining" / "down").exists()
def test_stale_runtime_files_are_removed(tmp_path: Path) -> None:
scandir = tmp_path / "run-service"; scandir.mkdir()
profile = _make_profile(tmp_path, "coder", state="running", with_pid=True)