mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(container-boot): autostart a gateway stranded in 'draining' state
A gateway hard-killed while draining (a container/VM recreate SIGTERMs it before _stop_impl reaches its terminal-state persist) leaves gateway_state.json frozen at 'draining'. With no explicit desired_state to fall back to, container_boot read that transient value literally, found it not in _AUTOSTART_STATES, and left the gateway DOWN on every subsequent boot — dashboard up, messaging silently dark. Observed on a relay-opted-in staging instance (2026-06): the s6 gateway-default slot kept its 'down' marker across recreates and the gateway never came back. 'draining' is a transient sub-state of RUNNING (written by the drain watcher / scale-to-zero go-dormant path), never an operator stop and never a failed boot. Normalise it to 'running' in the gateway_state fallback so a stranded drain marker reads as the run-intent it represents. This extends gateway/run.py's #42675 handling (persist 'running' on an unexpected signal) to the case where the gateway died before persisting anything at all. 'starting'/'startup_failed' are deliberately NOT normalised — those mean a mid-boot death and must stay down to avoid the crash-loop the down-marker guard prevents. An explicit desired_state still wins verbatim, so an operator stop survives a transient 'draining' runtime value. Tests: draining named-profile + default-root autostart (both fail without the fix), plus a guard that an explicit desired_state=stopped still blocks a draining runtime.
This commit is contained in:
parent
d4c14011eb
commit
d3f2931b8c
2 changed files with 102 additions and 1 deletions
|
|
@ -38,6 +38,29 @@ log = logging.getLogger(__name__)
|
|||
# durable start/stop intent across pod/container recreation.
|
||||
_AUTOSTART_STATES = frozenset({"running"})
|
||||
|
||||
# Transient runtime sub-states of a RUNNING gateway. A gateway only ever
|
||||
# reaches these while it is up and serving — `draining` is written by the
|
||||
# drain watcher / scale-to-zero go-dormant path when an in-flight quiesce
|
||||
# begins (gateway/run.py). It is NOT an operator stop and NOT a failed boot.
|
||||
#
|
||||
# When a gateway is hard-killed *while draining* (a container/VM recreate
|
||||
# SIGTERMs it before `_stop_impl` reaches its terminal-state persist), the
|
||||
# last value left in gateway_state.json is `draining`. With no explicit
|
||||
# `desired_state` to fall back to, treating that literal value as the
|
||||
# autostart intent would leave the gateway DOWN on every subsequent boot —
|
||||
# the gateway never comes back, the dashboard is up but messaging stays dark
|
||||
# (observed on a relay-opted-in staging instance, 2026-06). Map these
|
||||
# transient sub-states to `running` so a stranded drain marker reads as the
|
||||
# run-intent it actually represents. This mirrors gateway/run.py's #42675
|
||||
# handling, which persists `running` (not the mid-shutdown `draining`) when an
|
||||
# unexpected signal tears the gateway down — extended here to the case where
|
||||
# the gateway died before it could persist anything at all.
|
||||
#
|
||||
# `starting` / `startup_failed` are deliberately NOT included: those mean the
|
||||
# gateway died mid-boot or failed to come up, so auto-restarting them would
|
||||
# reintroduce the crash-loop the down-marker guard exists to prevent.
|
||||
_TRANSIENT_RUNNING_STATES = frozenset({"draining"})
|
||||
|
||||
# Stale runtime files we sweep before recreating service slots. These
|
||||
# all hold container-namespaced state (PIDs, process tables) that's
|
||||
# garbage post-restart — a numerically-equal PID in the new container
|
||||
|
|
@ -324,6 +347,15 @@ def _read_desired_state(profile_dir: Path) -> str | None:
|
|||
that as a compatibility fallback so existing running/stopped profiles
|
||||
preserve their behavior until the next explicit start/stop.
|
||||
|
||||
When falling back to ``gateway_state`` (no explicit ``desired_state``),
|
||||
a transient running sub-state (``draining``) is normalised to ``running``
|
||||
— see ``_TRANSIENT_RUNNING_STATES``. A gateway hard-killed mid-drain
|
||||
leaves ``draining`` as its last persisted value; without this it would be
|
||||
treated as a non-autostart state and the gateway would stay DOWN forever.
|
||||
An explicit ``desired_state`` is always honoured verbatim (it is the
|
||||
operator's durable intent), so this normalisation only affects the
|
||||
legacy/transient fallback path.
|
||||
|
||||
Missing or unparseable files count as "no desired state" so we don't
|
||||
bork the whole reconciliation on a corrupt file.
|
||||
"""
|
||||
|
|
@ -335,7 +367,10 @@ def _read_desired_state(profile_dir: Path) -> str | None:
|
|||
desired_state = data.get("desired_state")
|
||||
if desired_state is not None:
|
||||
return desired_state
|
||||
return data.get("gateway_state")
|
||||
gateway_state = data.get("gateway_state")
|
||||
if gateway_state in _TRANSIENT_RUNNING_STATES:
|
||||
return "running"
|
||||
return gateway_state
|
||||
except (OSError, json.JSONDecodeError):
|
||||
log.warning(
|
||||
"could not read %s; treating as no prior state", state_file,
|
||||
|
|
|
|||
|
|
@ -229,6 +229,72 @@ def test_starting_state_does_not_autostart(tmp_path: Path) -> None:
|
|||
assert named[0].action == "registered"
|
||||
|
||||
|
||||
def test_draining_runtime_state_autostarts(tmp_path: Path) -> None:
|
||||
"""A gateway hard-killed mid-drain leaves `gateway_state=draining` as its
|
||||
last persisted value (the recreate SIGTERMs it before `_stop_impl` can
|
||||
write a terminal `stopped`/`running`). `draining` is a transient sub-state
|
||||
of RUNNING, not an operator stop, so with no explicit `desired_state` it
|
||||
must normalise to running-intent and auto-start — otherwise the gateway
|
||||
stays DOWN forever and messaging silently goes dark (the relay-opted-in
|
||||
staging wedge, 2026-06)."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "drained", state="draining")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert _named_actions(actions) == [ReconcileAction(
|
||||
profile="drained", prior_state="running", action="started",
|
||||
)]
|
||||
# Autostart means NO down-marker — the gateway comes back up.
|
||||
assert not (scandir / "gateway-drained" / "down").exists()
|
||||
|
||||
|
||||
def test_draining_default_root_autostarts(tmp_path: Path) -> None:
|
||||
"""The hosted-agent path: the default (root) profile, not a named one.
|
||||
A managed Fly instance runs the root profile; a stranded `draining` there
|
||||
is exactly what wedged the relay-opted-in staging instance. Mirror the
|
||||
named-profile case for the default slot."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_seed_default_root(tmp_path, state="draining")
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
default_action = next(a for a in actions if a.profile == "default")
|
||||
assert default_action.prior_state == "running"
|
||||
assert default_action.action == "started"
|
||||
assert not (scandir / "gateway-default" / "down").exists()
|
||||
|
||||
|
||||
def test_desired_state_stopped_overrides_draining_runtime(tmp_path: Path) -> None:
|
||||
"""An explicit operator stop must survive even when the transient runtime
|
||||
state is `draining`. The `desired_state` is the durable intent and is
|
||||
honoured verbatim — the draining→running normalisation only applies to the
|
||||
legacy/transient `gateway_state` fallback, never over an explicit
|
||||
`desired_state`."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(
|
||||
tmp_path,
|
||||
"stopped-while-draining",
|
||||
state="draining",
|
||||
desired_state="stopped",
|
||||
)
|
||||
|
||||
actions = reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert _named_actions(actions) == [ReconcileAction(
|
||||
profile="stopped-while-draining",
|
||||
prior_state="stopped",
|
||||
action="registered",
|
||||
)]
|
||||
assert (scandir / "gateway-stopped-while-draining" / "down").exists()
|
||||
|
||||
|
||||
def test_stale_runtime_files_are_removed(tmp_path: Path) -> None:
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
profile = _make_profile(tmp_path, "coder", state="running", with_pid=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue