mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(container-boot): also autostart a gateway stranded in 'degraded'
degraded is the same wedge class as draining: the gateway came up with some platforms queued for retry, fell through to the running state (gateway/run.py #5196), and is serving. A hard-kill there strands gateway_state=degraded, which (like draining) is not in _AUTOSTART_STATES and is not an operator stop or a failed boot — so it would stay DOWN forever on every recreate. Add degraded to _TRANSIENT_RUNNING_STATES so the fallback path normalises it to running-intent too.
This commit is contained in:
parent
d3f2931b8c
commit
463b1dfa9c
2 changed files with 44 additions and 16 deletions
|
|
@ -39,27 +39,34 @@ log = logging.getLogger(__name__)
|
|||
_AUTOSTART_STATES = frozenset({"running"})
|
||||
|
||||
# Transient runtime sub-states of a RUNNING gateway. A gateway only ever
|
||||
# reaches these while it is up and serving — `draining` is written by the
|
||||
# drain watcher / scale-to-zero go-dormant path when an in-flight quiesce
|
||||
# begins (gateway/run.py). It is NOT an operator stop and NOT a failed boot.
|
||||
# reaches these while it is up and serving, so they are NOT an operator stop
|
||||
# and NOT a failed boot:
|
||||
# - `draining` — written by the drain watcher / scale-to-zero go-dormant
|
||||
# path when an in-flight quiesce begins (gateway/run.py).
|
||||
# - `degraded` — written when the gateway comes up with some platforms
|
||||
# queued for retry, then "falls through to the normal
|
||||
# running state" (gateway/run.py #5196): the process is up,
|
||||
# serving cron + whatever platforms connected, and the
|
||||
# reconnect watcher takes the rest from there.
|
||||
#
|
||||
# When a gateway is hard-killed *while draining* (a container/VM recreate
|
||||
# SIGTERMs it before `_stop_impl` reaches its terminal-state persist), the
|
||||
# last value left in gateway_state.json is `draining`. With no explicit
|
||||
# `desired_state` to fall back to, treating that literal value as the
|
||||
# autostart intent would leave the gateway DOWN on every subsequent boot —
|
||||
# the gateway never comes back, the dashboard is up but messaging stays dark
|
||||
# (observed on a relay-opted-in staging instance, 2026-06). Map these
|
||||
# transient sub-states to `running` so a stranded drain marker reads as the
|
||||
# run-intent it actually represents. This mirrors gateway/run.py's #42675
|
||||
# handling, which persists `running` (not the mid-shutdown `draining`) when an
|
||||
# unexpected signal tears the gateway down — extended here to the case where
|
||||
# the gateway died before it could persist anything at all.
|
||||
# When a gateway is hard-killed *while in one of these states* (a container/VM
|
||||
# recreate SIGTERMs it before `_stop_impl` reaches its terminal-state persist),
|
||||
# the last value left in gateway_state.json is the transient sub-state. With no
|
||||
# explicit `desired_state` to fall back to, treating that literal value as the
|
||||
# autostart intent would leave the gateway DOWN on every subsequent boot — the
|
||||
# gateway never comes back, the dashboard is up but messaging stays dark
|
||||
# (observed on a relay-opted-in staging instance stranded at `draining`,
|
||||
# 2026-06; `degraded` is the same wedge class). Map these transient sub-states
|
||||
# to `running` so a stranded marker reads as the run-intent it actually
|
||||
# represents. This mirrors gateway/run.py's #42675 handling, which persists
|
||||
# `running` (not the mid-shutdown `draining`) when an unexpected signal tears
|
||||
# the gateway down — extended here to the case where the gateway died before it
|
||||
# could persist anything at all.
|
||||
#
|
||||
# `starting` / `startup_failed` are deliberately NOT included: those mean the
|
||||
# gateway died mid-boot or failed to come up, so auto-restarting them would
|
||||
# reintroduce the crash-loop the down-marker guard exists to prevent.
|
||||
_TRANSIENT_RUNNING_STATES = frozenset({"draining"})
|
||||
_TRANSIENT_RUNNING_STATES = frozenset({"draining", "degraded"})
|
||||
|
||||
# Stale runtime files we sweep before recreating service slots. These
|
||||
# all hold container-namespaced state (PIDs, process tables) that's
|
||||
|
|
|
|||
|
|
@ -251,6 +251,27 @@ def test_draining_runtime_state_autostarts(tmp_path: Path) -> None:
|
|||
assert not (scandir / "gateway-drained" / "down").exists()
|
||||
|
||||
|
||||
def test_degraded_runtime_state_autostarts(tmp_path: Path) -> None:
    """A stranded `degraded` marker must auto-start, just like `draining`.

    `degraded` belongs to the same wedge class as `draining`: the gateway
    came up with some platforms queued for retry, then fell through to the
    normal running state (gateway/run.py #5196) and is serving cron plus the
    connected platforms. A hard-kill at that point strands
    `gateway_state=degraded`, which is NOT an operator stop and NOT a failed
    boot. With no explicit `desired_state`, it must normalise to
    running-intent and auto-start — otherwise the gateway stays DOWN forever,
    exactly like the draining wedge.
    """
    service_dir = tmp_path / "run-service"
    service_dir.mkdir()
    _make_profile(tmp_path, "degraded-box", state="degraded")

    outcome = reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_dir,
        dry_run=False,
    )

    # The reported prior state is the normalised `running`, not the literal
    # `degraded` that was left on disk.
    expected = [
        ReconcileAction(
            profile="degraded-box",
            prior_state="running",
            action="started",
        ),
    ]
    assert _named_actions(outcome) == expected

    # No down-marker may be present for the service slot after reconcile.
    down_marker = service_dir / "gateway-degraded-box" / "down"
    assert not down_marker.exists()
|
||||
|
||||
|
||||
def test_draining_default_root_autostarts(tmp_path: Path) -> None:
|
||||
"""The hosted-agent path: the default (root) profile, not a named one.
|
||||
A managed Fly instance runs the root profile; a stranded `draining` there
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue