fix(gateway): cold-start installed Windows gateway after update when none was running (#50804)

The post-update gateway resume path (`_resume_windows_gateways_after_update`)
only relaunched gateways that were *running* when the update began — it
enumerates live PIDs in `_pause_windows_gateways_for_update` and respawns
exactly those. A gateway that had already died between updates (e.g. it was
launched attached to a terminal/TUI that later closed, taking the child with
it) was never brought back: the Startup-folder / Scheduled-Task autostart
entry only fires on the next login, not after an in-place update.

So a Desktop-GUI update (which runs `hermes update --yes --gateway`) on a box
whose gateway had quietly died would complete with no gateway running, and the
user had no indication anything should have come up.

Fix: when no gateway is running at pause time but an autostart entry is
installed (`gateway_windows.is_installed()` — an explicit "I want a gateway"
signal), return a `cold_start_if_installed` token. The resume step then does a
fresh detached spawn via `gateway_windows._spawn_detached()` — the same
windowless `pythonw` + `CREATE_BREAKAWAY_FROM_JOB` path `hermes gateway start`
uses. It re-checks liveness immediately before spawning so a concurrent start
(autostart entry firing) can't produce a duplicate.

Gateway-less users (no autostart entry) get nothing forced on them — the
pause step still returns None for them. POSIX is unaffected: enabled systemd
units already restart via `Restart=always`.

Windows-only; best-effort throughout (logs at debug and no-ops on any error).

Tests: pause returns the cold-start token only when installed, returns None
when not installed, resume cold-starts on the token, and resume skips the
cold-start when a gateway is already running.
This commit is contained in:
Teknium 2026-06-22 06:02:31 -07:00 committed by GitHub
parent da498ed99b
commit ef6492b648
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 187 additions and 0 deletions

View file

@ -8431,6 +8431,31 @@ def _pause_windows_gateways_for_update() -> dict | None:
logger.debug("Could not discover Windows gateway PIDs before update: %s", exc)
return None
if not running_pids:
# No gateway is running right now, but the user may have installed an
# autostart entry (Scheduled Task or Startup-folder login item) — that
# is an explicit "I want a gateway" signal. A gateway that died between
# updates (e.g. the spawning terminal/TUI closed, taking its child with
# it) would otherwise never come back: the autostart entry only fires on
# the next login, and the update flow's resume path only relaunched
# gateways that were running when the update began. Cold-start one after
# the update so an installed gateway is actually up post-update. Users
# who run gateway-less (no autostart entry) get nothing forced on them.
try:
from hermes_cli import gateway_windows
if gateway_windows.is_installed():
return {
"resume_needed": True,
"profiles": {},
"unmapped_pids": [],
"unmapped": [],
"cold_start_if_installed": True,
}
except Exception as exc:
logger.debug(
"Could not check Windows gateway autostart state before update: %s",
exc,
)
return None
profile_processes = {}
@ -8508,6 +8533,51 @@ def _pause_windows_gateways_for_update() -> dict | None:
}
def _cold_start_windows_gateway_after_update() -> None:
    """Spawn a fresh detached Windows gateway once an update has finished.

    Called by ``_resume_windows_gateways_after_update`` when the pause token
    carries ``cold_start_if_installed``: nothing was running when the update
    started, yet an autostart entry (Scheduled Task / Startup-folder login
    item) is installed, which signals that the user wants a gateway.

    The relaunch paths watch an old PID and respawn once it exits; this one,
    by contrast, spawns directly through ``gateway_windows._spawn_detached``,
    the same windowless ``pythonw`` + breakaway path that
    ``hermes gateway start`` uses.

    Best-effort: every failure is logged at debug level and turns the call
    into a no-op. Liveness is re-checked right before spawning so that a
    concurrent start (e.g. the autostart entry firing) does not yield a
    duplicate gateway.
    """
    if not _is_windows():
        return

    try:
        from hermes_cli import gateway_windows
        from hermes_cli.gateway import find_gateway_pids
    except Exception as exc:
        logger.debug("Could not load Windows gateway cold-start helpers: %s", exc)
        return

    # Something may have brought a gateway up between pause and resume (the
    # autostart entry, or a leftover process re-registering); never start a
    # second one on top of it.
    try:
        live_pids = list(find_gateway_pids(all_profiles=True))
    except Exception as exc:
        logger.debug("Could not re-check gateway liveness before cold-start: %s", exc)
        return
    if live_pids:
        return

    try:
        new_pid = gateway_windows._spawn_detached()
    except Exception as exc:
        logger.debug("Could not cold-start Windows gateway after update: %s", exc)
        return

    # A falsy PID means the spawn helper reported nothing to announce.
    if not new_pid:
        return
    print()
    print(f" ✓ Starting Windows gateway after update (PID {new_pid})")
def _resume_windows_gateways_after_update(token: dict | None) -> None:
"""Restart Windows profile gateways previously paused for update."""
if not token or not token.get("resume_needed"):
@ -8518,7 +8588,10 @@ def _resume_windows_gateways_after_update(token: dict | None) -> None:
profiles = token.get("profiles") or {}
unmapped = token.get("unmapped") or []
cold_start = bool(token.get("cold_start_if_installed"))
if not profiles and not any(u.get("argv") for u in unmapped):
if cold_start:
_cold_start_windows_gateway_after_update()
return
try:

View file

@ -597,6 +597,120 @@ def test_resume_windows_gateways_after_update_respawns_unmapped_by_cmdline(
assert "Restarting 1 unmapped Windows gateway process(es)" in out
@patch.object(cli_main, "_is_windows", return_value=True)
def test_pause_returns_cold_start_token_when_installed_but_none_running(
    _winp,
    monkeypatch,
):
    """Pause yields the cold-start token when installed but nothing runs.

    With no live gateway PIDs there is nothing for the resume path to
    relaunch, but an installed autostart entry is an explicit "I want a
    gateway" signal, so the pause step has to hand resume a token asking
    for a cold start.
    """
    import hermes_cli.gateway as gateway_mod
    from hermes_cli import gateway_windows

    def _no_pids(**_kwargs):
        return []

    def _installed():
        return True

    monkeypatch.setattr(gateway_mod, "find_gateway_pids", _no_pids)
    monkeypatch.setattr(gateway_windows, "is_installed", _installed)

    expected_token = {
        "resume_needed": True,
        "profiles": {},
        "unmapped_pids": [],
        "unmapped": [],
        "cold_start_if_installed": True,
    }
    token = cli_main._pause_windows_gateways_for_update()
    assert token == expected_token
@patch.object(cli_main, "_is_windows", return_value=True)
def test_pause_returns_none_when_nothing_running_and_not_installed(
    _winp,
    monkeypatch,
):
    """Pause yields no token for a gateway-less user.

    Nothing is running and no autostart entry is installed, so an update
    must not force a gateway onto a user who chose to run without one.
    """
    import hermes_cli.gateway as gateway_mod
    from hermes_cli import gateway_windows

    def _no_pids(**_kwargs):
        return []

    def _not_installed():
        return False

    monkeypatch.setattr(gateway_mod, "find_gateway_pids", _no_pids)
    monkeypatch.setattr(gateway_windows, "is_installed", _not_installed)

    token = cli_main._pause_windows_gateways_for_update()
    assert token is None
@patch.object(cli_main, "_is_windows", return_value=True)
def test_resume_cold_starts_gateway_when_token_requests_it(
    _winp,
    monkeypatch,
    capsys,
):
    """A cold-start token with nothing running triggers one detached spawn."""
    import hermes_cli.gateway as gateway_mod
    from hermes_cli import gateway_windows

    spawn_calls = []

    def _no_pids(**_kwargs):
        return []

    def _fake_spawn():
        # Record the call and hand back a recognisable PID.
        spawn_calls.append(True)
        return 4242

    monkeypatch.setattr(gateway_mod, "find_gateway_pids", _no_pids)
    monkeypatch.setattr(gateway_windows, "_spawn_detached", _fake_spawn)

    token = {
        "resume_needed": True,
        "profiles": {},
        "unmapped_pids": [],
        "unmapped": [],
        "cold_start_if_installed": True,
    }
    cli_main._resume_windows_gateways_after_update(token)

    assert token["resume_needed"] is False
    assert spawn_calls == [True]
    captured_out = capsys.readouterr().out
    assert "Starting Windows gateway after update (PID 4242)" in captured_out
@patch.object(cli_main, "_is_windows", return_value=True)
def test_resume_cold_start_skips_when_gateway_already_running(
    _winp,
    monkeypatch,
    capsys,
):
    """Cold-start is a no-op when a gateway is already up at resume time.

    If something (e.g. the autostart entry) started a gateway between pause
    and resume, resuming must not spawn a second one.
    """
    import hermes_cli.gateway as gateway_mod
    from hermes_cli import gateway_windows

    spawn_calls = []

    def _one_live_pid(**_kwargs):
        return [9001]

    def _fake_spawn():
        # Must never be reached in this scenario; recorded so the test can tell.
        spawn_calls.append(True)
        return 4242

    monkeypatch.setattr(gateway_mod, "find_gateway_pids", _one_live_pid)
    monkeypatch.setattr(gateway_windows, "_spawn_detached", _fake_spawn)

    token = {
        "resume_needed": True,
        "profiles": {},
        "unmapped_pids": [],
        "unmapped": [],
        "cold_start_if_installed": True,
    }
    cli_main._resume_windows_gateways_after_update(token)

    assert spawn_calls == []
    captured_out = capsys.readouterr().out
    assert "Starting Windows gateway after update" not in captured_out
# ---------------------------------------------------------------------------
# cmd_update integration — concurrent-instance gate
# ---------------------------------------------------------------------------