fix: dedupe concurrent gateway restarts + surface restart outcome in onboarding UI

Follow-ups to the salvaged Telegram QR onboarding auto-restart:

- _spawn_gateway_restart() reuses a live in-flight 'hermes gateway restart'
  child instead of spawning a second racing one (stale cached frontend +
  new backend both requesting a restart, or restart-button double-click).
  Both /api/gateway/restart and the onboarding apply path go through it.
- ChannelsPage polls /api/actions/gateway-restart/status after a
  server-initiated restart and surfaces a non-zero exit (e.g. systemd
  linger missing) via the manual-restart banner, since restart_started
  only means the child spawned.
- Add a test for the reuse path, and isolate _ACTION_PROCS in the existing tests.
This commit is contained in:
teknium1 2026-06-10 01:14:16 -07:00 committed by Teknium
parent 984e69ff62
commit fa32af886f
3 changed files with 112 additions and 2 deletions

View file

@ -1372,11 +1372,28 @@ def _tail_lines(path: Path, n: int) -> List[str]:
return lines[-n:] if n > 0 else lines
def _spawn_gateway_restart() -> Tuple[subprocess.Popen, bool]:
    """Start ``hermes gateway restart`` unless one is already running.

    Several dashboard paths may ask for a restart almost simultaneously:
    a double-click on the restart button, or a stale cached frontend that
    issues its own restart after the server already auto-restarted at the
    end of onboarding. Two ``hermes gateway restart`` children running at
    once race each other on the manual kill-and-start path, so a restart
    that is still in flight is handed back instead of spawning another.

    Returns:
        ``(proc, reused)`` -- ``proc`` is the restart child process and
        ``reused`` is ``True`` when it was an already-running child taken
        from ``_ACTION_PROCS`` rather than a newly spawned one.
    """
    action = "gateway-restart"
    in_flight = _ACTION_PROCS.get(action)

    # ``poll()`` yields None while the tracked child has not exited yet;
    # a missing entry or a finished child both mean we need a fresh spawn.
    still_running = in_flight is not None and in_flight.poll() is None
    if not still_running:
        fresh = _spawn_hermes_action(["gateway", "restart"], action)
        return fresh, False

    return in_flight, True
@app.post("/api/gateway/restart")
async def restart_gateway():
"""Kick off a ``hermes gateway restart`` in the background."""
try:
proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart")
proc, _reused = _spawn_gateway_restart()
except Exception as exc:
_log.exception("Failed to spawn gateway restart")
raise HTTPException(status_code=500, detail=f"Failed to restart gateway: {exc}")
@ -3757,13 +3774,18 @@ def _restart_gateway_after_telegram_onboarding() -> dict[str, Any]:
restart failures so the UI can fall back to the existing manual banner.
"""
try:
proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart")
proc, reused = _spawn_gateway_restart()
except Exception as exc:
_log.exception("Failed to auto-restart gateway after Telegram onboarding")
return {
"restart_started": False,
"restart_error": str(exc),
}
if reused:
_log.info(
"Telegram onboarding: reusing in-flight gateway restart (pid %s)",
proc.pid,
)
return {
"restart_started": True,
"restart_action": "gateway-restart",