From fa32af886fa89acec2584ea6ad2b9ca173b82e4b Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Wed, 10 Jun 2026 01:14:16 -0700 Subject: [PATCH] fix: dedupe concurrent gateway restarts + surface restart outcome in onboarding UI Follow-ups to the salvaged Telegram QR onboarding auto-restart: - _spawn_gateway_restart() reuses a live in-flight 'hermes gateway restart' child instead of spawning a second racing one (stale cached frontend + new backend both requesting a restart, or restart-button double-click). Both /api/gateway/restart and the onboarding apply path go through it. - ChannelsPage polls /api/actions/gateway-restart/status after a server-initiated restart and surfaces a non-zero exit (e.g. systemd linger missing) via the manual-restart banner, since restart_started only means the child spawned. - Test for the reuse path + _ACTION_PROCS isolation in existing tests. --- hermes_cli/web_server.py | 26 +++++++++++- tests/hermes_cli/test_web_server.py | 61 +++++++++++++++++++++++++++++ web/src/pages/ChannelsPage.tsx | 27 +++++++++++++ 3 files changed, 112 insertions(+), 2 deletions(-) diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index fe873fd622b..45a7ea51634 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -1372,11 +1372,28 @@ def _tail_lines(path: Path, n: int) -> List[str]: return lines[-n:] if n > 0 else lines +def _spawn_gateway_restart() -> Tuple[subprocess.Popen, bool]: + """Spawn ``hermes gateway restart``, reusing an in-flight restart. + + Multiple dashboard paths can request a restart in quick succession + (restart button double-click, or a stale cached frontend firing its own + restart after the server already auto-restarted post-onboarding). Two + concurrent ``hermes gateway restart`` children race each other on the + manual kill-and-start path, so reuse the live one instead. + + Returns ``(proc, reused)``. + """ + existing = _ACTION_PROCS.get("gateway-restart") + if existing is not None and existing.poll() is None: + return existing, True + return _spawn_hermes_action(["gateway", "restart"], "gateway-restart"), False + + @app.post("/api/gateway/restart") async def restart_gateway(): """Kick off a ``hermes gateway restart`` in the background.""" try: - proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart") + proc, _reused = _spawn_gateway_restart() except Exception as exc: _log.exception("Failed to spawn gateway restart") raise HTTPException(status_code=500, detail=f"Failed to restart gateway: {exc}") @@ -3757,13 +3774,18 @@ def _restart_gateway_after_telegram_onboarding() -> dict[str, Any]: restart failures so the UI can fall back to the existing manual banner. """ try: - proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart") + proc, reused = _spawn_gateway_restart() except Exception as exc: _log.exception("Failed to auto-restart gateway after Telegram onboarding") return { "restart_started": False, "restart_error": str(exc), } + if reused: + _log.info( + "Telegram onboarding: reusing in-flight gateway restart (pid %s)", + proc.pid, + ) return { "restart_started": True, "restart_action": "gateway-restart", diff --git a/tests/hermes_cli/test_web_server.py b/tests/hermes_cli/test_web_server.py index e0925439bb5..6a0a1b5b0a2 100644 --- a/tests/hermes_cli/test_web_server.py +++ b/tests/hermes_cli/test_web_server.py @@ -1399,6 +1399,7 @@ class TestWebServerEndpoints: } monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request) + ws._ACTION_PROCS.pop("gateway-restart", None) restart_calls = [] class FakeRestartProc: @@ -1471,6 +1472,7 @@ class TestWebServerEndpoints: } monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request) + ws._ACTION_PROCS.pop("gateway-restart", None) def fail_spawn_action(subcommand, name): assert subcommand == ["gateway", "restart"] @@ -1502,6 +1504,65 @@ class TestWebServerEndpoints: assert env["TELEGRAM_ALLOWED_USERS"] == "123456789" assert load_config()["platforms"]["telegram"]["enabled"] is True + def test_telegram_onboarding_apply_reuses_inflight_gateway_restart( + self, monkeypatch + ): + """A live in-flight gateway restart is reused instead of spawning a + second racing ``hermes gateway restart`` child (e.g. when a stale + cached frontend also fires its own restart call).""" + import hermes_cli.web_server as ws + + with ws._telegram_onboarding_lock: + ws._telegram_onboarding_pairings.clear() + + def fake_request(method, path, *, body=None, bearer_token=None): + if method == "POST": + return { + "pairing_id": "pair-reuse", + "poll_token": "poll-secret", + "suggested_username": "hermes_pair_reuse_bot", + "deep_link": "https://t.me/newbot/HermesSetupBot/hermes_pair_reuse_bot", + "qr_payload": "https://t.me/newbot/HermesSetupBot/hermes_pair_reuse_bot", + "expires_at": "2027-05-18T00:00:00.000Z", + } + return { + "status": "ready", + "bot_username": "hermes_pair_reuse_bot", + "owner_user_id": 123456789, + "token": "123456:SECRET", + } + + monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request) + + class FakeRunningProc: + pid = 5151 + + def poll(self): + return None # still running + + monkeypatch.setitem(ws._ACTION_PROCS, "gateway-restart", FakeRunningProc()) + + def fail_spawn_action(subcommand, name): + raise AssertionError("must not spawn a second concurrent restart") + + monkeypatch.setattr(ws, "_spawn_hermes_action", fail_spawn_action) + + start = self.client.post("/api/messaging/telegram/onboarding/start", json={}) + assert start.status_code == 200 + ready = self.client.get("/api/messaging/telegram/onboarding/pair-reuse") + assert ready.status_code == 200 + + applied = self.client.post( + "/api/messaging/telegram/onboarding/pair-reuse/apply", + json={"allowed_user_ids": ["123456789"]}, + ) + + assert applied.status_code == 200 + applied_data = applied.json() + assert applied_data["needs_restart"] is False + assert applied_data["restart_started"] is True + assert applied_data["restart_pid"] == 5151 + def test_telegram_onboarding_apply_requires_ready_pairing(self, monkeypatch): import hermes_cli.web_server as ws diff --git a/web/src/pages/ChannelsPage.tsx b/web/src/pages/ChannelsPage.tsx index 07ad159ceee..ea5e81b5afb 100644 --- a/web/src/pages/ChannelsPage.tsx +++ b/web/src/pages/ChannelsPage.tsx @@ -599,6 +599,32 @@ function TelegramOnboardingPanel({ setNewAllowedId(""); }; + // restart_started only means the `hermes gateway restart` child spawned — + // not that the restart will succeed (e.g. systemd linger missing, service + // manager failure). Poll the action status briefly and surface a non-zero + // exit via the manual-restart banner. Note: in no-service installs the + // child becomes the foreground gateway and never exits, so "still running + // when the window closes" counts as success. + const watchRestartOutcome = async () => { + for (let i = 0; i < 20; i++) { + await new Promise((resolve) => setTimeout(resolve, 1500)); + try { + const st = await api.getActionStatus("gateway-restart", 5); + if (st.running) continue; + if (st.exit_code !== 0 && st.exit_code !== null) { + onRestartNeeded(); + showToast( + `Gateway restart failed (exit ${st.exit_code}) — restart manually`, + "error", + ); + } + return; + } catch { + // transient fetch error; keep polling + } + } + }; + const apply = async () => { if (!setup) return; if (allowedIds.length === 0) { @@ -616,6 +642,7 @@ function TelegramOnboardingPanel({ showToast("Telegram saved; gateway restarting…", "success"); setRestartNeeded(false); setTimeout(() => void onChanged(), 4000); + void watchRestartOutcome(); } else if (result.restart_started === undefined && result.needs_restart) { try { await api.restartGateway();