fix: dedupe concurrent gateway restarts + surface restart outcome in onboarding UI

Follow-ups to the salvaged Telegram QR onboarding auto-restart:

- _spawn_gateway_restart() reuses a live in-flight 'hermes gateway restart'
  child instead of spawning a second racing one (stale cached frontend +
  new backend both requesting a restart, or restart-button double-click).
  Both /api/gateway/restart and the onboarding apply path go through it.
- ChannelsPage polls /api/actions/gateway-restart/status after a
  server-initiated restart and surfaces a non-zero exit (e.g. systemd
  linger missing) via the manual-restart banner, since restart_started
  only means the child spawned.
- Test for the reuse path + _ACTION_PROCS isolation in existing tests.
This commit is contained in:
teknium1 2026-06-10 01:14:16 -07:00 committed by Teknium
parent 984e69ff62
commit fa32af886f
3 changed files with 112 additions and 2 deletions

View file

@ -1372,11 +1372,28 @@ def _tail_lines(path: Path, n: int) -> List[str]:
return lines[-n:] if n > 0 else lines
def _spawn_gateway_restart() -> Tuple[subprocess.Popen, bool]:
"""Spawn ``hermes gateway restart``, reusing an in-flight restart.
Multiple dashboard paths can request a restart in quick succession
(restart button double-click, or a stale cached frontend firing its own
restart after the server already auto-restarted post-onboarding). Two
concurrent ``hermes gateway restart`` children race each other on the
manual kill-and-start path, so reuse the live one instead.
Returns ``(proc, reused)``.
"""
existing = _ACTION_PROCS.get("gateway-restart")
if existing is not None and existing.poll() is None:
return existing, True
return _spawn_hermes_action(["gateway", "restart"], "gateway-restart"), False
@app.post("/api/gateway/restart")
async def restart_gateway():
"""Kick off a ``hermes gateway restart`` in the background."""
try:
proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart")
proc, _reused = _spawn_gateway_restart()
except Exception as exc:
_log.exception("Failed to spawn gateway restart")
raise HTTPException(status_code=500, detail=f"Failed to restart gateway: {exc}")
@ -3757,13 +3774,18 @@ def _restart_gateway_after_telegram_onboarding() -> dict[str, Any]:
restart failures so the UI can fall back to the existing manual banner.
"""
try:
proc = _spawn_hermes_action(["gateway", "restart"], "gateway-restart")
proc, reused = _spawn_gateway_restart()
except Exception as exc:
_log.exception("Failed to auto-restart gateway after Telegram onboarding")
return {
"restart_started": False,
"restart_error": str(exc),
}
if reused:
_log.info(
"Telegram onboarding: reusing in-flight gateway restart (pid %s)",
proc.pid,
)
return {
"restart_started": True,
"restart_action": "gateway-restart",

View file

@ -1399,6 +1399,7 @@ class TestWebServerEndpoints:
}
monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request)
ws._ACTION_PROCS.pop("gateway-restart", None)
restart_calls = []
class FakeRestartProc:
@ -1471,6 +1472,7 @@ class TestWebServerEndpoints:
}
monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request)
ws._ACTION_PROCS.pop("gateway-restart", None)
def fail_spawn_action(subcommand, name):
assert subcommand == ["gateway", "restart"]
@ -1502,6 +1504,65 @@ class TestWebServerEndpoints:
assert env["TELEGRAM_ALLOWED_USERS"] == "123456789"
assert load_config()["platforms"]["telegram"]["enabled"] is True
def test_telegram_onboarding_apply_reuses_inflight_gateway_restart(
self, monkeypatch
):
"""A live in-flight gateway restart is reused instead of spawning a
second racing ``hermes gateway restart`` child (e.g. when a stale
cached frontend also fires its own restart call)."""
import hermes_cli.web_server as ws
with ws._telegram_onboarding_lock:
ws._telegram_onboarding_pairings.clear()
def fake_request(method, path, *, body=None, bearer_token=None):
if method == "POST":
return {
"pairing_id": "pair-reuse",
"poll_token": "poll-secret",
"suggested_username": "hermes_pair_reuse_bot",
"deep_link": "https://t.me/newbot/HermesSetupBot/hermes_pair_reuse_bot",
"qr_payload": "https://t.me/newbot/HermesSetupBot/hermes_pair_reuse_bot",
"expires_at": "2027-05-18T00:00:00.000Z",
}
return {
"status": "ready",
"bot_username": "hermes_pair_reuse_bot",
"owner_user_id": 123456789,
"token": "123456:SECRET",
}
monkeypatch.setattr(ws, "_telegram_onboarding_request_sync", fake_request)
class FakeRunningProc:
pid = 5151
def poll(self):
return None # still running
monkeypatch.setitem(ws._ACTION_PROCS, "gateway-restart", FakeRunningProc())
def fail_spawn_action(subcommand, name):
raise AssertionError("must not spawn a second concurrent restart")
monkeypatch.setattr(ws, "_spawn_hermes_action", fail_spawn_action)
start = self.client.post("/api/messaging/telegram/onboarding/start", json={})
assert start.status_code == 200
ready = self.client.get("/api/messaging/telegram/onboarding/pair-reuse")
assert ready.status_code == 200
applied = self.client.post(
"/api/messaging/telegram/onboarding/pair-reuse/apply",
json={"allowed_user_ids": ["123456789"]},
)
assert applied.status_code == 200
applied_data = applied.json()
assert applied_data["needs_restart"] is False
assert applied_data["restart_started"] is True
assert applied_data["restart_pid"] == 5151
def test_telegram_onboarding_apply_requires_ready_pairing(self, monkeypatch):
import hermes_cli.web_server as ws

View file

@ -599,6 +599,32 @@ function TelegramOnboardingPanel({
setNewAllowedId("");
};
// restart_started only means the `hermes gateway restart` child spawned —
// not that the restart will succeed (e.g. systemd linger missing, service
// manager failure). Poll the action status briefly and surface a non-zero
// exit via the manual-restart banner. Note: in no-service installs the
// child becomes the foreground gateway and never exits, so "still running
// when the window closes" counts as success.
const watchRestartOutcome = async () => {
for (let i = 0; i < 20; i++) {
await new Promise((resolve) => setTimeout(resolve, 1500));
try {
const st = await api.getActionStatus("gateway-restart", 5);
if (st.running) continue;
if (st.exit_code !== 0 && st.exit_code !== null) {
onRestartNeeded();
showToast(
`Gateway restart failed (exit ${st.exit_code}) — restart manually`,
"error",
);
}
return;
} catch {
// transient fetch error; keep polling
}
}
};
const apply = async () => {
if (!setup) return;
if (allowedIds.length === 0) {
@ -616,6 +642,7 @@ function TelegramOnboardingPanel({
showToast("Telegram saved; gateway restarting…", "success");
setRestartNeeded(false);
setTimeout(() => void onChanged(), 4000);
void watchRestartOutcome();
} else if (result.restart_started === undefined && result.needs_restart) {
try {
await api.restartGateway();