diff --git a/apps/desktop/electron/main.cjs b/apps/desktop/electron/main.cjs
index d874d7991d9..32634e3ac41 100644
--- a/apps/desktop/electron/main.cjs
+++ b/apps/desktop/electron/main.cjs
@@ -4737,6 +4737,56 @@ function createWindow() {
 }
 
 ipcMain.handle('hermes:connection', async (_event, profile) => ensureBackend(profile))
+// Reconnect-after-wake recovery. A REMOTE primary backend has no child process,
+// so the 'exit'/'error' handlers that would clear a dead connectionPromise never
+// fire — once the remote becomes unreachable across a sleep/wake the renderer
+// re-dials the same dead descriptor forever and the composer stays stuck on
+// "Starting Hermes…". Before the renderer's backoff loop reconnects, it asks us
+// to confirm the cached PRIMARY backend is still reachable; if a remote one is
+// not, we drop the cache so the next getConnection() rebuilds it. Local backends
+// self-heal via their child 'exit' handler, so we never touch them here.
+//
+// Resolves to { ok, rebuilt }: `rebuilt` is true only when this call dropped a
+// stale remote cache; every other path resolves with rebuilt: false.
+ipcMain.handle('hermes:connection:revalidate', async () => {
+  // Snapshot the cached promise so we only ever reset the exact connection we
+  // probed. connectionPromise can be reset or rebuilt by another IPC call while
+  // we await the boot or the liveness probe below.
+  const cached = connectionPromise
+  if (!cached) {
+    return { ok: true, rebuilt: false }
+  }
+
+  let conn = null
+  try {
+    conn = await cached
+  } catch {
+    // The cached boot already rejected (its own catch nulls connectionPromise);
+    // nothing to revalidate — the next getConnection() builds fresh.
+    return { ok: true, rebuilt: false }
+  }
+
+  if (!conn || conn.mode !== 'remote' || !conn.baseUrl) {
+    return { ok: true, rebuilt: false }
+  }
+
+  const base = conn.baseUrl.replace(/\/+$/, '')
+  try {
+    await fetchPublicJson(`${base}/api/status`, { timeoutMs: 2_500 })
+    return { ok: true, rebuilt: false }
+  } catch {
+    // The cache changed while the probe was in flight: the failure describes a
+    // connection that is no longer cached, so leave the current one (possibly a
+    // fresh remote or a local child) untouched.
+    if (connectionPromise !== cached) {
+      return { ok: true, rebuilt: false }
+    }
+
+    // Unreachable remote: drop the stale cache so the renderer's next reconnect
+    // tick rebuilds a fresh, reachable descriptor. resetHermesConnection only
+    // nulls connectionPromise for a remote (no child to SIGTERM).
+    rememberLog('Cached remote Hermes backend failed liveness probe; dropping stale connection.')
+    resetHermesConnection()
+    return { ok: true, rebuilt: true }
+  }
+})
 ipcMain.handle('hermes:backend:touch', async (_event, profile) => {
   touchPoolBackend(profile)
   return { ok: true }
diff --git a/apps/desktop/electron/preload.cjs b/apps/desktop/electron/preload.cjs
index 27bc1b20b53..cf094e751c3 100644
--- a/apps/desktop/electron/preload.cjs
+++ b/apps/desktop/electron/preload.cjs
@@ -2,6 +2,7 @@ const { contextBridge, ipcRenderer, webUtils } = require('electron')
 
 contextBridge.exposeInMainWorld('hermesDesktop', {
   getConnection: profile => ipcRenderer.invoke('hermes:connection', profile),
+  revalidateConnection: () => ipcRenderer.invoke('hermes:connection:revalidate'),
   touchBackend: profile => ipcRenderer.invoke('hermes:backend:touch', profile),
   getGatewayWsUrl: profile => ipcRenderer.invoke('hermes:gateway:ws-url', profile),
   getBootProgress: () => ipcRenderer.invoke('hermes:boot-progress:get'),
diff --git a/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts b/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts
index db43c41a89f..b9bfbf021e9 100644
--- a/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts
+++ b/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts
@@ -120,6 +120,13 @@ export function useGatewayBoot({
     reconnecting = true
 
     try {
+      // Drop a stale REMOTE backend cache before re-dialing. After sleep/wake a
+      // remote backend can become unreachable, but it has no child process
+      // whose 'exit' would clear the main process's cached descriptor — without
+      // this the renderer re-dials the same dead endpoint forever and stays on
+      // "Starting Hermes…". The probe is a no-op for a healthy or local backend.
+      await desktop.revalidateConnection?.().catch(() => undefined)
+
       const conn = await desktop.getConnection($activeGatewayProfile.get())
 
       if (cancelled) {
@@ -218,6 +225,15 @@ export function useGatewayBoot({
         reconnectAttempt = 0
         reauthNotified = false
         clearReconnectTimer()
+
+        // A revalidate-driven reconnect can rebuild the backend in place when the
+        // cached remote was found dead, which re-drives the boot-progress overlay.
+        // Unlike the initial boot, nothing calls completeDesktopBoot() afterwards,
+        // so dismiss it here once we're open again — otherwise the overlay sticks
+        // at ~94%. A no-op on a normal (non-rebuild) reconnect.
+        if (bootCompleted) {
+          completeDesktopBoot()
+        }
       } else if (bootCompleted && (st === 'closed' || st === 'error')) {
         // The socket dropped after a healthy boot (typically sleep/wake). Try
         // to bring it back instead of leaving the composer stuck disabled.
diff --git a/apps/desktop/src/global.d.ts b/apps/desktop/src/global.d.ts
index aff578ac502..213fe5c08d5 100644
--- a/apps/desktop/src/global.d.ts
+++ b/apps/desktop/src/global.d.ts
@@ -7,6 +7,13 @@ declare global {
       // the window's backend; pass a named profile to lazily spawn/reuse that
       // profile's backend from the pool.
       getConnection: (profile?: string | null) => Promise
+      // Reconnect-after-wake recovery: liveness-probe the cached PRIMARY backend
+      // and drop it if a remote one has gone unreachable, so the next
+      // getConnection() rebuilds a reachable descriptor instead of the renderer
+      // re-dialing a dead remote forever. No-op for local backends (they
+      // self-heal via the child 'exit' handler). `rebuilt` is true when a stale
+      // remote cache was dropped.
+      revalidateConnection: () => Promise<{ ok: boolean; rebuilt: boolean }>
       // Keepalive: mark a pool profile backend as recently used so the idle
       // reaper spares it while its chat is active.
       touchBackend: (profile?: string | null) => Promise<{ ok: boolean }>