From cadb74adad3cf7ba2e77258b9094e244d9de4a49 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sun, 7 Jun 2026 07:57:26 -0700 Subject: [PATCH] fix(desktop): recover chat after sleep/wake by revalidating a stale remote backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After sleep/wake, a remote (global-remote) primary backend can become unreachable, but it has no child process whose 'exit' clears the main process's cached connectionPromise. The renderer then re-dials the same dead remote forever and the composer stays stuck on "Starting Hermes…"; previously the only recovery was to quit and reopen the app. Fix: the renderer's existing backoff-paced reconnect loop now asks the main process to revalidate the cached connection before re-dialing. The main process liveness-probes the cached REMOTE backend's public /api/status and, if unreachable, drops the cache (resetHermesConnection only nulls connectionPromise for a remote — no child to SIGTERM) so the next getConnection() rebuilds a reachable descriptor. Local backends are never touched here; they self-heal via the child 'exit' handler. The renderer's loop already provides retry pacing and rides out transient blips, so no streak/episode bookkeeping is needed in the main process. The boot hook dismisses the boot-progress overlay on the post-rebuild 'open' so an in-place rebuild can't leave it stuck at ~94%. Reimplements #40135 by @AlchemistChaos on a smaller, more interpretable path (63 added lines vs 555): no extracted helper module and no failure-streak / episode-window state; the renderer's backoff loop is the retry mechanism. Original diagnosis and fix by @AlchemistChaos. 
Co-authored-by: AlchemistChaos --- apps/desktop/electron/main.cjs | 39 +++++++++++++++++++ apps/desktop/electron/preload.cjs | 1 + .../src/app/gateway/hooks/use-gateway-boot.ts | 16 ++++++++ apps/desktop/src/global.d.ts | 7 ++++ 4 files changed, 63 insertions(+) diff --git a/apps/desktop/electron/main.cjs b/apps/desktop/electron/main.cjs index d874d7991d9..32634e3ac41 100644 --- a/apps/desktop/electron/main.cjs +++ b/apps/desktop/electron/main.cjs @@ -4737,6 +4737,45 @@ function createWindow() { } ipcMain.handle('hermes:connection', async (_event, profile) => ensureBackend(profile)) +// Reconnect-after-wake recovery. A REMOTE primary backend has no child process, +// so the 'exit'/'error' handlers that would clear a dead connectionPromise never +// fire — once the remote becomes unreachable across a sleep/wake the renderer +// re-dials the same dead descriptor forever and the composer stays stuck on +// "Starting Hermes…". Before the renderer's backoff loop reconnects, it asks us +// to confirm the cached PRIMARY backend is still reachable; if a remote one is +// not, we drop the cache so the next getConnection() rebuilds it. Local backends +// self-heal via their child 'exit' handler, so we never touch them here. +ipcMain.handle('hermes:connection:revalidate', async () => { + if (!connectionPromise) { + return { ok: true, rebuilt: false } + } + + let conn = null + try { + conn = await connectionPromise + } catch { + // The cached boot already rejected (its own catch nulls connectionPromise); + // nothing to revalidate — the next getConnection() builds fresh. 
+ return { ok: true, rebuilt: false } + } + + if (!conn || conn.mode !== 'remote' || !conn.baseUrl) { + return { ok: true, rebuilt: false } + } + + const base = conn.baseUrl.replace(/\/+$/, '') + try { + await fetchPublicJson(`${base}/api/status`, { timeoutMs: 2_500 }) + return { ok: true, rebuilt: false } + } catch { + // Unreachable remote: drop the stale cache so the renderer's next reconnect + // tick rebuilds a fresh, reachable descriptor. resetHermesConnection only + // nulls connectionPromise for a remote (no child to SIGTERM). + rememberLog('Cached remote Hermes backend failed liveness probe; dropping stale connection.') + resetHermesConnection() + return { ok: true, rebuilt: true } + } +}) ipcMain.handle('hermes:backend:touch', async (_event, profile) => { touchPoolBackend(profile) return { ok: true } diff --git a/apps/desktop/electron/preload.cjs b/apps/desktop/electron/preload.cjs index 27bc1b20b53..cf094e751c3 100644 --- a/apps/desktop/electron/preload.cjs +++ b/apps/desktop/electron/preload.cjs @@ -2,6 +2,7 @@ const { contextBridge, ipcRenderer, webUtils } = require('electron') contextBridge.exposeInMainWorld('hermesDesktop', { getConnection: profile => ipcRenderer.invoke('hermes:connection', profile), + revalidateConnection: () => ipcRenderer.invoke('hermes:connection:revalidate'), touchBackend: profile => ipcRenderer.invoke('hermes:backend:touch', profile), getGatewayWsUrl: profile => ipcRenderer.invoke('hermes:gateway:ws-url', profile), getBootProgress: () => ipcRenderer.invoke('hermes:boot-progress:get'), diff --git a/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts b/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts index db43c41a89f..b9bfbf021e9 100644 --- a/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts +++ b/apps/desktop/src/app/gateway/hooks/use-gateway-boot.ts @@ -120,6 +120,13 @@ export function useGatewayBoot({ reconnecting = true try { + // Drop a stale REMOTE backend cache before re-dialing. 
After sleep/wake a + // remote backend can become unreachable, but it has no child process + // whose 'exit' would clear the main process's cached descriptor — without + // this the renderer re-dials the same dead endpoint forever and stays on + // "Starting Hermes…". The probe is a no-op for a healthy or local backend. + await desktop.revalidateConnection?.().catch(() => undefined) + const conn = await desktop.getConnection($activeGatewayProfile.get()) if (cancelled) { @@ -218,6 +225,15 @@ export function useGatewayBoot({ reconnectAttempt = 0 reauthNotified = false clearReconnectTimer() + + // A revalidate-driven reconnect can rebuild the backend in place when the + // cached remote was found dead, which re-drives the boot-progress overlay. + // Unlike the initial boot, nothing calls completeDesktopBoot() afterwards, + // so dismiss it here once we're open again — otherwise the overlay sticks + // at ~94%. A no-op on a normal (non-rebuild) reconnect. + if (bootCompleted) { + completeDesktopBoot() + } } else if (bootCompleted && (st === 'closed' || st === 'error')) { // The socket dropped after a healthy boot (typically sleep/wake). Try // to bring it back instead of leaving the composer stuck disabled. diff --git a/apps/desktop/src/global.d.ts b/apps/desktop/src/global.d.ts index aff578ac502..213fe5c08d5 100644 --- a/apps/desktop/src/global.d.ts +++ b/apps/desktop/src/global.d.ts @@ -7,6 +7,13 @@ declare global { // the window's backend; pass a named profile to lazily spawn/reuse that // profile's backend from the pool. getConnection: (profile?: string | null) => Promise + // Reconnect-after-wake recovery: liveness-probe the cached PRIMARY backend + // and drop it if a remote one has gone unreachable, so the next + // getConnection() rebuilds a reachable descriptor instead of the renderer + // re-dialing a dead remote forever. No-op for local backends (they + // self-heal via the child 'exit' handler). `rebuilt` is true when a stale + // remote cache was dropped. 
+ revalidateConnection: () => Promise<{ ok: boolean; rebuilt: boolean }> // Keepalive: mark a pool profile backend as recently used so the idle // reaper spares it while its chat is active. touchBackend: (profile?: string | null) => Promise<{ ok: boolean }>