From 1e2c91eaff8595c180f31a963faad3175c569c14 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 17 Jun 2026 01:08:37 -0400 Subject: [PATCH] fix(desktop): recover stranded session windows when resume fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Opening a session in a new window (or any routed resume) could latch the thread loader on "session" forever — the reported "stays stuck loading, even after a nap" bug. Two compounding causes: 1. use-session-actions.resumeSession's catch ran the REST transcript fallback OUTSIDE its own try. When session.resume rejected AND the fallback also threw (the common case on a wedged/unreachable backend), the throw skipped setMessages and left activeSessionId null with an empty transcript — exactly the state the loader gates on (messagesEmpty && !activeSessionId), with no terminal/error state. 2. use-route-resume's self-heal could never re-fire: resumeSession sets selectedStoredSessionIdRef synchronously at entry (before failing), so stuckOnRoutedSession stays false, and on an already-open idle window neither pathnameChanged nor gatewayBecameOpen fire again. The window never retried — naps, focus, nothing recovered it. Fix: - Wrap the REST fallback in its own try so a fallback failure can't strand the loader. - Add $resumeFailedSessionId: armed on terminal resume failure, cleared at the next resume's entry (and left clear on success). - use-route-resume gains a bounded backoff auto-retry (4 attempts, 1s→8s) that re-resumes while the routed session matches the failure flag, with a fire-time liveness recheck so a recovered session isn't double-resumed. Regression tests cover: fallback-wrap arming the flag without throwing, flag cleared on success, retry fires on backoff, no retry for a non-routed/recovered session, and the retry cap. --- apps/desktop/src/app/desktop-controller.tsx | 3 + .../session/hooks/use-route-resume.test.tsx | 104 +++++++++++++++- .../src/app/session/hooks/use-route-resume.ts | 95 +++++++++++++++ .../hooks/use-session-actions.test.tsx | 113 +++++++++++++++++- .../app/session/hooks/use-session-actions.ts | 36 +++++- apps/desktop/src/store/session.ts | 9 ++ 6 files changed, 353 insertions(+), 7 deletions(-) diff --git a/apps/desktop/src/app/desktop-controller.tsx b/apps/desktop/src/app/desktop-controller.tsx index 45251ceef9b..9ac4fcadead 100644 --- a/apps/desktop/src/app/desktop-controller.tsx +++ b/apps/desktop/src/app/desktop-controller.tsx @@ -53,6 +53,7 @@ import { $freshDraftReady, $gatewayState, $messagingSessions, + $resumeFailedSessionId, $selectedStoredSessionId, $sessions, $workingSessionIds, @@ -199,6 +200,7 @@ export function DesktopController() { const activeSessionId = useStore($activeSessionId) const currentCwd = useStore($currentCwd) const freshDraftReady = useStore($freshDraftReady) + const resumeFailedSessionId = useStore($resumeFailedSessionId) const filePreviewTarget = useStore($filePreviewTarget) const previewTarget = useStore($previewTarget) const selectedStoredSessionId = useStore($selectedStoredSessionId) @@ -845,6 +847,7 @@ export function DesktopController() { gatewayState, locationPathname: location.pathname, resumeSession, + resumeFailedSessionId, routedSessionId, runtimeIdByStoredSessionIdRef, selectedStoredSessionId, diff --git a/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx b/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx index e0d984c37f5..0b0f10be7b5 100644 --- a/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx +++ b/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx @@ -13,6 +13,7 @@ interface HarnessProps { gatewayState: string locationPathname: string resumeSession: (sessionId: string, focus: boolean) => Promise + resumeFailedSessionId?: null | string routedSessionId: null | string runtimeIdByStoredSessionIdRef: MutableRefObject> selectedStoredSessionId: null | string @@ -20,8 +21,8 @@ interface HarnessProps { startFreshSessionDraft: (focus: boolean) => unknown } -function RouteResumeHarness(props: HarnessProps) { - useRouteResume(props) +function RouteResumeHarness({ resumeFailedSessionId = null, ...props }: HarnessProps) { + useRouteResume({ ...props, resumeFailedSessionId }) return null } @@ -256,3 +257,102 @@ describe('useRouteResume', () => { expect(resumeSession).toHaveBeenCalledWith('session-1', true) }) }) + +describe('useRouteResume bounded auto-retry after a failed resume', () => { + afterEach(() => { + cleanup() + vi.useRealTimers() + vi.restoreAllMocks() + }) + + // Common stranded-window props: gateway open, route on the session, no runtime + // yet, and the ref already synced to the route (resumeSession sets it at entry + // before failing) — the exact state that defeats the main effect's self-heal. + function strandedProps(resumeSession: (sid: string, focus: boolean) => Promise) { + return { + activeSessionId: null, + activeSessionIdRef: { current: null } as MutableRefObject, + creatingSessionRef: { current: false }, + currentView: 'chat', + freshDraftReady: false, + gatewayState: 'open', + locationPathname: '/session-1', + resumeSession, + routedSessionId: 'session-1', + runtimeIdByStoredSessionIdRef: { current: new Map() }, + selectedStoredSessionId: 'session-1', + // Synced to the route by the failed resume's synchronous entry-write. + selectedStoredSessionIdRef: { current: 'session-1' } as MutableRefObject, + startFreshSessionDraft: vi.fn() + } + } + + it('retries the resume on backoff when the routed session is flagged as failed', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + + render() + + // The main effect fires one resume on mount (pathname-changed). Clear it so + // we assert purely the bounded-retry effect's scheduled retry below. + resumeSession.mockClear() + + // No immediate fire — the retry is scheduled behind the backoff timer. + expect(resumeSession).not.toHaveBeenCalled() + + // First backoff window (1s) elapses → one retry. + vi.advanceTimersByTime(1_000) + expect(resumeSession).toHaveBeenCalledTimes(1) + expect(resumeSession).toHaveBeenCalledWith('session-1', true) + }) + + it('does NOT retry a failed session that is not the routed one', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + + // The failure flag points at a different session than the route. + render() + resumeSession.mockClear() // drop the mount resume + + vi.advanceTimersByTime(10_000) + expect(resumeSession).not.toHaveBeenCalled() + }) + + it('skips the scheduled retry if the session already recovered when the timer fires', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + const props = strandedProps(resumeSession) + + render() + resumeSession.mockClear() // drop the mount resume + + // A resume landed while we waited: runtime is now bound. + props.activeSessionIdRef.current = 'runtime-1' + + vi.advanceTimersByTime(8_000) + expect(resumeSession).not.toHaveBeenCalled() + }) + + it('stops retrying after MAX_RESUME_RETRIES consecutive failures', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + const props = strandedProps(resumeSession) + + // Model the real re-arm loop: resumeSession clears $resumeFailedSessionId at + // entry (null) and a repeat failure re-sets it ('session-1'). That null->id + // toggle is what re-runs the effect and advances the bounded counter. The + // routed session never changes, so the counter is NOT reset between cycles. + const { rerender } = render() + resumeSession.mockClear() // drop the mount resume; count only the retries + + for (let i = 0; i < 8; i += 1) { + vi.advanceTimersByTime(8_000) // fire the scheduled retry (if any) + rerender() // cleared at entry + rerender() // re-armed on failure + } + + // Capped at MAX_RESUME_RETRIES (4): a persistently dead backend can't + // hot-loop the resume forever. + expect(resumeSession.mock.calls.length).toBe(4) + }) +}) diff --git a/apps/desktop/src/app/session/hooks/use-route-resume.ts b/apps/desktop/src/app/session/hooks/use-route-resume.ts index ad7677cc4b5..04bd4cc46f6 100644 --- a/apps/desktop/src/app/session/hooks/use-route-resume.ts +++ b/apps/desktop/src/app/session/hooks/use-route-resume.ts @@ -11,6 +11,11 @@ interface RouteResumeOptions { gatewayState: string | undefined locationPathname: string resumeSession: (sessionId: string, focus: boolean) => Promise + // Stored-session id whose most recent resume failed terminally (set by + // useSessionActions, mirrored from $resumeFailedSessionId). While this equals + // routedSessionId the window would otherwise latch on the loader forever, so + // the bounded-retry effect below re-attempts the resume. + resumeFailedSessionId: string | null routedSessionId: string | null runtimeIdByStoredSessionIdRef: MutableRefObject> selectedStoredSessionId: string | null @@ -18,6 +23,19 @@ interface RouteResumeOptions { startFreshSessionDraft: (focus: boolean) => unknown } +// Bounded auto-retry for a stranded session window. A resume can fail terminally +// (gateway RPC reject + REST fallback failure) on a transiently wedged backend — +// dead provider key, a runaway turn hogging the dispatcher, flaky DNS. Without a +// retry the loader latches forever. We retry with backoff, capped, so a +// genuinely dead backend doesn't hot-loop the resume. +const MAX_RESUME_RETRIES = 4 +const RESUME_RETRY_BASE_MS = 1_000 +const RESUME_RETRY_MAX_MS = 8_000 + +function resumeRetryDelayMs(attempt: number): number { + return Math.min(RESUME_RETRY_MAX_MS, RESUME_RETRY_BASE_MS * 2 ** attempt) +} + // HashRouter boot edge case: pathname briefly reads `/` before the hash is // parsed. If the hash references a real session, defer; resume picks it up // next tick. Without this, ctrl+R on `#/:sessionId` flashes 5 loading states. @@ -49,6 +67,7 @@ export function useRouteResume({ gatewayState, locationPathname, resumeSession, + resumeFailedSessionId, routedSessionId, runtimeIdByStoredSessionIdRef, selectedStoredSessionId, @@ -58,6 +77,10 @@ export function useRouteResume({ const lastPathnameRef = useRef(null) const seenGatewayStateRef = useRef(false) const wasGatewayOpenRef = useRef(false) + // Per-session retry bookkeeping for the bounded auto-retry effect below. Keyed + // by the session id we're retrying so switching chats resets the counter. + const retrySessionIdRef = useRef(null) + const retryAttemptRef = useRef(0) useEffect(() => { const gatewayOpen = gatewayState === 'open' @@ -139,4 +162,76 @@ export function useRouteResume({ selectedStoredSessionIdRef, startFreshSessionDraft ]) + + // Bounded auto-retry: when the routed session's resume failed terminally + // (resumeFailedSessionId matches the route), schedule a backoff retry so the + // window recovers on its own instead of latching the loader forever. This is + // the safety net the main effect above can't provide: after a failed resume, + // selectedStoredSessionIdRef.current already equals the route (resumeSession + // sets it synchronously at entry) and the pathname/gateway are unchanged, so + // none of stuckOnRoutedSession / pathnameChanged / gatewayBecameOpen fire + // again. resumeSession clears resumeFailedSessionId on its next attempt; a + // success keeps it clear (the effect's guard then no-ops), a repeat failure + // re-arms it and we back off further, capped at MAX_RESUME_RETRIES. + useEffect(() => { + if (currentView !== 'chat' || gatewayState !== 'open') { + return + } + + const stranded = + Boolean(routedSessionId) && + resumeFailedSessionId === routedSessionId && + !creatingSessionRef.current + + if (!stranded) { + // Route moved off the stranded session (or it recovered) — reset the + // counter so a future failure on another session starts fresh. + if (retrySessionIdRef.current !== routedSessionId) { + retrySessionIdRef.current = null + retryAttemptRef.current = 0 + } + + return + } + + // New stranded session id → reset the attempt counter. + if (retrySessionIdRef.current !== routedSessionId) { + retrySessionIdRef.current = routedSessionId + retryAttemptRef.current = 0 + } + + if (retryAttemptRef.current >= MAX_RESUME_RETRIES) { + // Give up auto-retrying a persistently dead backend; the user can still + // reconnect / reselect (which resets the counter via the branch above). + return + } + + const attempt = retryAttemptRef.current + retryAttemptRef.current = attempt + 1 + const sessionId = routedSessionId as string + + const timer = setTimeout(() => { + // Re-check liveness at fire time: a resume may have landed while we waited. + if ( + creatingSessionRef.current || + selectedStoredSessionIdRef.current !== sessionId || + activeSessionIdRef.current !== null + ) { + return + } + + void resumeSession(sessionId, true) + }, resumeRetryDelayMs(attempt)) + + return () => clearTimeout(timer) + }, [ + activeSessionIdRef, + creatingSessionRef, + currentView, + gatewayState, + resumeSession, + resumeFailedSessionId, + routedSessionId, + selectedStoredSessionIdRef + ]) } diff --git a/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx b/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx index 739e8b93756..d635caa837f 100644 --- a/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx +++ b/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx @@ -3,8 +3,9 @@ import type { MutableRefObject } from 'react' import { useEffect } from 'react' import { afterEach, describe, expect, it, vi } from 'vitest' +import { getSessionMessages } from '@/hermes' import { $activeGatewayProfile, $newChatProfile } from '@/store/profile' -import { $currentCwd } from '@/store/session' +import { $currentCwd, $resumeFailedSessionId, setMessages, setResumeFailedSessionId } from '@/store/session' import type { ClientSessionState } from '../../types' @@ -117,3 +118,113 @@ describe('createBackendSessionForSend profile routing', () => { expect(params).toMatchObject({ profile: 'default' }) }) }) + +// ── Resume failure recovery (the "stuck loading session window" bug) ────────── +// When session.resume rejects AND the REST transcript fallback ALSO fails, the +// hook must (a) not throw out of the fallback (which stranded the loader), and +// (b) arm $resumeFailedSessionId so use-route-resume can retry. A resume that +// succeeds must NOT leave the flag armed. +function ResumeHarness({ + onReady, + requestGateway +}: { + onReady: (resume: (storedSessionId: string, replaceRoute?: boolean) => Promise) => void + requestGateway: (method: string, params?: Record) => Promise +}) { + const ref = (value: T): MutableRefObject => ({ current: value }) + + const actions = useSessionActions({ + activeSessionId: null, + activeSessionIdRef: ref(null), + busyRef: ref(false), + creatingSessionRef: ref(false), + ensureSessionState: () => ({}) as ClientSessionState, + getRouteToken: () => 'token', + navigate: vi.fn() as never, + requestGateway, + runtimeIdByStoredSessionIdRef: ref(new Map()), + selectedStoredSessionId: null, + selectedStoredSessionIdRef: ref(null), + sessionStateByRuntimeIdRef: ref(new Map()), + syncSessionStateToView: vi.fn(), + updateSessionState: (_sessionId, updater) => updater({} as ClientSessionState) + }) + + useEffect(() => { + onReady(actions.resumeSession) + }, [actions.resumeSession, onReady]) + + return null +} + +describe('resumeSession failure recovery', () => { + afterEach(() => { + cleanup() + setResumeFailedSessionId(null) + setMessages([]) + vi.restoreAllMocks() + }) + + async function runResume( + requestGateway: (method: string, params?: Record) => Promise + ): Promise { + let resume: ((storedSessionId: string, replaceRoute?: boolean) => Promise) | null = null + render( (resume = r)} requestGateway={requestGateway} />) + await waitFor(() => expect(resume).not.toBeNull()) + await resume!('stored-1', true) + } + + it('arms $resumeFailedSessionId when resume RPC and REST fallback both fail', async () => { + // session.resume rejects (e.g. timeout against a wedged backend)... + const requestGateway = vi.fn(async (method: string) => { + if (method === 'session.resume') { + throw new Error('request timed out: session.resume') + } + + return {} as never + }) + + // ...and the REST transcript fallback also rejects (backend unreachable). + vi.mocked(getSessionMessages).mockRejectedValue(new Error('network down')) + + await runResume(requestGateway) + + // The window is no longer silently stranded: the failure latch is armed for + // the stored session, which use-route-resume consumes to retry. + expect($resumeFailedSessionId.get()).toBe('stored-1') + }) + + it('does NOT throw out of the fallback when REST also fails (no unhandled rejection)', async () => { + const requestGateway = vi.fn(async (method: string) => { + if (method === 'session.resume') { + throw new Error('request timed out: session.resume') + } + + return {} as never + }) + + vi.mocked(getSessionMessages).mockRejectedValue(new Error('network down')) + + // resumeSession must resolve (swallow the fallback failure), not reject. + await expect(runResume(requestGateway)).resolves.toBeUndefined() + }) + + it('leaves the failure latch clear when resume succeeds', async () => { + // Pre-arm to prove a successful resume clears it (entry-clear path). + setResumeFailedSessionId('stored-1') + + const requestGateway = vi.fn(async (method: string, params?: Record) => { + if (method === 'session.resume') { + return { session_id: 'runtime-1', resumed: params?.session_id, messages: [], info: {} } as never + } + + return {} as never + }) + + vi.mocked(getSessionMessages).mockResolvedValue({ messages: [] } as never) + + await runResume(requestGateway) + + expect($resumeFailedSessionId.get()).toBeNull() + }) +}) diff --git a/apps/desktop/src/app/session/hooks/use-session-actions.ts b/apps/desktop/src/app/session/hooks/use-session-actions.ts index 6f7a779e8ea..9ed68cd595f 100644 --- a/apps/desktop/src/app/session/hooks/use-session-actions.ts +++ b/apps/desktop/src/app/session/hooks/use-session-actions.ts @@ -38,6 +38,7 @@ import { setFreshDraftReady, setIntroSeed, setMessages, + setResumeFailedSessionId, setSelectedStoredSessionId, setSessions, setSessionStartedAt, @@ -579,6 +580,11 @@ export function useSessionActions({ clearNotifications() setSelectedStoredSessionId(storedSessionId) selectedStoredSessionIdRef.current = storedSessionId + // Optimistically clear any prior resume-failure latch for this session: + // we're attempting a fresh resume, so the self-heal in use-route-resume + // must not keep treating it as stranded. It's re-armed below only if THIS + // attempt fails terminally (RPC reject + REST fallback failure). + setResumeFailedSessionId(current => (current === storedSessionId ? null : current)) const warmRuntimeId = runtimeIdByStoredSessionIdRef.current.get(storedSessionId) @@ -769,13 +775,35 @@ export function useSessionActions({ return } - const fallback = await getSessionMessages(storedSessionId, sessionProfile) + // The gateway resume RPC failed. Try the REST transcript as a fallback + // so the window at least shows history. CRITICAL: this fallback must be + // wrapped in its own try — if it ALSO throws (wedged/unreachable backend, + // the common case when resume failed in the first place), an unguarded + // throw here skips setMessages AND leaves activeSessionId null with an + // empty transcript. That is the exact state the thread loader latches on + // forever (messagesEmpty && !activeSessionId) with no recovery path — + // the "open in new window stays stuck loading, even after a nap" bug. + try { + const fallback = await getSessionMessages(storedSessionId, sessionProfile) - if (!isCurrentResume()) { - return + if (!isCurrentResume()) { + return + } + + setMessages(preserveLocalAssistantErrors(toChatMessages(fallback.messages), $messages.get())) + } catch { + // Fallback also failed: nothing to paint. Leave whatever messages are + // already shown and fall through to arm the resume-failure latch so + // use-route-resume re-attempts the resume on the next render / window + // focus / gateway reconnect instead of stranding the loader. + } + + if (isCurrentResume()) { + // Arm the self-heal: tell use-route-resume this routed session's resume + // failed terminally so it retries (bounded) rather than spinning forever. + setResumeFailedSessionId(storedSessionId) } - setMessages(preserveLocalAssistantErrors(toChatMessages(fallback.messages), $messages.get())) notifyError(err, copy.resumeFailed) } finally { if (isCurrentResume()) { diff --git a/apps/desktop/src/store/session.ts b/apps/desktop/src/store/session.ts index e40484cfec1..e0e525fef66 100644 --- a/apps/desktop/src/store/session.ts +++ b/apps/desktop/src/store/session.ts @@ -218,6 +218,14 @@ export const $lastVisibleMessageIsUser = computed($messages, lastVisibleMessageI export const $freshDraftReady = atom(false) export const $busy = atom(false) export const $awaitingResponse = atom(false) +// Stored-session id whose most recent resume FAILED terminally (the gateway RPC +// rejected AND the REST transcript fallback also failed), leaving the window +// with no runtime and an empty transcript. Drives use-route-resume's self-heal: +// while this matches the routed session the loader would otherwise latch +// forever (messagesEmpty && !activeSessionId), so the hook re-attempts the +// resume on the next render/focus/reconnect instead of stranding the window. +// Null whenever the active route has a healthy (or in-flight) resume. +export const $resumeFailedSessionId = atom(null) export const $currentModel = atom(storedString(COMPOSER_MODEL_KEY) ?? '') export const $currentProvider = atom(storedString(COMPOSER_PROVIDER_KEY) ?? '') export const $currentReasoningEffort = atom(storedString(COMPOSER_EFFORT_KEY) ?? '') @@ -262,6 +270,7 @@ export const setActiveSessionId = (next: Updater) => updateAtom($ export const setSelectedStoredSessionId = (next: Updater) => updateAtom($selectedStoredSessionId, next) export const setMessages = (next: Updater) => updateAtom($messages, next) export const setFreshDraftReady = (next: Updater) => updateAtom($freshDraftReady, next) +export const setResumeFailedSessionId = (next: Updater) => updateAtom($resumeFailedSessionId, next) export const setBusy = (next: Updater) => updateAtom($busy, next) export const setAwaitingResponse = (next: Updater) => updateAtom($awaitingResponse, next)