fix(desktop): recover stranded session windows when resume fails

Opening a session in a new window (or any routed resume) could latch the
thread loader on "session" forever — the reported "stays stuck loading,
even after a nap" bug. Two compounding causes:

1. use-session-actions.resumeSession's catch ran the REST transcript
   fallback OUTSIDE its own try. When session.resume rejected AND the
   fallback also threw (the common case on a wedged/unreachable backend),
   the throw skipped setMessages and left activeSessionId null with an
   empty transcript — exactly the state the loader gates on
   (messagesEmpty && !activeSessionId), with no terminal/error state.

2. use-route-resume's self-heal could never re-fire: resumeSession sets
   selectedStoredSessionIdRef synchronously at entry (before failing), so
   stuckOnRoutedSession stays false, and on an already-open idle window
   neither pathnameChanged nor gatewayBecameOpen fire again. The window
   never retried — naps, focus, nothing recovered it.

Fix:
- Wrap the REST fallback in its own try so a fallback failure can't strand
  the loader.
- Add $resumeFailedSessionId: armed on terminal resume failure, cleared at
  the next resume's entry (and left clear on success).
- use-route-resume gains a bounded backoff auto-retry (4 attempts, 1s→8s)
  that re-resumes while the routed session matches the failure flag, with a
  fire-time liveness recheck so a recovered session isn't double-resumed.

Regression tests cover: fallback-wrap arming the flag without throwing,
flag cleared on success, retry fires on backoff, no retry for a
non-routed/recovered session, and the retry cap.
This commit is contained in:
Teknium 2026-06-17 01:08:37 -04:00
parent 5e01a5dbf1
commit 1e2c91eaff
6 changed files with 353 additions and 7 deletions

View file

@ -53,6 +53,7 @@ import {
$freshDraftReady,
$gatewayState,
$messagingSessions,
$resumeFailedSessionId,
$selectedStoredSessionId,
$sessions,
$workingSessionIds,
@ -199,6 +200,7 @@ export function DesktopController() {
const activeSessionId = useStore($activeSessionId)
const currentCwd = useStore($currentCwd)
const freshDraftReady = useStore($freshDraftReady)
const resumeFailedSessionId = useStore($resumeFailedSessionId)
const filePreviewTarget = useStore($filePreviewTarget)
const previewTarget = useStore($previewTarget)
const selectedStoredSessionId = useStore($selectedStoredSessionId)
@ -845,6 +847,7 @@ export function DesktopController() {
gatewayState,
locationPathname: location.pathname,
resumeSession,
resumeFailedSessionId,
routedSessionId,
runtimeIdByStoredSessionIdRef,
selectedStoredSessionId,

View file

@ -13,6 +13,7 @@ interface HarnessProps {
gatewayState: string
locationPathname: string
resumeSession: (sessionId: string, focus: boolean) => Promise<unknown>
resumeFailedSessionId?: null | string
routedSessionId: null | string
runtimeIdByStoredSessionIdRef: MutableRefObject<Map<string, string>>
selectedStoredSessionId: null | string
@ -20,8 +21,8 @@ interface HarnessProps {
startFreshSessionDraft: (focus: boolean) => unknown
}
function RouteResumeHarness(props: HarnessProps) {
useRouteResume(props)
function RouteResumeHarness({ resumeFailedSessionId = null, ...props }: HarnessProps) {
useRouteResume({ ...props, resumeFailedSessionId })
return null
}
@ -256,3 +257,102 @@ describe('useRouteResume', () => {
expect(resumeSession).toHaveBeenCalledWith('session-1', true)
})
})
describe('useRouteResume bounded auto-retry after a failed resume', () => {
afterEach(() => {
cleanup()
vi.useRealTimers()
vi.restoreAllMocks()
})
// Common stranded-window props: gateway open, route on the session, no runtime
// yet, and the ref already synced to the route (resumeSession sets it at entry
// before failing) — the exact state that defeats the main effect's self-heal.
function strandedProps(resumeSession: (sid: string, focus: boolean) => Promise<unknown>) {
return {
activeSessionId: null,
activeSessionIdRef: { current: null } as MutableRefObject<null | string>,
creatingSessionRef: { current: false },
currentView: 'chat',
freshDraftReady: false,
gatewayState: 'open',
locationPathname: '/session-1',
resumeSession,
routedSessionId: 'session-1',
runtimeIdByStoredSessionIdRef: { current: new Map<string, string>() },
selectedStoredSessionId: 'session-1',
// Synced to the route by the failed resume's synchronous entry-write.
selectedStoredSessionIdRef: { current: 'session-1' } as MutableRefObject<null | string>,
startFreshSessionDraft: vi.fn()
}
}
it('retries the resume on backoff when the routed session is flagged as failed', () => {
vi.useFakeTimers()
const resumeSession = vi.fn(async () => undefined)
render(<RouteResumeHarness {...strandedProps(resumeSession)} resumeFailedSessionId="session-1" />)
// The main effect fires one resume on mount (pathname-changed). Clear it so
// we assert purely the bounded-retry effect's scheduled retry below.
resumeSession.mockClear()
// No immediate fire — the retry is scheduled behind the backoff timer.
expect(resumeSession).not.toHaveBeenCalled()
// First backoff window (1s) elapses → one retry.
vi.advanceTimersByTime(1_000)
expect(resumeSession).toHaveBeenCalledTimes(1)
expect(resumeSession).toHaveBeenCalledWith('session-1', true)
})
it('does NOT retry a failed session that is not the routed one', () => {
vi.useFakeTimers()
const resumeSession = vi.fn(async () => undefined)
// The failure flag points at a different session than the route.
render(<RouteResumeHarness {...strandedProps(resumeSession)} resumeFailedSessionId="other-session" />)
resumeSession.mockClear() // drop the mount resume
vi.advanceTimersByTime(10_000)
expect(resumeSession).not.toHaveBeenCalled()
})
it('skips the scheduled retry if the session already recovered when the timer fires', () => {
vi.useFakeTimers()
const resumeSession = vi.fn(async () => undefined)
const props = strandedProps(resumeSession)
render(<RouteResumeHarness {...props} resumeFailedSessionId="session-1" />)
resumeSession.mockClear() // drop the mount resume
// A resume landed while we waited: runtime is now bound.
props.activeSessionIdRef.current = 'runtime-1'
vi.advanceTimersByTime(8_000)
expect(resumeSession).not.toHaveBeenCalled()
})
it('stops retrying after MAX_RESUME_RETRIES consecutive failures', () => {
vi.useFakeTimers()
const resumeSession = vi.fn(async () => undefined)
const props = strandedProps(resumeSession)
// Model the real re-arm loop: resumeSession clears $resumeFailedSessionId at
// entry (null) and a repeat failure re-sets it ('session-1'). That null->id
// toggle is what re-runs the effect and advances the bounded counter. The
// routed session never changes, so the counter is NOT reset between cycles.
const { rerender } = render(<RouteResumeHarness {...props} resumeFailedSessionId="session-1" />)
resumeSession.mockClear() // drop the mount resume; count only the retries
for (let i = 0; i < 8; i += 1) {
vi.advanceTimersByTime(8_000) // fire the scheduled retry (if any)
rerender(<RouteResumeHarness {...props} resumeFailedSessionId={null} />) // cleared at entry
rerender(<RouteResumeHarness {...props} resumeFailedSessionId="session-1" />) // re-armed on failure
}
// Capped at MAX_RESUME_RETRIES (4): a persistently dead backend can't
// hot-loop the resume forever.
expect(resumeSession.mock.calls.length).toBe(4)
})
})

View file

@ -11,6 +11,11 @@ interface RouteResumeOptions {
gatewayState: string | undefined
locationPathname: string
resumeSession: (sessionId: string, focus: boolean) => Promise<unknown>
// Stored-session id whose most recent resume failed terminally (set by
// useSessionActions, mirrored from $resumeFailedSessionId). While this equals
// routedSessionId the window would otherwise latch on the loader forever, so
// the bounded-retry effect below re-attempts the resume.
resumeFailedSessionId: string | null
routedSessionId: string | null
runtimeIdByStoredSessionIdRef: MutableRefObject<Map<string, string>>
selectedStoredSessionId: string | null
@ -18,6 +23,19 @@ interface RouteResumeOptions {
startFreshSessionDraft: (focus: boolean) => unknown
}
// Bounded auto-retry for a stranded session window. A resume can fail terminally
// (gateway RPC reject + REST fallback failure) on a transiently wedged backend —
// dead provider key, a runaway turn hogging the dispatcher, flaky DNS. Without a
// retry the loader latches forever. We retry with backoff, capped, so a
// genuinely dead backend doesn't hot-loop the resume.
const MAX_RESUME_RETRIES = 4
const RESUME_RETRY_BASE_MS = 1_000
const RESUME_RETRY_MAX_MS = 8_000
function resumeRetryDelayMs(attempt: number): number {
return Math.min(RESUME_RETRY_MAX_MS, RESUME_RETRY_BASE_MS * 2 ** attempt)
}
// HashRouter boot edge case: pathname briefly reads `/` before the hash is
// parsed. If the hash references a real session, defer; resume picks it up
// next tick. Without this, ctrl+R on `#/:sessionId` flashes 5 loading states.
@ -49,6 +67,7 @@ export function useRouteResume({
gatewayState,
locationPathname,
resumeSession,
resumeFailedSessionId,
routedSessionId,
runtimeIdByStoredSessionIdRef,
selectedStoredSessionId,
@ -58,6 +77,10 @@ export function useRouteResume({
const lastPathnameRef = useRef<string | null>(null)
const seenGatewayStateRef = useRef(false)
const wasGatewayOpenRef = useRef(false)
// Per-session retry bookkeeping for the bounded auto-retry effect below. Keyed
// by the session id we're retrying so switching chats resets the counter.
const retrySessionIdRef = useRef<string | null>(null)
const retryAttemptRef = useRef(0)
useEffect(() => {
const gatewayOpen = gatewayState === 'open'
@ -139,4 +162,76 @@ export function useRouteResume({
selectedStoredSessionIdRef,
startFreshSessionDraft
])
// Bounded auto-retry: when the routed session's resume failed terminally
// (resumeFailedSessionId matches the route), schedule a backoff retry so the
// window recovers on its own instead of latching the loader forever. This is
// the safety net the main effect above can't provide: after a failed resume,
// selectedStoredSessionIdRef.current already equals the route (resumeSession
// sets it synchronously at entry) and the pathname/gateway are unchanged, so
// none of stuckOnRoutedSession / pathnameChanged / gatewayBecameOpen fire
// again. resumeSession clears resumeFailedSessionId on its next attempt; a
// success keeps it clear (the effect's guard then no-ops), a repeat failure
// re-arms it and we back off further, capped at MAX_RESUME_RETRIES.
useEffect(() => {
if (currentView !== 'chat' || gatewayState !== 'open') {
return
}
const stranded =
Boolean(routedSessionId) &&
resumeFailedSessionId === routedSessionId &&
!creatingSessionRef.current
if (!stranded) {
// Route moved off the stranded session (or it recovered) — reset the
// counter so a future failure on another session starts fresh.
if (retrySessionIdRef.current !== routedSessionId) {
retrySessionIdRef.current = null
retryAttemptRef.current = 0
}
return
}
// New stranded session id → reset the attempt counter.
if (retrySessionIdRef.current !== routedSessionId) {
retrySessionIdRef.current = routedSessionId
retryAttemptRef.current = 0
}
if (retryAttemptRef.current >= MAX_RESUME_RETRIES) {
// Give up auto-retrying a persistently dead backend; the user can still
// reconnect / reselect (which resets the counter via the branch above).
return
}
const attempt = retryAttemptRef.current
retryAttemptRef.current = attempt + 1
const sessionId = routedSessionId as string
const timer = setTimeout(() => {
// Re-check liveness at fire time: a resume may have landed while we waited.
if (
creatingSessionRef.current ||
selectedStoredSessionIdRef.current !== sessionId ||
activeSessionIdRef.current !== null
) {
return
}
void resumeSession(sessionId, true)
}, resumeRetryDelayMs(attempt))
return () => clearTimeout(timer)
}, [
activeSessionIdRef,
creatingSessionRef,
currentView,
gatewayState,
resumeSession,
resumeFailedSessionId,
routedSessionId,
selectedStoredSessionIdRef
])
}

View file

@ -3,8 +3,9 @@ import type { MutableRefObject } from 'react'
import { useEffect } from 'react'
import { afterEach, describe, expect, it, vi } from 'vitest'
import { getSessionMessages } from '@/hermes'
import { $activeGatewayProfile, $newChatProfile } from '@/store/profile'
import { $currentCwd } from '@/store/session'
import { $currentCwd, $resumeFailedSessionId, setMessages, setResumeFailedSessionId } from '@/store/session'
import type { ClientSessionState } from '../../types'
@ -117,3 +118,113 @@ describe('createBackendSessionForSend profile routing', () => {
expect(params).toMatchObject({ profile: 'default' })
})
})
// ── Resume failure recovery (the "stuck loading session window" bug) ──────────
// When session.resume rejects AND the REST transcript fallback ALSO fails, the
// hook must (a) not throw out of the fallback (which stranded the loader), and
// (b) arm $resumeFailedSessionId so use-route-resume can retry. A resume that
// succeeds must NOT leave the flag armed.
function ResumeHarness({
onReady,
requestGateway
}: {
onReady: (resume: (storedSessionId: string, replaceRoute?: boolean) => Promise<unknown>) => void
requestGateway: <T>(method: string, params?: Record<string, unknown>) => Promise<T>
}) {
const ref = <T,>(value: T): MutableRefObject<T> => ({ current: value })
const actions = useSessionActions({
activeSessionId: null,
activeSessionIdRef: ref<string | null>(null),
busyRef: ref(false),
creatingSessionRef: ref(false),
ensureSessionState: () => ({}) as ClientSessionState,
getRouteToken: () => 'token',
navigate: vi.fn() as never,
requestGateway,
runtimeIdByStoredSessionIdRef: ref(new Map<string, string>()),
selectedStoredSessionId: null,
selectedStoredSessionIdRef: ref<string | null>(null),
sessionStateByRuntimeIdRef: ref(new Map<string, ClientSessionState>()),
syncSessionStateToView: vi.fn(),
updateSessionState: (_sessionId, updater) => updater({} as ClientSessionState)
})
useEffect(() => {
onReady(actions.resumeSession)
}, [actions.resumeSession, onReady])
return null
}
describe('resumeSession failure recovery', () => {
afterEach(() => {
cleanup()
setResumeFailedSessionId(null)
setMessages([])
vi.restoreAllMocks()
})
async function runResume(
requestGateway: <T>(method: string, params?: Record<string, unknown>) => Promise<T>
): Promise<void> {
let resume: ((storedSessionId: string, replaceRoute?: boolean) => Promise<unknown>) | null = null
render(<ResumeHarness onReady={r => (resume = r)} requestGateway={requestGateway} />)
await waitFor(() => expect(resume).not.toBeNull())
await resume!('stored-1', true)
}
it('arms $resumeFailedSessionId when resume RPC and REST fallback both fail', async () => {
// session.resume rejects (e.g. timeout against a wedged backend)...
const requestGateway = vi.fn(async (method: string) => {
if (method === 'session.resume') {
throw new Error('request timed out: session.resume')
}
return {} as never
})
// ...and the REST transcript fallback also rejects (backend unreachable).
vi.mocked(getSessionMessages).mockRejectedValue(new Error('network down'))
await runResume(requestGateway)
// The window is no longer silently stranded: the failure latch is armed for
// the stored session, which use-route-resume consumes to retry.
expect($resumeFailedSessionId.get()).toBe('stored-1')
})
it('does NOT throw out of the fallback when REST also fails (no unhandled rejection)', async () => {
const requestGateway = vi.fn(async (method: string) => {
if (method === 'session.resume') {
throw new Error('request timed out: session.resume')
}
return {} as never
})
vi.mocked(getSessionMessages).mockRejectedValue(new Error('network down'))
// resumeSession must resolve (swallow the fallback failure), not reject.
await expect(runResume(requestGateway)).resolves.toBeUndefined()
})
it('leaves the failure latch clear when resume succeeds', async () => {
// Pre-arm to prove a successful resume clears it (entry-clear path).
setResumeFailedSessionId('stored-1')
const requestGateway = vi.fn(async (method: string, params?: Record<string, unknown>) => {
if (method === 'session.resume') {
return { session_id: 'runtime-1', resumed: params?.session_id, messages: [], info: {} } as never
}
return {} as never
})
vi.mocked(getSessionMessages).mockResolvedValue({ messages: [] } as never)
await runResume(requestGateway)
expect($resumeFailedSessionId.get()).toBeNull()
})
})

View file

@ -38,6 +38,7 @@ import {
setFreshDraftReady,
setIntroSeed,
setMessages,
setResumeFailedSessionId,
setSelectedStoredSessionId,
setSessions,
setSessionStartedAt,
@ -579,6 +580,11 @@ export function useSessionActions({
clearNotifications()
setSelectedStoredSessionId(storedSessionId)
selectedStoredSessionIdRef.current = storedSessionId
// Optimistically clear any prior resume-failure latch for this session:
// we're attempting a fresh resume, so the self-heal in use-route-resume
// must not keep treating it as stranded. It's re-armed below only if THIS
// attempt fails terminally (RPC reject + REST fallback failure).
setResumeFailedSessionId(current => (current === storedSessionId ? null : current))
const warmRuntimeId = runtimeIdByStoredSessionIdRef.current.get(storedSessionId)
@ -769,13 +775,35 @@ export function useSessionActions({
return
}
const fallback = await getSessionMessages(storedSessionId, sessionProfile)
// The gateway resume RPC failed. Try the REST transcript as a fallback
// so the window at least shows history. CRITICAL: this fallback must be
// wrapped in its own try — if it ALSO throws (wedged/unreachable backend,
// the common case when resume failed in the first place), an unguarded
// throw here skips setMessages AND leaves activeSessionId null with an
// empty transcript. That is the exact state the thread loader latches on
// forever (messagesEmpty && !activeSessionId) with no recovery path —
// the "open in new window stays stuck loading, even after a nap" bug.
try {
const fallback = await getSessionMessages(storedSessionId, sessionProfile)
if (!isCurrentResume()) {
return
if (!isCurrentResume()) {
return
}
setMessages(preserveLocalAssistantErrors(toChatMessages(fallback.messages), $messages.get()))
} catch {
// Fallback also failed: nothing to paint. Leave whatever messages are
// already shown and fall through to arm the resume-failure latch so
// use-route-resume re-attempts the resume on the next render / window
// focus / gateway reconnect instead of stranding the loader.
}
if (isCurrentResume()) {
// Arm the self-heal: tell use-route-resume this routed session's resume
// failed terminally so it retries (bounded) rather than spinning forever.
setResumeFailedSessionId(storedSessionId)
}
setMessages(preserveLocalAssistantErrors(toChatMessages(fallback.messages), $messages.get()))
notifyError(err, copy.resumeFailed)
} finally {
if (isCurrentResume()) {

View file

@ -218,6 +218,14 @@ export const $lastVisibleMessageIsUser = computed($messages, lastVisibleMessageI
export const $freshDraftReady = atom(false)
export const $busy = atom(false)
export const $awaitingResponse = atom(false)
// Stored-session id whose most recent resume FAILED terminally (the gateway RPC
// rejected AND the REST transcript fallback also failed), leaving the window
// with no runtime and an empty transcript. Drives use-route-resume's self-heal:
// while this matches the routed session the loader would otherwise latch
// forever (messagesEmpty && !activeSessionId), so the hook re-attempts the
// resume on the next render/focus/reconnect instead of stranding the window.
// Null whenever the active route has a healthy (or in-flight) resume.
export const $resumeFailedSessionId = atom<string | null>(null)
export const $currentModel = atom(storedString(COMPOSER_MODEL_KEY) ?? '')
export const $currentProvider = atom(storedString(COMPOSER_PROVIDER_KEY) ?? '')
export const $currentReasoningEffort = atom(storedString(COMPOSER_EFFORT_KEY) ?? '')
@ -262,6 +270,7 @@ export const setActiveSessionId = (next: Updater<string | null>) => updateAtom($
export const setSelectedStoredSessionId = (next: Updater<string | null>) => updateAtom($selectedStoredSessionId, next)
export const setMessages = (next: Updater<ChatMessage[]>) => updateAtom($messages, next)
export const setFreshDraftReady = (next: Updater<boolean>) => updateAtom($freshDraftReady, next)
export const setResumeFailedSessionId = (next: Updater<string | null>) => updateAtom($resumeFailedSessionId, next)
export const setBusy = (next: Updater<boolean>) => updateAtom($busy, next)
export const setAwaitingResponse = (next: Updater<boolean>) => updateAtom($awaitingResponse, next)