diff --git a/apps/desktop/src/app/chat/index.tsx b/apps/desktop/src/app/chat/index.tsx index 8cf4145cf84..4ae3817c888 100644 --- a/apps/desktop/src/app/chat/index.tsx +++ b/apps/desktop/src/app/chat/index.tsx @@ -15,7 +15,9 @@ import { Backdrop } from '@/components/Backdrop' import { PromptOverlays } from '@/components/prompt-overlays' import { Button } from '@/components/ui/button' import { Codicon } from '@/components/ui/codicon' +import { ErrorState } from '@/components/ui/error-state' import { getGlobalModelOptions, type HermesGateway } from '@/hermes' +import { useI18n } from '@/i18n' import type { ChatMessage } from '@/lib/chat-messages' import { quickModelOptions, sessionTitle, toRuntimeMessage } from '@/lib/chat-runtime' import { useIncrementalExternalStoreRuntime } from '@/lib/incremental-external-store-runtime' @@ -38,6 +40,7 @@ import { $lastVisibleMessageIsUser, $messages, $messagesEmpty, + $resumeExhaustedSessionId, $selectedStoredSessionId, $sessions, sessionPinId @@ -86,6 +89,7 @@ interface ChatViewProps extends Omit, 'onSubmit'> { onEdit: (message: AppendMessage) => Promise onReload: (parentId: string | null) => Promise onRestoreToMessage?: (messageId: string) => Promise + onRetryResume: (sessionId: string) => void onTranscribeAudio?: (audio: Blob) => Promise onDismissError?: (messageId: string) => void } @@ -273,10 +277,12 @@ export function ChatView({ onEdit, onReload, onRestoreToMessage, + onRetryResume, onTranscribeAudio, onDismissError }: ChatViewProps) { const location = useLocation() + const { t } = useI18n() const activeSessionId = useStore($activeSessionId) const awaitingResponse = useStore($awaitingResponse) const busy = useStore($busy) @@ -298,6 +304,7 @@ export function ChatView({ const messagesEmpty = useStore($messagesEmpty) const lastVisibleIsUser = useStore($lastVisibleMessageIsUser) const selectedSessionId = useStore($selectedStoredSessionId) + const resumeExhaustedSessionId = useStore($resumeExhaustedSessionId) const 
routedSessionId = routeSessionId(location.pathname) const isRoutedSessionView = Boolean(routedSessionId) @@ -317,9 +324,21 @@ export function ChatView({ // session exists — even if it has zero messages (a brand-new routed // session). The flicker where `busy` flips true briefly during hydrate // is handled by `threadLoadingState`'s last-visible-user gate. - const loadingSession = isRoutedSessionView && (routeSessionMismatch || (messagesEmpty && !activeSessionId)) + // + // resumeExhausted: the bounded auto-retry in use-route-resume gave up on this + // routed session (gateway RPC + REST fallback failed through every attempt). + // Suppress the loader and show an explicit error + manual Retry instead of + // spinning forever. Gated on the route matching so a stale latch from another + // session can't blank the current one. + const resumeExhausted = isRoutedSessionView && resumeExhaustedSessionId === routedSessionId + + const loadingSession = + !resumeExhausted && isRoutedSessionView && (routeSessionMismatch || (messagesEmpty && !activeSessionId)) + const threadLoading = threadLoadingState(loadingSession, busy, awaitingResponse, lastVisibleIsUser) - const showChatBar = !loadingSession + // Hide the composer in the exhausted error state too: there's no live runtime + // to send to until a retry rebinds one. + const showChatBar = !loadingSession && !resumeExhausted const threadKey = selectedSessionId || activeSessionId || (isRoutedSessionView ? location.pathname : 'new') const modelOptionsQuery = useQuery({ @@ -468,6 +487,21 @@ export function ChatView({ )} + {resumeExhausted && routedSessionId && ( +
+ +
+ +
+
+
+ )} {showChatBar && } diff --git a/apps/desktop/src/app/desktop-controller.tsx b/apps/desktop/src/app/desktop-controller.tsx index 74f544c7099..05dfbbc764f 100644 --- a/apps/desktop/src/app/desktop-controller.tsx +++ b/apps/desktop/src/app/desktop-controller.tsx @@ -54,6 +54,8 @@ import { $gatewayState, $messages, $messagingSessions, + $resumeFailedSessionId, + $resumeExhaustedSessionId, $selectedStoredSessionId, $sessions, $workingSessionIds, @@ -200,6 +202,8 @@ export function DesktopController() { const activeSessionId = useStore($activeSessionId) const currentCwd = useStore($currentCwd) const freshDraftReady = useStore($freshDraftReady) + const resumeFailedSessionId = useStore($resumeFailedSessionId) + const resumeExhaustedSessionId = useStore($resumeExhaustedSessionId) const filePreviewTarget = useStore($filePreviewTarget) const previewTarget = useStore($previewTarget) const selectedStoredSessionId = useStore($selectedStoredSessionId) @@ -889,6 +893,8 @@ export function DesktopController() { gatewayState, locationPathname: location.pathname, resumeSession, + resumeFailedSessionId, + resumeExhaustedSessionId, routedSessionId, runtimeIdByStoredSessionIdRef, selectedStoredSessionId, @@ -1047,6 +1053,7 @@ export function DesktopController() { onReload={reloadFromMessage} onRemoveAttachment={id => void composer.removeAttachment(id)} onRestoreToMessage={restoreToMessage} + onRetryResume={sessionId => void resumeSession(sessionId, true)} onSteer={steerPrompt} onSubmit={submitText} onThreadMessagesChange={handleThreadMessagesChange} diff --git a/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx b/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx index e0d984c37f5..e05f8b748dd 100644 --- a/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx +++ b/apps/desktop/src/app/session/hooks/use-route-resume.test.tsx @@ -2,6 +2,8 @@ import { cleanup, render } from '@testing-library/react' import type { MutableRefObject } from 'react' import { 
afterEach, describe, expect, it, vi } from 'vitest' +import { $resumeExhaustedSessionId, setResumeExhaustedSessionId } from '@/store/session' + import { useRouteResume } from './use-route-resume' interface HarnessProps { @@ -13,6 +15,8 @@ interface HarnessProps { gatewayState: string locationPathname: string resumeSession: (sessionId: string, focus: boolean) => Promise + resumeFailedSessionId?: null | string + resumeExhaustedSessionId?: null | string routedSessionId: null | string runtimeIdByStoredSessionIdRef: MutableRefObject> selectedStoredSessionId: null | string @@ -20,8 +24,12 @@ interface HarnessProps { startFreshSessionDraft: (focus: boolean) => unknown } -function RouteResumeHarness(props: HarnessProps) { - useRouteResume(props) +function RouteResumeHarness({ + resumeFailedSessionId = null, + resumeExhaustedSessionId = null, + ...props +}: HarnessProps) { + useRouteResume({ ...props, resumeExhaustedSessionId, resumeFailedSessionId }) return null } @@ -256,3 +264,212 @@ describe('useRouteResume', () => { expect(resumeSession).toHaveBeenCalledWith('session-1', true) }) }) + +describe('useRouteResume bounded auto-retry after a failed resume', () => { + afterEach(() => { + cleanup() + vi.useRealTimers() + vi.restoreAllMocks() + setResumeExhaustedSessionId(null) + }) + + // Common stranded-window props: gateway open, route on the session, no runtime + // yet, and the ref already synced to the route (resumeSession sets it at entry + // before failing) — the exact state that defeats the main effect's self-heal. 
+ function strandedProps(resumeSession: (sid: string, focus: boolean) => Promise) { + return { + activeSessionId: null, + activeSessionIdRef: { current: null } as MutableRefObject, + creatingSessionRef: { current: false }, + currentView: 'chat', + freshDraftReady: false, + gatewayState: 'open', + locationPathname: '/session-1', + resumeSession, + routedSessionId: 'session-1', + runtimeIdByStoredSessionIdRef: { current: new Map() }, + selectedStoredSessionId: 'session-1', + // Synced to the route by the failed resume's synchronous entry-write. + selectedStoredSessionIdRef: { current: 'session-1' } as MutableRefObject, + startFreshSessionDraft: vi.fn() + } + } + + it('retries the resume on backoff when the routed session is flagged as failed', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + + render() + + // The main effect fires one resume on mount (pathname-changed). Clear it so + // we assert purely the bounded-retry effect's scheduled retry below. + resumeSession.mockClear() + + // No immediate fire — the retry is scheduled behind the backoff timer. + expect(resumeSession).not.toHaveBeenCalled() + + // First backoff window (1s) elapses → one retry. + vi.advanceTimersByTime(1_000) + expect(resumeSession).toHaveBeenCalledTimes(1) + expect(resumeSession).toHaveBeenCalledWith('session-1', true) + }) + + it('does NOT retry a failed session that is not the routed one', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + + // The failure flag points at a different session than the route. 
+ render() + resumeSession.mockClear() // drop the mount resume + + vi.advanceTimersByTime(10_000) + expect(resumeSession).not.toHaveBeenCalled() + }) + + it('skips the scheduled retry if the session already recovered when the timer fires', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + const props = strandedProps(resumeSession) + + render() + resumeSession.mockClear() // drop the mount resume + + // A resume landed while we waited: runtime is now bound. + props.activeSessionIdRef.current = 'runtime-1' + + vi.advanceTimersByTime(8_000) + expect(resumeSession).not.toHaveBeenCalled() + }) + + it('stops retrying after MAX_RESUME_RETRIES consecutive failures', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + const props = strandedProps(resumeSession) + + // Model the real re-arm loop: resumeSession clears $resumeFailedSessionId at + // entry (null) and a repeat failure re-sets it ('session-1'). That null->id + // toggle is what re-runs the effect and advances the bounded counter. The + // routed session never changes, so the counter is NOT reset between cycles. + const { rerender } = render() + resumeSession.mockClear() // drop the mount resume; count only the retries + + for (let i = 0; i < 8; i += 1) { + vi.advanceTimersByTime(8_000) // fire the scheduled retry (if any) + rerender() // cleared at entry + rerender() // re-armed on failure + } + + // Capped at MAX_RESUME_RETRIES (4): a persistently dead backend can't + // hot-loop the resume forever. + expect(resumeSession.mock.calls.length).toBe(4) + + // Once auto-retry gives up, the exhausted latch is armed for the routed + // session so the chat view can swap the perpetual loader for an explicit + // error + manual Retry instead of spinning forever. 
+ expect($resumeExhaustedSessionId.get()).toBe('session-1') + }) + + it('does not arm the exhausted latch while retries remain', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + const props = strandedProps(resumeSession) + + const { rerender } = render() + resumeSession.mockClear() + + // Two failure cycles — still under the 4-retry cap, so the latch must stay + // clear and the loader keeps spinning (auto-recovery hasn't given up yet). + for (let i = 0; i < 2; i += 1) { + vi.advanceTimersByTime(8_000) + rerender() + rerender() + } + + expect($resumeExhaustedSessionId.get()).toBeNull() + }) + + it('clears a stale exhausted latch when the route moves off the stranded session', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + const props = strandedProps(resumeSession) + + // Pre-arm the latch as if this session had exhausted its retries. + setResumeExhaustedSessionId('session-1') + + // Route is now on a different, healthy session that is not flagged as + // failed — the retry effect's "route moved off" branch clears the latch. + render( + + ) + + expect($resumeExhaustedSessionId.get()).toBeNull() + }) + + it('resets the retry counter for a fresh backoff cycle when the exhausted latch clears (manual retry, same session)', () => { + vi.useFakeTimers() + const resumeSession = vi.fn(async () => undefined) + const props = strandedProps(resumeSession) + + // Phase A — exhaust the bounded auto-retry (counter → MAX) like a dead + // backend. The resumeExhaustedSessionId prop stays null here: the hook sets + // the store, which doesn't feed back into the prop in this harness. 
+ const { rerender } = render() + resumeSession.mockClear() + for (let i = 0; i < 8; i += 1) { + vi.advanceTimersByTime(8_000) + rerender() + rerender() + } + expect(resumeSession.mock.calls.length).toBe(4) // capped + expect($resumeExhaustedSessionId.get()).toBe('session-1') + + // Phase B — user clicks Retry on the SAME stranded session. resumeSession + // clears both latches at entry; the exhausted latch's armed->cleared edge + // must reset the attempt counter so a fresh bounded cycle runs, not a single + // one-shot attempt that immediately re-arms the error. Model the prop + // transitions: reflect the armed latch, then clear it (retry), then re-arm + // the failure latch on the fresh failure. + resumeSession.mockClear() + rerender() + rerender() + rerender() + + // A real retry fires again instead of staying pinned at MAX (which would + // dispatch nothing). Without the reset the counter stays >= MAX and this + // advance dispatches zero resumes. + vi.advanceTimersByTime(8_000) + expect(resumeSession.mock.calls.length).toBeGreaterThan(0) + }) + + it('does not burn retry attempts on unrelated re-renders during the backoff window', () => { + vi.useFakeTimers() + const props = strandedProps(vi.fn()) + + // Mount schedules the first backoff timer. Then re-render repeatedly with a + // fresh resumeSession identity (referential instability — a real dep change + // for the retry effect) WITHOUT ever letting the timer fire. The old code + // incremented the attempt counter at schedule time, so >= MAX re-renders + // armed the exhausted error with zero resumes actually dispatched. The fix + // only advances the counter when a timer truly fires, so the latch stays + // clear no matter how many spurious re-renders happen mid-backoff. 
+ const { rerender } = render( + undefined)} /> + ) + for (let j = 0; j < 8; j += 1) { + rerender( + undefined)} /> + ) + } + + expect($resumeExhaustedSessionId.get()).toBeNull() + }) +}) diff --git a/apps/desktop/src/app/session/hooks/use-route-resume.ts b/apps/desktop/src/app/session/hooks/use-route-resume.ts index ad7677cc4b5..1be8da90c64 100644 --- a/apps/desktop/src/app/session/hooks/use-route-resume.ts +++ b/apps/desktop/src/app/session/hooks/use-route-resume.ts @@ -1,6 +1,7 @@ import { type MutableRefObject, useEffect, useRef } from 'react' import { isNewChatRoute } from '@/app/routes' +import { setResumeExhaustedSessionId } from '@/store/session' interface RouteResumeOptions { activeSessionId: string | null @@ -11,6 +12,17 @@ interface RouteResumeOptions { gatewayState: string | undefined locationPathname: string resumeSession: (sessionId: string, focus: boolean) => Promise + // Stored-session id whose most recent resume failed terminally (set by + // useSessionActions, mirrored from $resumeFailedSessionId). While this equals + // routedSessionId the window would otherwise latch on the loader forever, so + // the bounded-retry effect below re-attempts the resume. + resumeFailedSessionId: string | null + // Stored-session id whose bounded auto-retry has EXHAUSTED (mirrored from + // $resumeExhaustedSessionId). Only resumeSession clears this latch (manual + // Retry / reconnect / reselect) — the auto-retry loop never does — so its + // armed->cleared edge is an unambiguous "give me a fresh backoff cycle" + // signal the effect below uses to reset the attempt counter. + resumeExhaustedSessionId: string | null routedSessionId: string | null runtimeIdByStoredSessionIdRef: MutableRefObject> selectedStoredSessionId: string | null @@ -18,6 +30,19 @@ interface RouteResumeOptions { startFreshSessionDraft: (focus: boolean) => unknown } +// Bounded auto-retry for a stranded session window. 
A resume can fail terminally +// (gateway RPC reject + REST fallback failure) on a transiently wedged backend — +// dead provider key, a runaway turn hogging the dispatcher, flaky DNS. Without a +// retry the loader latches forever. We retry with backoff, capped, so a +// genuinely dead backend doesn't hot-loop the resume. +const MAX_RESUME_RETRIES = 4 +const RESUME_RETRY_BASE_MS = 1_000 +const RESUME_RETRY_MAX_MS = 8_000 + +function resumeRetryDelayMs(attempt: number): number { + return Math.min(RESUME_RETRY_MAX_MS, RESUME_RETRY_BASE_MS * 2 ** attempt) +} + // HashRouter boot edge case: pathname briefly reads `/` before the hash is // parsed. If the hash references a real session, defer; resume picks it up // next tick. Without this, ctrl+R on `#/:sessionId` flashes 5 loading states. @@ -49,6 +74,8 @@ export function useRouteResume({ gatewayState, locationPathname, resumeSession, + resumeFailedSessionId, + resumeExhaustedSessionId, routedSessionId, runtimeIdByStoredSessionIdRef, selectedStoredSessionId, @@ -58,6 +85,16 @@ export function useRouteResume({ const lastPathnameRef = useRef(null) const seenGatewayStateRef = useRef(false) const wasGatewayOpenRef = useRef(false) + // Per-session retry bookkeeping for the bounded auto-retry effect below. Keyed + // by the session id we're retrying so switching chats resets the counter. + const retrySessionIdRef = useRef(null) + const retryAttemptRef = useRef(0) + // Tracks the previous exhausted-latch value so we can detect its armed->cleared + // edge. resumeSession clears $resumeExhaustedSessionId on a manual Retry / + // reconnect / reselect; that transition is our cue to reset the attempt counter + // for a fresh backoff cycle on the SAME session (the auto-retry loop itself + // never touches this latch, so it can't spuriously trigger the reset). 
+ const prevResumeExhaustedRef = useRef(null) useEffect(() => { const gatewayOpen = gatewayState === 'open' @@ -139,4 +176,111 @@ export function useRouteResume({ selectedStoredSessionIdRef, startFreshSessionDraft ]) + + // Bounded auto-retry: when the routed session's resume failed terminally + // (resumeFailedSessionId matches the route), schedule a backoff retry so the + // window recovers on its own instead of latching the loader forever. This is + // the safety net the main effect above can't provide: after a failed resume, + // selectedStoredSessionIdRef.current already equals the route (resumeSession + // sets it synchronously at entry) and the pathname/gateway are unchanged, so + // none of stuckOnRoutedSession / pathnameChanged / gatewayBecameOpen fire + // again. resumeSession clears resumeFailedSessionId on its next attempt; a + // success keeps it clear (the effect's guard then no-ops), a repeat failure + // re-arms it and we back off further, capped at MAX_RESUME_RETRIES. + useEffect(() => { + // Detect the exhausted-latch armed->cleared edge for the current route. Only + // resumeSession clears $resumeExhaustedSessionId (manual Retry / reconnect / + // reselect) — the auto-retry loop never touches it — so this transition + // uniquely means "the user asked for another go." Reset the attempt counter + // for a fresh bounded backoff cycle on the SAME session. Without this, + // retryAttemptRef stays pinned at MAX after exhaustion (the !stranded reset + // below only fires on a route CHANGE to a different session), so a manual + // retry on the same stranded session would get exactly ONE attempt and then + // immediately re-arm the exhausted error — never the renewed backoff cycle + // the store/session.ts + use-session-actions.ts comments promise. 
+ const wasExhausted = prevResumeExhaustedRef.current + prevResumeExhaustedRef.current = resumeExhaustedSessionId + if (wasExhausted && wasExhausted === routedSessionId && resumeExhaustedSessionId !== wasExhausted) { + retrySessionIdRef.current = routedSessionId + retryAttemptRef.current = 0 + } + + if (currentView !== 'chat' || gatewayState !== 'open') { + return + } + + const stranded = + Boolean(routedSessionId) && + resumeFailedSessionId === routedSessionId && + !creatingSessionRef.current + + if (!stranded) { + // Route moved off the stranded session (or it recovered) — reset the + // counter so a future failure on another session starts fresh, and clear + // any exhausted-latch armed for a session we're no longer viewing (never + // the current route: that's the error state we want to keep showing). + // resumeSession also clears it on a fresh attempt; this covers a plain + // route-change away from the stranded window. + if (retrySessionIdRef.current !== routedSessionId) { + retrySessionIdRef.current = null + retryAttemptRef.current = 0 + setResumeExhaustedSessionId(current => (current && current !== routedSessionId ? null : current)) + } + + return + } + + // New stranded session id → reset the attempt counter. + if (retrySessionIdRef.current !== routedSessionId) { + retrySessionIdRef.current = routedSessionId + retryAttemptRef.current = 0 + } + + if (retryAttemptRef.current >= MAX_RESUME_RETRIES) { + // Give up auto-retrying a persistently dead backend; the user can still + // reconnect / reselect (which resets the counter via the branch above). + // Surface an explicit error + manual Retry in the chat view instead of + // spinning the loader forever — resumeSession (manual Retry / reconnect / + // reselect) clears this latch and resets the counter for a fresh cycle. 
+ setResumeExhaustedSessionId(routedSessionId) + + return + } + + const attempt = retryAttemptRef.current + const sessionId = routedSessionId as string + + const timer = setTimeout(() => { + // Re-check liveness at fire time: a resume may have landed while we waited. + if ( + creatingSessionRef.current || + selectedStoredSessionIdRef.current !== sessionId || + activeSessionIdRef.current !== null + ) { + return + } + + // Consume an attempt ONLY now that a resume is actually dispatching. + // Incrementing at schedule time (the old behavior) let unrelated dep + // changes during the 1s–8s backoff window — a transient gatewayState + // flip, a non-referentially-stable resumeSession — clear the pending + // timer and re-run the effect, burning an attempt without any resume + // having fired. A flapping backend could then hit MAX in a couple of + // re-renders with far fewer than MAX real attempts. + retryAttemptRef.current += 1 + void resumeSession(sessionId, true) + }, resumeRetryDelayMs(attempt)) + + return () => clearTimeout(timer) + }, [ + activeSessionIdRef, + creatingSessionRef, + currentView, + gatewayState, + resumeSession, + resumeFailedSessionId, + resumeExhaustedSessionId, + routedSessionId, + selectedStoredSessionIdRef + ]) } diff --git a/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx b/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx index 739e8b93756..a84a854ded4 100644 --- a/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx +++ b/apps/desktop/src/app/session/hooks/use-session-actions.test.tsx @@ -3,8 +3,9 @@ import type { MutableRefObject } from 'react' import { useEffect } from 'react' import { afterEach, describe, expect, it, vi } from 'vitest' +import { getSessionMessages } from '@/hermes' import { $activeGatewayProfile, $newChatProfile } from '@/store/profile' -import { $currentCwd } from '@/store/session' +import { $currentCwd, $messages, $resumeFailedSessionId, setMessages, 
setResumeFailedSessionId } from '@/store/session' import type { ClientSessionState } from '../../types' @@ -117,3 +118,142 @@ describe('createBackendSessionForSend profile routing', () => { expect(params).toMatchObject({ profile: 'default' }) }) }) + +// ── Resume failure recovery (the "stuck loading session window" bug) ────────── +// When session.resume rejects AND the REST transcript fallback ALSO fails, the +// hook must (a) not throw out of the fallback (which stranded the loader), and +// (b) arm $resumeFailedSessionId so use-route-resume can retry. A resume that +// succeeds must NOT leave the flag armed. +function ResumeHarness({ + onReady, + requestGateway +}: { + onReady: (resume: (storedSessionId: string, replaceRoute?: boolean) => Promise) => void + requestGateway: (method: string, params?: Record) => Promise +}) { + const ref = (value: T): MutableRefObject => ({ current: value }) + + const actions = useSessionActions({ + activeSessionId: null, + activeSessionIdRef: ref(null), + busyRef: ref(false), + creatingSessionRef: ref(false), + ensureSessionState: () => ({}) as ClientSessionState, + getRouteToken: () => 'token', + navigate: vi.fn() as never, + requestGateway, + runtimeIdByStoredSessionIdRef: ref(new Map()), + selectedStoredSessionId: null, + selectedStoredSessionIdRef: ref(null), + sessionStateByRuntimeIdRef: ref(new Map()), + syncSessionStateToView: vi.fn(), + updateSessionState: (_sessionId, updater) => updater({} as ClientSessionState) + }) + + useEffect(() => { + onReady(actions.resumeSession) + }, [actions.resumeSession, onReady]) + + return null +} + +describe('resumeSession failure recovery', () => { + afterEach(() => { + cleanup() + setResumeFailedSessionId(null) + setMessages([]) + vi.restoreAllMocks() + }) + + async function runResume( + requestGateway: (method: string, params?: Record) => Promise + ): Promise { + let resume: ((storedSessionId: string, replaceRoute?: boolean) => Promise) | null = null + render( (resume = r)} 
requestGateway={requestGateway} />) + await waitFor(() => expect(resume).not.toBeNull()) + await resume!('stored-1', true) + } + + it('arms $resumeFailedSessionId when resume RPC and REST fallback both fail', async () => { + // session.resume rejects (e.g. timeout against a wedged backend)... + const requestGateway = vi.fn(async (method: string) => { + if (method === 'session.resume') { + throw new Error('request timed out: session.resume') + } + + return {} as never + }) + + // ...and the REST transcript fallback also rejects (backend unreachable). + vi.mocked(getSessionMessages).mockRejectedValue(new Error('network down')) + + await runResume(requestGateway) + + // The window is no longer silently stranded: the failure latch is armed for + // the stored session, which use-route-resume consumes to retry. + expect($resumeFailedSessionId.get()).toBe('stored-1') + }) + + it('does NOT arm the failure latch when the resume RPC fails but the REST fallback paints history', async () => { + // session.resume rejects, but the REST transcript fallback succeeds and + // hydrates a readable transcript — the window is NOT stranded. + const requestGateway = vi.fn(async (method: string) => { + if (method === 'session.resume') { + throw new Error('request timed out: session.resume') + } + + return {} as never + }) + + vi.mocked(getSessionMessages).mockResolvedValue({ + messages: [ + { content: 'hello', role: 'user', timestamp: 1 }, + { content: 'hi there', role: 'assistant', timestamp: 2 } + ], + session_id: 'stored-1' + } as never) + + await runResume(requestGateway) + + // Arming here would auto-retry a window that already shows history and, + // on exhaustion, blank that transcript behind the error overlay — a + // regression vs. plain fallback-success. The latch must stay clear. + expect($resumeFailedSessionId.get()).toBeNull() + // The fallback transcript is visible. 
+ expect($messages.get().length).toBeGreaterThan(0) + }) + + it('does NOT throw out of the fallback when REST also fails (no unhandled rejection)', async () => { + const requestGateway = vi.fn(async (method: string) => { + if (method === 'session.resume') { + throw new Error('request timed out: session.resume') + } + + return {} as never + }) + + vi.mocked(getSessionMessages).mockRejectedValue(new Error('network down')) + + // resumeSession must resolve (swallow the fallback failure), not reject. + await expect(runResume(requestGateway)).resolves.toBeUndefined() + }) + + it('leaves the failure latch clear when resume succeeds', async () => { + // Pre-arm to prove a successful resume clears it (entry-clear path). + setResumeFailedSessionId('stored-1') + + const requestGateway = vi.fn(async (method: string, params?: Record) => { + if (method === 'session.resume') { + return { session_id: 'runtime-1', resumed: params?.session_id, messages: [], info: {} } as never + } + + return {} as never + }) + + vi.mocked(getSessionMessages).mockResolvedValue({ messages: [] } as never) + + await runResume(requestGateway) + + expect($resumeFailedSessionId.get()).toBeNull() + }) +}) diff --git a/apps/desktop/src/app/session/hooks/use-session-actions.ts b/apps/desktop/src/app/session/hooks/use-session-actions.ts index 6f7a779e8ea..36dfea759f2 100644 --- a/apps/desktop/src/app/session/hooks/use-session-actions.ts +++ b/apps/desktop/src/app/session/hooks/use-session-actions.ts @@ -38,6 +38,8 @@ import { setFreshDraftReady, setIntroSeed, setMessages, + setResumeExhaustedSessionId, + setResumeFailedSessionId, setSelectedStoredSessionId, setSessions, setSessionStartedAt, @@ -579,6 +581,15 @@ export function useSessionActions({ clearNotifications() setSelectedStoredSessionId(storedSessionId) selectedStoredSessionIdRef.current = storedSessionId + // Optimistically clear any prior resume-failure latch for this session: + // we're attempting a fresh resume, so the self-heal in use-route-resume 
+ // must not keep treating it as stranded. It's re-armed below only if THIS + // attempt fails terminally (RPC reject + REST fallback failure). + setResumeFailedSessionId(current => (current === storedSessionId ? null : current)) + // Also clear the exhausted-latch: a fresh attempt (manual Retry, reconnect, + // reselect) gives the bounded auto-retry counter a clean cycle, so the + // chat view drops the error state and shows the loader again. + setResumeExhaustedSessionId(current => (current === storedSessionId ? null : current)) const warmRuntimeId = runtimeIdByStoredSessionIdRef.current.get(storedSessionId) @@ -769,13 +780,41 @@ export function useSessionActions({ return } - const fallback = await getSessionMessages(storedSessionId, sessionProfile) + // The gateway resume RPC failed. Try the REST transcript as a fallback + // so the window at least shows history. CRITICAL: this fallback must be + // wrapped in its own try — if it ALSO throws (wedged/unreachable backend, + // the common case when resume failed in the first place), an unguarded + // throw here skips setMessages AND leaves activeSessionId null with an + // empty transcript. That is the exact state the thread loader latches on + // forever (messagesEmpty && !activeSessionId) with no recovery path — + // the "open in new window stays stuck loading, even after a nap" bug. + try { + const fallback = await getSessionMessages(storedSessionId, sessionProfile) - if (!isCurrentResume()) { - return + if (!isCurrentResume()) { + return + } + + setMessages(preserveLocalAssistantErrors(toChatMessages(fallback.messages), $messages.get())) + } catch { + // Fallback also failed: nothing to paint. Leave whatever messages are + // already shown and fall through to the empty-transcript check below, + // which arms the resume-failure latch so use-route-resume's bounded + // backoff retry re-attempts the resume instead of stranding the loader. 
+ } + + if (isCurrentResume() && $messages.get().length === 0) { + // Arm the self-heal ONLY when the window is still empty: the gateway + // resume rejected AND the REST fallback failed to paint a transcript. + // That is the exact stranded state the loader latches on + // (messagesEmpty && !activeSessionId), and matches $resumeFailedSessionId's + // documented contract. If the REST fallback DID paint history, the + // window is readable — arming here would needlessly auto-retry and, + // once retries exhaust, blank that visible transcript behind the + // exhausted-state error overlay (a regression vs. plain fallback success). + setResumeFailedSessionId(storedSessionId) } - setMessages(preserveLocalAssistantErrors(toChatMessages(fallback.messages), $messages.get())) notifyError(err, copy.resumeFailed) } finally { if (isCurrentResume()) { diff --git a/apps/desktop/src/i18n/en.ts b/apps/desktop/src/i18n/en.ts index 70720adec1e..3c1a7ec3879 100644 --- a/apps/desktop/src/i18n/en.ts +++ b/apps/desktop/src/i18n/en.ts @@ -1843,6 +1843,9 @@ export const en: Translations = { regenerateFailed: 'Regenerate failed', editFailed: 'Edit failed', resumeFailed: 'Resume failed', + resumeStrandedTitle: "Couldn't load this session", + resumeStrandedBody: 'The connection to this session failed and automatic retries gave up. 
Check that the gateway is running, then try again.', + resumeRetry: 'Retry', nothingToBranch: 'Nothing to branch', branchNeedsChat: 'Start or resume a chat before branching.', sessionBusy: 'Session busy', diff --git a/apps/desktop/src/i18n/ja.ts b/apps/desktop/src/i18n/ja.ts index 48b46ac9267..904e4b25c53 100644 --- a/apps/desktop/src/i18n/ja.ts +++ b/apps/desktop/src/i18n/ja.ts @@ -1974,6 +1974,9 @@ export const ja = defineLocale({ regenerateFailed: '再生成に失敗しました', editFailed: '編集に失敗しました', resumeFailed: '再開に失敗しました', + resumeStrandedTitle: 'このセッションを読み込めませんでした', + resumeStrandedBody: 'このセッションへの接続に失敗し、自動再試行も停止しました。ゲートウェイが実行中か確認してから、もう一度お試しください。', + resumeRetry: '再試行', nothingToBranch: 'ブランチするものがありません', branchNeedsChat: 'ブランチする前にチャットを開始または再開してください。', sessionBusy: 'セッションが使用中', diff --git a/apps/desktop/src/i18n/types.ts b/apps/desktop/src/i18n/types.ts index dc3be24765c..dcf1028fb4b 100644 --- a/apps/desktop/src/i18n/types.ts +++ b/apps/desktop/src/i18n/types.ts @@ -1481,6 +1481,9 @@ export interface Translations { regenerateFailed: string editFailed: string resumeFailed: string + resumeStrandedTitle: string + resumeStrandedBody: string + resumeRetry: string nothingToBranch: string branchNeedsChat: string sessionBusy: string diff --git a/apps/desktop/src/i18n/zh-hant.ts b/apps/desktop/src/i18n/zh-hant.ts index 1f5be40cad5..8f208aff341 100644 --- a/apps/desktop/src/i18n/zh-hant.ts +++ b/apps/desktop/src/i18n/zh-hant.ts @@ -1914,6 +1914,9 @@ export const zhHant = defineLocale({ regenerateFailed: '重新生成失敗', editFailed: '編輯失敗', resumeFailed: '繼續失敗', + resumeStrandedTitle: '無法載入此工作階段', + resumeStrandedBody: '與此工作階段的連線失敗,自動重試已停止。請確認閘道正在執行,然後重試。', + resumeRetry: '重試', nothingToBranch: '沒有可分支的內容', branchNeedsChat: '分支前請先開始或繼續一個聊天。', sessionBusy: '工作階段忙碌中', diff --git a/apps/desktop/src/i18n/zh.ts b/apps/desktop/src/i18n/zh.ts index 6a2e426eee1..f368d3585ca 100644 --- a/apps/desktop/src/i18n/zh.ts +++ b/apps/desktop/src/i18n/zh.ts @@ -2021,6 +2021,9 @@ export const zh: 
Translations = { regenerateFailed: '重新生成失败', editFailed: '编辑失败', resumeFailed: '恢复失败', + resumeStrandedTitle: '无法加载此会话', + resumeStrandedBody: '与此会话的连接失败,自动重试已停止。请确认网关正在运行,然后重试。', + resumeRetry: '重试', nothingToBranch: '没有可分支的内容', branchNeedsChat: '分支前请先开始或恢复一个对话。', sessionBusy: '会话忙碌中', diff --git a/apps/desktop/src/store/session.ts b/apps/desktop/src/store/session.ts index e40484cfec1..958801df1f3 100644 --- a/apps/desktop/src/store/session.ts +++ b/apps/desktop/src/store/session.ts @@ -218,6 +218,23 @@ export const $lastVisibleMessageIsUser = computed($messages, lastVisibleMessageI export const $freshDraftReady = atom(false) export const $busy = atom(false) export const $awaitingResponse = atom(false) +// Stored-session id whose most recent resume FAILED terminally (the gateway RPC +// rejected AND the REST transcript fallback also failed), leaving the window +// with no runtime and an empty transcript. Drives use-route-resume's self-heal: +// while this matches the routed session the loader would otherwise latch +// forever (messagesEmpty && !activeSessionId), so the hook re-attempts the +// resume on the next render/focus/reconnect instead of stranding the window. +// Null whenever the active route has a healthy (or in-flight) resume. +export const $resumeFailedSessionId = atom(null) +// Stored-session id whose resume has EXHAUSTED its bounded auto-retries (the +// terminal-failure latch above kept failing through all MAX_RESUME_RETRIES +// attempts). Distinct from $resumeFailedSessionId, which is armed *during* the +// backoff window too: this fires only once auto-recovery has given up, so the +// chat view can swap the perpetual loader for an explicit error + manual Retry +// affordance. A fresh resumeSession() (manual Retry, reconnect, reselect) +// clears it and resets the retry counter. Null whenever the active route has a +// healthy, in-flight, or still-auto-retrying resume. 
+export const $resumeExhaustedSessionId = atom(null) export const $currentModel = atom(storedString(COMPOSER_MODEL_KEY) ?? '') export const $currentProvider = atom(storedString(COMPOSER_PROVIDER_KEY) ?? '') export const $currentReasoningEffort = atom(storedString(COMPOSER_EFFORT_KEY) ?? '') @@ -262,6 +279,8 @@ export const setActiveSessionId = (next: Updater) => updateAtom($ export const setSelectedStoredSessionId = (next: Updater) => updateAtom($selectedStoredSessionId, next) export const setMessages = (next: Updater) => updateAtom($messages, next) export const setFreshDraftReady = (next: Updater) => updateAtom($freshDraftReady, next) +export const setResumeFailedSessionId = (next: Updater) => updateAtom($resumeFailedSessionId, next) +export const setResumeExhaustedSessionId = (next: Updater) => updateAtom($resumeExhaustedSessionId, next) export const setBusy = (next: Updater) => updateAtom($busy, next) export const setAwaitingResponse = (next: Updater) => updateAtom($awaitingResponse, next)