fix(desktop): recover the root error boundary from transient render races

A stale-index render race in assistant-ui (a just-shrunk thread rendered at an old message index during a session switch / teardown) throws errors like "tapClientLookup: Index N out of bounds", "Cannot read properties of undefined (reading 'type')", or "Tried to unmount a fiber that is already unmounted". These bubble to the root ErrorBoundary and latch the WHOLE desktop app on the "Reload window" fallback even though the next render against fresh state would be fine. Teach the root boundary to treat that small set of known-transient renderer errors as recoverable: log them and schedule a next-tick reset() so React re-renders against current state instead of stranding the user on the fallback. Auto-recovery is BOUNDED -- at most MAX_RECOVERIES (3) attempts within a 5s window -- so a genuinely persistent error can't spin the boundary in a reset -> throw -> reset loop; after the budget is spent the fallback is left up for the user. Manual retry (the button) resets the budget. Only the root boundary auto-recovers; scoped boundaries keep their own fallbacks, and unrecognized errors are never swallowed. Tests: transient race recovers (fallback never sticks), a persistent recoverable error stops at the cap and surfaces the fallback (proving the loop is bounded), and neither a non-root boundary nor an unrecognized root error auto-recovers. Closes #41693. Supersedes #41787 by @izumi0uu, reimplemented with a bounded recovery budget so a non-transient error can't loop forever. Co-authored-by: izumi0uu <izumi0uu@gmail.com>
2026-06-29 11:42:04 +00:00 · 2026-06-25 16:15:20 -05:00 · 2026-06-25 16:15:20 -05:00 · 2e3efce66e
commit 2e3efce66e
parent c6575df927
2 changed files with 183 additions and 0 deletions
--- a/apps/desktop/src/components/error-boundary.test.tsx
+++ b/apps/desktop/src/components/error-boundary.test.tsx
@ -0,0 +1,114 @@
+import { cleanup, render, screen, waitFor } from '@testing-library/react'
+import { afterEach, describe, expect, it, vi } from 'vitest'
+
+import { ErrorBoundary } from './error-boundary'
+
+// The real assistant-ui stale-index throw the root boundary must survive
+// (open chat / session switch render race), reproduced verbatim so the
+// recoverable-pattern match is exercised against the actual error text.
+const TAP_ERROR = 'tapClientLookup: Index 23 out of bounds (length: 18)'
+
+// Throws purely from `box.error` so a render replay (React dev) throws
+// identically; the test mutates the box only from timers, never during render —
+// modelling a transient race that clears once the boundary remounts against
+// fresh state.
+function makeBomb(box: { error: Error | null }) {
+  return function Bomb() {
+    if (box.error) {
+      throw box.error
+    }
+
+    return <div>recovered</div>
+  }
+}
+
+const RELOAD_WINDOW = { name: 'Reload window', role: 'button' } as const
+
+const countRecoverWarnings = (calls: unknown[][]) =>
+  calls.filter(call => call.some(value => String(value).includes('auto-recovering from transient render error'))).length
+
+describe('ErrorBoundary root auto-recovery', () => {
+  afterEach(() => {
+    cleanup()
+    vi.restoreAllMocks()
+  })
+
+  it('recovers the root boundary from a transient stale-index render race', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    // Disarm before the scheduled next-tick reset re-renders the subtree, so the
+    // race genuinely resolves on recovery instead of throwing forever.
+    queueMicrotask(() => {
+      box.error = null
+    })
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByText('recovered')).toBeTruthy())
+    expect(screen.queryByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeNull()
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBeGreaterThanOrEqual(1)
+  })
+
+  it('stops auto-recovering a persistent error after the cap and leaves the fallback up', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    // Never disarmed: the boundary must not spin a reset -> throw -> reset loop
+    // forever — it caps recovery and surfaces the fallback to the user.
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    // The fallback showing up at all IS the cap working: with unbounded recovery
+    // the boundary would reset -> throw -> reset forever and 'Reload window'
+    // would never render (this waitFor would hang). The recovery attempts are
+    // bounded by MAX_RECOVERIES (3), never an unbounded storm.
+    await waitFor(() => expect(screen.getByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeTruthy())
+    const warnings = countRecoverWarnings(warnSpy.mock.calls)
+    expect(warnings).toBeGreaterThanOrEqual(1)
+    expect(warnings).toBeLessThanOrEqual(3)
+  })
+
+  it('does not auto-recover a non-root boundary', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary fallback={() => <div>scoped-fallback</div>} label="thread">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByText('scoped-fallback')).toBeTruthy())
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBe(0)
+  })
+
+  it('does not auto-recover an unrecognized error even at the root', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error('some unrelated application error') }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeTruthy())
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBe(0)
+  })
+})
--- a/apps/desktop/src/components/error-boundary.tsx
+++ b/apps/desktop/src/components/error-boundary.tsx
@ -20,8 +20,31 @@ interface ErrorBoundaryState {
  error: Error | null
 }

+// assistant-ui can momentarily render a stale message index against a thread
+// that just shrank (session switch / teardown), throwing a render-race error
+// that latches the WHOLE app on the root "Reload window" screen. These throws
+// clear themselves on the next render against fresh state, so the root boundary
+// recovers itself once the storm settles instead of stranding the user.
+const RECOVERABLE_ERROR_PATTERNS = [
+  /tapClientLookup: Index \d+\s+out of bounds \(length:\s*\d+\)/i,
+  /Cannot read properties of undefined \(reading 'type'\)/i,
+  /Tried to unmount a fiber that is already unmounted/i
+]
+
+const isRecoverableDesktopRenderError = (error: Error): boolean =>
+  RECOVERABLE_ERROR_PATTERNS.some(pattern => pattern.test(error.message))
+
+// Bound auto-recovery so a *persistent* (non-transient) error can't spin the
+// boundary in a reset -> throw -> reset loop: at most MAX_RECOVERIES attempts
+// inside RECOVERY_WINDOW_MS, after which the fallback is left up for the user.
+const MAX_RECOVERIES = 3
+const RECOVERY_WINDOW_MS = 5_000
+
 export class ErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundaryState> {
  state: ErrorBoundaryState = { error: null }
+  private recoverTimer: null | number = null
+  private recoverCount = 0
+  private recoverWindowStart = 0

  static getDerivedStateFromError(error: Error): ErrorBoundaryState {
    return { error }
@ -31,9 +54,55 @@ export class ErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundarySt
    const tag = this.props.label ? `[error-boundary:${this.props.label}]` : '[error-boundary]'
    console.error(tag, error, info.componentStack)
    this.props.onError?.(error, info)
+
+    if (this.props.label === 'root' && isRecoverableDesktopRenderError(error) && this.canRecover()) {
+      console.warn(`${tag} auto-recovering from transient render error`, error.message)
+      this.scheduleRecover()
+    }
+  }
+
+  componentWillUnmount() {
+    this.clearRecoverTimer()
  }

  reset = () => {
+    this.clearRecoverTimer()
+    // A manual retry (button) starts a clean recovery budget.
+    this.recoverCount = 0
+    this.recoverWindowStart = 0
+    this.setState({ error: null })
+  }
+
+  // True while the boundary still has recovery budget. Each storm gets a fresh
+  // window; auto-recovery (autoReset) deliberately does NOT reset the count, so
+  // a tight reset -> throw loop is capped at MAX_RECOVERIES and then falls back.
+  private canRecover(): boolean {
+    const now = Date.now()
+
+    if (now - this.recoverWindowStart > RECOVERY_WINDOW_MS) {
+      this.recoverWindowStart = now
+      this.recoverCount = 0
+    }
+
+    this.recoverCount += 1
+
+    return this.recoverCount <= MAX_RECOVERIES
+  }
+
+  private clearRecoverTimer() {
+    if (this.recoverTimer !== null) {
+      window.clearTimeout(this.recoverTimer)
+      this.recoverTimer = null
+    }
+  }
+
+  private scheduleRecover() {
+    this.clearRecoverTimer()
+    this.recoverTimer = window.setTimeout(this.autoReset, 0)
+  }
+
+  private autoReset = () => {
+    this.recoverTimer = null
    this.setState({ error: null })
  }