Merge pull request #52704 from NousResearch/bb/desktop-root-boundary-recover

fix(desktop): recover root error boundary from transient render races (salvage #41787)
2026-06-28 11:32:22 +00:00 · 2026-06-25 16:17:18 -05:00 · 2026-06-25 16:17:18 -05:00 · c4ba4770eb
commit c4ba4770eb
parent 43f9d24513 2e3efce66e
2 changed files with 183 additions and 0 deletions
--- a/apps/desktop/src/components/error-boundary.test.tsx
+++ b/apps/desktop/src/components/error-boundary.test.tsx
@ -0,0 +1,114 @@
+import { cleanup, render, screen, waitFor } from '@testing-library/react'
+import { afterEach, describe, expect, it, vi } from 'vitest'
+
+import { ErrorBoundary } from './error-boundary'
+
+// The real assistant-ui stale-index throw the root boundary must survive
+// (open chat / session switch render race), reproduced verbatim so the
+// recoverable-pattern match is exercised against the actual error text.
+const TAP_ERROR = 'tapClientLookup: Index 23 out of bounds (length: 18)'
+
+// Throws purely from `box.error` so a render replay (React dev) throws
+// identically; the test mutates the box only from timers, never during render —
+// modelling a transient race that clears once the boundary remounts against
+// fresh state.
+function makeBomb(box: { error: Error | null }) {
+  return function Bomb() {
+    if (box.error) {
+      throw box.error
+    }
+
+    return <div>recovered</div>
+  }
+}
+
+const RELOAD_WINDOW = { name: 'Reload window', role: 'button' } as const
+
+const countRecoverWarnings = (calls: unknown[][]) =>
+  calls.filter(call => call.some(value => String(value).includes('auto-recovering from transient render error'))).length
+
+describe('ErrorBoundary root auto-recovery', () => {
+  afterEach(() => {
+    cleanup()
+    vi.restoreAllMocks()
+  })
+
+  it('recovers the root boundary from a transient stale-index render race', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    // Disarm before the scheduled next-tick reset re-renders the subtree, so the
+    // race genuinely resolves on recovery instead of throwing forever.
+    queueMicrotask(() => {
+      box.error = null
+    })
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByText('recovered')).toBeTruthy())
+    expect(screen.queryByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeNull()
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBeGreaterThanOrEqual(1)
+  })
+
+  it('stops auto-recovering a persistent error after the cap and leaves the fallback up', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    // Never disarmed: the boundary must not spin a reset -> throw -> reset loop
+    // forever — it caps recovery and surfaces the fallback to the user.
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    // The fallback showing up at all IS the cap working: with unbounded recovery
+    // the boundary would reset -> throw -> reset forever and 'Reload window'
+    // would never render (this waitFor would hang). The recovery attempts are
+    // bounded by MAX_RECOVERIES (3), never an unbounded storm.
+    await waitFor(() => expect(screen.getByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeTruthy())
+    const warnings = countRecoverWarnings(warnSpy.mock.calls)
+    expect(warnings).toBeGreaterThanOrEqual(1)
+    expect(warnings).toBeLessThanOrEqual(3)
+  })
+
+  it('does not auto-recover a non-root boundary', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary fallback={() => <div>scoped-fallback</div>} label="thread">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByText('scoped-fallback')).toBeTruthy())
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBe(0)
+  })
+
+  it('does not auto-recover an unrecognized error even at the root', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error('some unrelated application error') }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeTruthy())
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBe(0)
+  })
+})
--- a/apps/desktop/src/components/error-boundary.tsx
+++ b/apps/desktop/src/components/error-boundary.tsx
@ -20,8 +20,31 @@ interface ErrorBoundaryState {
  error: Error | null
 }

+// assistant-ui can momentarily render a stale message index against a thread
+// that just shrank (session switch / teardown), throwing a render-race error
+// that latches the WHOLE app on the root "Reload window" screen. These throws
+// clear themselves on the next render against fresh state, so the root boundary
+// recovers itself once the storm settles instead of stranding the user.
+const RECOVERABLE_ERROR_PATTERNS = [
+  /tapClientLookup: Index \d+\s+out of bounds \(length:\s*\d+\)/i,
+  /Cannot read properties of undefined \(reading 'type'\)/i,
+  /Tried to unmount a fiber that is already unmounted/i
+]
+
+const isRecoverableDesktopRenderError = (error: Error): boolean =>
+  RECOVERABLE_ERROR_PATTERNS.some(pattern => pattern.test(error.message))
+
+// Bound auto-recovery so a *persistent* (non-transient) error can't spin the
+// boundary in a reset -> throw -> reset loop: at most MAX_RECOVERIES attempts
+// inside RECOVERY_WINDOW_MS, after which the fallback is left up for the user.
+const MAX_RECOVERIES = 3
+const RECOVERY_WINDOW_MS = 5_000
+
 export class ErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundaryState> {
  state: ErrorBoundaryState = { error: null }
+  private recoverTimer: null | number = null
+  private recoverCount = 0
+  private recoverWindowStart = 0

  static getDerivedStateFromError(error: Error): ErrorBoundaryState {
    return { error }
@ -31,9 +54,55 @@ export class ErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundarySt
    const tag = this.props.label ? `[error-boundary:${this.props.label}]` : '[error-boundary]'
    console.error(tag, error, info.componentStack)
    this.props.onError?.(error, info)
+
+    if (this.props.label === 'root' && isRecoverableDesktopRenderError(error) && this.canRecover()) {
+      console.warn(`${tag} auto-recovering from transient render error`, error.message)
+      this.scheduleRecover()
+    }
+  }
+
+  componentWillUnmount() {
+    this.clearRecoverTimer()
  }

  reset = () => {
+    this.clearRecoverTimer()
+    // A manual retry (button) starts a clean recovery budget.
+    this.recoverCount = 0
+    this.recoverWindowStart = 0
+    this.setState({ error: null })
+  }
+
+  // True while the boundary still has recovery budget. Each storm gets a fresh
+  // window; auto-recovery (autoReset) deliberately does NOT reset the count, so
+  // a tight reset -> throw loop is capped at MAX_RECOVERIES and then falls back.
+  private canRecover(): boolean {
+    const now = Date.now()
+
+    if (now - this.recoverWindowStart > RECOVERY_WINDOW_MS) {
+      this.recoverWindowStart = now
+      this.recoverCount = 0
+    }
+
+    this.recoverCount += 1
+
+    return this.recoverCount <= MAX_RECOVERIES
+  }
+
+  private clearRecoverTimer() {
+    if (this.recoverTimer !== null) {
+      window.clearTimeout(this.recoverTimer)
+      this.recoverTimer = null
+    }
+  }
+
+  private scheduleRecover() {
+    this.clearRecoverTimer()
+    this.recoverTimer = window.setTimeout(this.autoReset, 0)
+  }
+
+  private autoReset = () => {
+    this.recoverTimer = null
    this.setState({ error: null })
  }