From 2e3efce66ebd0a144a0733333e1c0d31d9f65c5d Mon Sep 17 00:00:00 2001
From: Brooklyn Nicholson <brooklyn.bb.nicholson@gmail.com>
Date: Thu, 25 Jun 2026 16:15:20 -0500
Subject: [PATCH] fix(desktop): recover the root error boundary from transient
 render races

A stale-index render race in assistant-ui (a just-shrunk thread rendered
at an old message index during a session switch / teardown) throws
errors like "tapClientLookup: Index N out of bounds", "Cannot read
properties of undefined (reading 'type')", or "Tried to unmount a fiber
that is already unmounted". These bubble to the root ErrorBoundary and
latch the WHOLE desktop app on the "Reload window" fallback even though
the next render against fresh state would be fine.

Teach the root boundary to treat that small set of known-transient
renderer errors as recoverable: log them and schedule a next-tick
reset() so React re-renders against current state instead of stranding
the user on the fallback.

Auto-recovery is BOUNDED -- at most MAX_RECOVERIES (3) attempts within a
5s window -- so a genuinely persistent error can't spin the boundary in
a reset -> throw -> reset loop; after the budget is spent the fallback
is left up for the user. Manual retry (the button) resets the budget.
Only the root boundary auto-recovers; scoped boundaries keep their own
fallbacks, and unrecognized errors are never swallowed.

Tests: transient race recovers (fallback never sticks), a persistent
recoverable error stops at the cap and surfaces the fallback (proving
the loop is bounded), and neither a non-root boundary nor an
unrecognized root error auto-recovers.

Closes #41693. Supersedes #41787 by @izumi0uu, reimplemented with a
bounded recovery budget so a non-transient error can't loop forever.

Co-authored-by: izumi0uu <izumi0uu@gmail.com>
---
 .../src/components/error-boundary.test.tsx    | 114 ++++++++++++++++++
 .../desktop/src/components/error-boundary.tsx |  69 +++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 apps/desktop/src/components/error-boundary.test.tsx
diff --git a/apps/desktop/src/components/error-boundary.test.tsx b/apps/desktop/src/components/error-boundary.test.tsx
new file mode 100644
index 00000000000..cd5f3a547bb
--- /dev/null
+++ b/apps/desktop/src/components/error-boundary.test.tsx
@@ -0,0 +1,114 @@
+import { cleanup, render, screen, waitFor } from '@testing-library/react'
+import { afterEach, describe, expect, it, vi } from 'vitest'
+
+import { ErrorBoundary } from './error-boundary'
+
+// The real assistant-ui stale-index throw the root boundary must survive
+// (open chat / session switch render race), reproduced verbatim so the
+// recoverable-pattern match is exercised against the actual error text.
+const TAP_ERROR = 'tapClientLookup: Index 23 out of bounds (length: 18)'
+
+// Throws purely from `box.error` so a render replay (React dev) throws
+// identically; the test mutates the box only from timers, never during render —
+// modelling a transient race that clears once the boundary remounts against
+// fresh state.
+function makeBomb(box: { error: Error | null }) {
+  return function Bomb() {
+    if (box.error) {
+      throw box.error
+    }
+
+    return <div>recovered</div>
+  }
+}
+
+const RELOAD_WINDOW = { name: 'Reload window', role: 'button' } as const
+
+const countRecoverWarnings = (calls: unknown[][]) =>
+  calls.filter(call => call.some(value => String(value).includes('auto-recovering from transient render error'))).length
+
+describe('ErrorBoundary root auto-recovery', () => {
+  afterEach(() => {
+    cleanup()
+    vi.restoreAllMocks()
+  })
+
+  it('recovers the root boundary from a transient stale-index render race', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    // Disarm before the scheduled next-tick reset re-renders the subtree, so the
+    // race genuinely resolves on recovery instead of throwing forever.
+    queueMicrotask(() => {
+      box.error = null
+    })
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByText('recovered')).toBeTruthy())
+    expect(screen.queryByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeNull()
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBeGreaterThanOrEqual(1)
+  })
+
+  it('stops auto-recovering a persistent error after the cap and leaves the fallback up', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    // Never disarmed: the boundary must not spin a reset -> throw -> reset loop
+    // forever — it caps recovery and surfaces the fallback to the user.
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    // The fallback showing up at all IS the cap working: with unbounded recovery
+    // the boundary would reset -> throw -> reset forever and 'Reload window'
+    // would never render (this waitFor would hang). The recovery attempts are
+    // bounded by MAX_RECOVERIES (3), never an unbounded storm.
+    await waitFor(() => expect(screen.getByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeTruthy())
+    const warnings = countRecoverWarnings(warnSpy.mock.calls)
+    expect(warnings).toBeGreaterThanOrEqual(1)
+    expect(warnings).toBeLessThanOrEqual(3)
+  })
+
+  it('does not auto-recover a non-root boundary', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error(TAP_ERROR) }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary fallback={() => <div>scoped-fallback</div>} label="thread">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByText('scoped-fallback')).toBeTruthy())
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBe(0)
+  })
+
+  it('does not auto-recover an unrecognized error even at the root', async () => {
+    vi.spyOn(console, 'error').mockImplementation(() => {})
+    const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {})
+    const box: { error: Error | null } = { error: new Error('some unrelated application error') }
+    const Bomb = makeBomb(box)
+
+    render(
+      <ErrorBoundary label="root">
+        <Bomb />
+      </ErrorBoundary>
+    )
+
+    await waitFor(() => expect(screen.getByRole(RELOAD_WINDOW.role, { name: RELOAD_WINDOW.name })).toBeTruthy())
+    expect(countRecoverWarnings(warnSpy.mock.calls)).toBe(0)
+  })
+})
diff --git a/apps/desktop/src/components/error-boundary.tsx b/apps/desktop/src/components/error-boundary.tsx
index 87b6b7743c5..f607760427f 100644
--- a/apps/desktop/src/components/error-boundary.tsx
+++ b/apps/desktop/src/components/error-boundary.tsx
@@ -20,8 +20,31 @@ interface ErrorBoundaryState {
   error: Error | null
 }
 
+// assistant-ui can momentarily render a stale message index against a thread
+// that just shrank (session switch / teardown), throwing a render-race error
+// that latches the WHOLE app on the root "Reload window" screen. These throws
+// clear themselves on the next render against fresh state, so the root boundary
+// recovers itself once the storm settles instead of stranding the user.
+const RECOVERABLE_ERROR_PATTERNS = [
+  /tapClientLookup: Index \d+\s+out of bounds \(length:\s*\d+\)/i,
+  /Cannot read properties of undefined \(reading 'type'\)/i,
+  /Tried to unmount a fiber that is already unmounted/i
+]
+
+const isRecoverableDesktopRenderError = (error: Error): boolean =>
+  RECOVERABLE_ERROR_PATTERNS.some(pattern => pattern.test(error.message))
+
+// Bound auto-recovery so a *persistent* (non-transient) error can't spin the
+// boundary in a reset -> throw -> reset loop: at most MAX_RECOVERIES attempts
+// inside RECOVERY_WINDOW_MS, after which the fallback is left up for the user.
+const MAX_RECOVERIES = 3
+const RECOVERY_WINDOW_MS = 5_000
+
 export class ErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundaryState> {
   state: ErrorBoundaryState = { error: null }
+  private recoverTimer: null | number = null
+  private recoverCount = 0
+  private recoverWindowStart = 0
 
   static getDerivedStateFromError(error: Error): ErrorBoundaryState {
     return { error }
@@ -31,9 +54,55 @@ export class ErrorBoundary extends Component<ErrorBoundaryProps, ErrorBoundarySt
     const tag = this.props.label ? `[error-boundary:${this.props.label}]` : '[error-boundary]'
     console.error(tag, error, info.componentStack)
     this.props.onError?.(error, info)
+
+    if (this.props.label === 'root' && isRecoverableDesktopRenderError(error) && this.canRecover()) {
+      console.warn(`${tag} auto-recovering from transient render error`, error.message)
+      this.scheduleRecover()
+    }
+  }
+
+  componentWillUnmount() {
+    this.clearRecoverTimer()
   }
 
   reset = () => {
+    this.clearRecoverTimer()
+    // A manual retry (button) starts a clean recovery budget.
+    this.recoverCount = 0
+    this.recoverWindowStart = 0
+    this.setState({ error: null })
+  }
+
+  // True while the boundary still has recovery budget. Each storm gets a fresh
+  // window; auto-recovery (autoReset) deliberately does NOT reset the count, so
+  // a tight reset -> throw loop is capped at MAX_RECOVERIES and then falls back.
+  private canRecover(): boolean {
+    const now = Date.now()
+
+    if (now - this.recoverWindowStart > RECOVERY_WINDOW_MS) {
+      this.recoverWindowStart = now
+      this.recoverCount = 0
+    }
+
+    this.recoverCount += 1
+
+    return this.recoverCount <= MAX_RECOVERIES
+  }
+
+  private clearRecoverTimer() {
+    if (this.recoverTimer !== null) {
+      window.clearTimeout(this.recoverTimer)
+      this.recoverTimer = null
+    }
+  }
+
+  private scheduleRecover() {
+    this.clearRecoverTimer()
+    this.recoverTimer = window.setTimeout(this.autoReset, 0)
+  }
+
+  private autoReset = () => {
+    this.recoverTimer = null
     this.setState({ error: null })
   }