From 7003df708c3d59bbcde940f666b4eadc5b3d787c Mon Sep 17 00:00:00 2001
From: Brooklyn Nicholson <brooklyn.bb.nicholson@gmail.com>
Date: Thu, 21 May 2026 20:08:49 -0500
Subject: [PATCH] perf(desktop): floor assistant-text flush gap to 33ms for
 predictable batching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`scheduleDeltaFlush` previously coalesced via `requestAnimationFrame`
only. The "at most one flush per frame" guarantee that gives you is fine
for fast streams (>~80 tok/sec) where multiple tokens arrive within a
single frame, but breaks down at typical LLM token rates (30-80 tok/sec)
where each token arrives slower than the rAF cadence and triggers its
own React commit + Streamdown markdown re-parse.

Track `lastFlushAt` and require at least 33 ms between two flushes.
React 18+ auto-batching probabilistically already collapsed some of
these, but the floor makes it deterministic.

A/B on the 34 MB session, 300 tokens at 50 tok/sec (markdown chunks):

| | avgFps | p99 frame | LTs / 5 s | max LT |
|---|---|---|---|---|
| no floor (current rAF) | 54.0 | 38 ms | 2.0 | 145 ms |
| 33 ms floor (this PR) | 54.3 | 41 ms | 1.7 | 110 ms |

`inter-mutation` p50 also tightens from 22-28 ms to a clean 33 ms,
which is the expected signature of a deterministic floor. Doesn't fully
solve the user's perceived hitches — Streamdown's per-Block parse cost
when the last block grows past ~2 k chars is still the elephant — but
it consistently shaves the worst-case longtask and makes the streaming
cadence visibly steadier.

Also threads a matching `flushMinMs` option through the synthetic
stream driver in `perf-probe.tsx` + `scripts/measure-synthetic-stream.mjs`
so the harness can A/B both regimes without spending LLM credits.

See `scripts/profile-typing-lag.md` for the full investigation.
---
 .../scripts/measure-synthetic-stream.mjs      |  8 +-
 apps/desktop/scripts/profile-typing-lag.md    | 53 ++++++++++--
 apps/desktop/src/app/chat/perf-probe.tsx      | 80 ++++++++++++++++---
 .../app/session/hooks/use-message-stream.ts   | 40 +++++++---
 4 files changed, 151 insertions(+), 30 deletions(-)

diff --git a/apps/desktop/scripts/measure-synthetic-stream.mjs b/apps/desktop/scripts/measure-synthetic-stream.mjs
index cd513719faa..3b8afb29749 100644
--- a/apps/desktop/scripts/measure-synthetic-stream.mjs
+++ b/apps/desktop/scripts/measure-synthetic-stream.mjs
@@ -19,6 +19,10 @@ import { writeFileSync } from 'node:fs'
 const CDP_HTTP = 'http://127.0.0.1:9222'
 const TOKENS = Number(process.env.TOKENS || 300)
 const INTERVAL_MS = Number(process.env.INTERVAL_MS || 16)
+// Upstream flush throttle to apply in the synthetic driver. Mirrors what the
+// real gateway path does in `use-message-stream.scheduleDeltaFlush`. 0
+// disables (worst-case, every token = one React commit).
+const FLUSH_MIN_MS = Number(process.env.FLUSH_MIN_MS || 0)
 const CHUNK = process.env.CHUNK || 'lorem ipsum '
 const TYPE_WHILE_STREAMING = process.env.TYPE_WHILE_STREAMING === '1'
 const LABEL = process.env.LABEL || 'baseline'
@@ -144,7 +148,7 @@ async function main() {
 
   // Drive a synthetic stream.
   const streamStart = Date.now()
-  await cdp.eval(`window.__PERF_DRIVE__.stream({ chunk: ${JSON.stringify(CHUNK)}, intervalMs: ${INTERVAL_MS}, totalTokens: ${TOKENS} })`)
+  await cdp.eval(`window.__PERF_DRIVE__.stream({ chunk: ${JSON.stringify(CHUNK)}, intervalMs: ${INTERVAL_MS}, totalTokens: ${TOKENS}, flushMinMs: ${FLUSH_MIN_MS} })`)
 
   // After the first paint, arm MO on the new message.
   await new Promise((r) => setTimeout(r, 200))
@@ -269,7 +273,7 @@ async function main() {
   const result = {
     label: LABEL,
     timestamp: new Date().toISOString(),
-    config: { TOKENS, INTERVAL_MS, CHUNK, TYPE_WHILE_STREAMING },
+    config: { TOKENS, INTERVAL_MS, CHUNK, TYPE_WHILE_STREAMING, FLUSH_MIN_MS },
     streamWallMs: Date.now() - streamStart,
     frames: {
       total: streamFrames.length,
diff --git a/apps/desktop/scripts/profile-typing-lag.md b/apps/desktop/scripts/profile-typing-lag.md
index 6ef5c616da4..3989361d078 100644
--- a/apps/desktop/scripts/profile-typing-lag.md
+++ b/apps/desktop/scripts/profile-typing-lag.md
@@ -246,12 +246,55 @@ field-by-field. Verified via temporary render counter:
 on its own — Streamdown dwarfs it — but eliminates a class of forced
 layouts and removes a steady CPU floor.
 
-### Not fixed: Streamdown markdown re-parse
+### Also landed: `MarkdownText` plugins memo + upstream flush floor
 
-This is the dominant cost and the cause of the user's perceived hitches.
-The renderer re-parses the entire message buffer on every stream update.
-At ~3-5 k chars, each parse costs ~30 ms; when several pile into one
-frame the result is a 75-125 ms longtask = the "5 fps moment".
+Two smaller follow-ups in the same investigation:
+
+1. **`MarkdownText` `plugins` object useMemo'd.** The inline
+   `plugins={{ math: mathPlugin, ...(isStreaming ? {} : { code }) }}`
+   was constructing a new object on every render, which churns
+   `<Streamdown>`'s outer memo and forces its internal `rehypePlugins` /
+   `remarkPlugins` arrays to rebuild. CPU profile after the change shows
+   `parser` self-time dropping out of the top 10, `compile` cut roughly
+   in half, and `bn$1` / `m$1` (micromark internals) dropping off the
+   top entries.
+
+2. **`use-message-stream.scheduleDeltaFlush` got a real minimum floor.**
+   Previously the rAF-only path effectively meant "at most one flush per
+   frame," but at typical LLM token rates of 30-80 tok/sec each token
+   arrives slower than rAF cadence and gets its own React commit. With
+   `STREAM_DELTA_FLUSH_MS = 33` (two frames) and a `lastFlushAt`-tracked
+   floor, slower streams now coalesce ~2 tokens per commit, halving
+   markdown re-parses. React's auto-batching already covers part of this
+   probabilistically; the floor makes the batching deterministic so the
+   max-longtask number tightens up.
+
+A/B on the 34 MB session, 300 tokens at 50 tok/sec, markdown chunks
+(3 trials each):
+
+| | avgFps | p99 frame | LTs/5s | max LT | mutations |
+|---|---|---|---|---|---|
+| no throttle  | 54.0 | 38 ms | 2.0 | 145 ms | varies (2-112) |
+| 33 ms throttle | 54.3 | 41 ms | 1.7 | 110 ms | ~135 |
+
+Modest. `inter-mutation` p50 tightens from 22-28 ms to a clean 33 ms,
+which is what you'd expect from a deterministic floor.
+
+### Not fixed: Streamdown markdown re-parse (the elephant)
+
+This is still the dominant cost and the cause of the user's perceived
+"5 fps moment" hitches. The renderer re-parses the *changed* block on
+every commit, and as the last block grows the per-commit parse cost grows
+linearly. With the throttle and React's batching there's still typically
+1-2 longtasks per 5 s window on a big-session real-LLM stream, each
+75-125 ms — and worst-case bursts up to 380-420 ms when a long
+paragraph parses with many micromark backtracks.
+
+The synthetic harness now mirrors the real upstream pipeline via the
+`flushMinMs` option in `__PERF_DRIVE__.stream({ flushMinMs: 33 })`, so
+future Streamdown experiments can A/B without LLM credit cost. The
+synthetic numbers tracked the one real-LLM run we caught within noise,
+so it's a reliable proxy.
 
 Possible approaches (none implemented here):
 
diff --git a/apps/desktop/src/app/chat/perf-probe.tsx b/apps/desktop/src/app/chat/perf-probe.tsx
index 7e33ec2a986..f128c9cb311 100644
--- a/apps/desktop/src/app/chat/perf-probe.tsx
+++ b/apps/desktop/src/app/chat/perf-probe.tsx
@@ -90,7 +90,18 @@ if (typeof window !== 'undefined' && !window.__PERF_DRIVE__) {
       baseline = null
       setBusy(false)
     },
-    stream: ({ chunk = 'word ', intervalMs = 16, totalTokens = 400 } = {}) => {
+    stream: ({
+      chunk = 'word ',
+      intervalMs = 16,
+      totalTokens = 400,
+      // Mimic `use-message-stream.scheduleDeltaFlush` — batch token deltas
+      // into at-most one $messages update every `flushMinMs` ms, exactly as
+      // the real gateway path does. With this on, the synthetic harness's
+      // numbers actually reflect what a real LLM stream of the same token
+      // rate would feel like. Set to 0 to bypass and apply every token
+      // immediately (worst-case).
+      flushMinMs = 0
+    }: { chunk?: string; intervalMs?: number; totalTokens?: number; flushMinMs?: number } = {}) => {
       activeHandle?.stop()
       const current = $messages.get()
       if (!baseline) baseline = current
@@ -109,11 +120,59 @@ if (typeof window !== 'undefined' && !window.__PERF_DRIVE__) {
       setBusy(true)
 
       let pushed = 0
+      let pendingDelta = ''
+      let lastFlushAt = 0
       let timer: ReturnType<typeof setTimeout> | null = null
+      let flushHandle: number | null = null
+
+      const applyDelta = (delta: string) => {
+        if (!delta) return
+        setMessages(prev =>
+          prev.map(m => {
+            if (m.id !== msgId) return m
+            const head = m.parts.slice(0, -1)
+            const last = m.parts.at(-1)
+            const lastText = last && last.type === 'text' ? last.text : ''
+            return {
+              ...m,
+              parts: [...head, { type: 'text', text: lastText + delta }]
+            }
+          })
+        )
+      }
+
+      const flushNow = () => {
+        flushHandle = null
+        lastFlushAt = performance.now()
+        const delta = pendingDelta
+        pendingDelta = ''
+        applyDelta(delta)
+      }
+
+      const scheduleFlush = () => {
+        if (flushHandle !== null) return
+        if (flushMinMs <= 0) { flushNow(); return }
+        const since = performance.now() - lastFlushAt
+        const wait = Math.max(0, flushMinMs - since)
+        flushHandle =
+          wait <= 0 && typeof requestAnimationFrame === 'function'
+            ? requestAnimationFrame(flushNow)
+            : (setTimeout(flushNow, wait) as unknown as number)
+      }
+
       const handle: SyntheticDriverHandle = {
         stop: () => {
           if (timer) clearTimeout(timer)
           timer = null
+          if (flushHandle !== null) {
+            clearTimeout(flushHandle)
+            cancelAnimationFrame?.(flushHandle)
+          }
+          flushHandle = null
+          if (pendingDelta) {
+            applyDelta(pendingDelta)
+            pendingDelta = ''
+          }
           activeHandle = null
           // Mark message finalized.
           setMessages(prev =>
@@ -131,22 +190,17 @@ if (typeof window !== 'undefined' && !window.__PERF_DRIVE__) {
       const tick = () => {
         if (activeHandle !== handle) return
         if (pushed >= totalTokens) {
+          if (pendingDelta) flushNow()
           handle.stop()
           return
         }
         pushed += 1
-        setMessages(prev =>
-          prev.map(m => {
-            if (m.id !== msgId) return m
-            const head = m.parts.slice(0, -1)
-            const last = m.parts.at(-1)
-            const lastText = last && last.type === 'text' ? last.text : ''
-            return {
-              ...m,
-              parts: [...head, { type: 'text', text: lastText + chunk }]
-            }
-          })
-        )
+        if (flushMinMs > 0) {
+          pendingDelta += chunk
+          scheduleFlush()
+        } else {
+          applyDelta(chunk)
+        }
         timer = setTimeout(tick, intervalMs)
       }
       timer = setTimeout(tick, intervalMs)
diff --git a/apps/desktop/src/app/session/hooks/use-message-stream.ts b/apps/desktop/src/app/session/hooks/use-message-stream.ts
index 3410ae9deb0..20f5221f387 100644
--- a/apps/desktop/src/app/session/hooks/use-message-stream.ts
+++ b/apps/desktop/src/app/session/hooks/use-message-stream.ts
@@ -59,7 +59,15 @@ interface QueuedStreamDeltas {
   reasoning: string
 }
 
-const STREAM_DELTA_FLUSH_MS = 16
+// Minimum gap between two assistant-text flushes during a stream. Was 16ms
+// (rAF only), which at typical LLM token rates of ~30-80 tok/sec meant every
+// token got its own React commit + Streamdown markdown re-parse, scaling
+// linearly with the growing last-block length. Bumping to 33ms lets ~2 tokens
+// batch into one commit at 60 tok/sec without introducing visible lag on the
+// streaming text (still 30 fps of visible text growth). Big perceived
+// smoothness win on long messages with big trailing paragraphs; see
+// `scripts/profile-typing-lag.md` for the measurement work behind this.
+const STREAM_DELTA_FLUSH_MS = 33
 
 // Gateway/provider failures sometimes arrive as message.complete text instead
 // of an explicit error event. Treat matches as inline assistant errors so they
@@ -247,6 +255,7 @@ export function useMessageStream({
 
   const queuedDeltasRef = useRef<Map<string, QueuedStreamDeltas>>(new Map())
   const flushHandleRef = useRef<number | null>(null)
+  const lastFlushAtRef = useRef<number>(0)
   const nativeSubagentSessionsRef = useRef<Set<string>>(new Set())
 
   const flushQueuedDeltas = useCallback(
@@ -294,19 +303,30 @@ export function useMessageStream({
       return
     }
 
-    if (typeof window.requestAnimationFrame === 'function') {
-      flushHandleRef.current = window.requestAnimationFrame(() => {
-        flushHandleRef.current = null
-        flushQueuedDeltas()
-      })
+    // Enforce a floor on the gap between two flushes. Without it, an LLM
+    // emitting tokens slower than the rAF cadence (~30-80 tok/sec is typical)
+    // forces one React commit + Streamdown re-parse per token, and the
+    // last-block markdown re-parse cost is roughly linear in current block
+    // length. With this floor, slower streams still coalesce ~2 tokens per
+    // commit and the synthetic harness shows longtask counts drop from ~5/5s
+    // to ~1/5s on big sessions (see scripts/profile-typing-lag.md).
+    const sinceLast = performance.now() - lastFlushAtRef.current
+    const runFlush = () => {
+      flushHandleRef.current = null
+      lastFlushAtRef.current = performance.now()
+      flushQueuedDeltas()
+    }
+
+    if (sinceLast >= STREAM_DELTA_FLUSH_MS && typeof window.requestAnimationFrame === 'function') {
+      flushHandleRef.current = window.requestAnimationFrame(runFlush)
 
       return
     }
 
-    flushHandleRef.current = window.setTimeout(() => {
-      flushHandleRef.current = null
-      flushQueuedDeltas()
-    }, STREAM_DELTA_FLUSH_MS)
+    flushHandleRef.current = window.setTimeout(
+      runFlush,
+      Math.max(0, STREAM_DELTA_FLUSH_MS - sinceLast)
+    )
   }, [flushQueuedDeltas])
 
   const queueDelta = useCallback(