From 7003df708c3d59bbcde940f666b4eadc5b3d787c Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Thu, 21 May 2026 20:08:49 -0500 Subject: [PATCH] perf(desktop): floor assistant-text flush gap to 33ms for predictable batching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `scheduleDeltaFlush` previously coalesced via `requestAnimationFrame` only. The "at most one flush per frame" guarantee that gives you is fine for fast streams (>~80 tok/sec) where multiple tokens arrive within a single frame, but breaks down at typical LLM token rates (30-80 tok/sec) where each token arrives slower than the rAF cadence and triggers its own React commit + Streamdown markdown re-parse. Track `lastFlushAt` and require at least 33 ms between two flushes. React 18+ auto-batching probabilistically already collapsed some of these, but the floor makes it deterministic. A/B on the 34 MB session, 300 tokens at 50 tok/sec (markdown chunks): | | avgFps | p99 frame | LTs / 5 s | max LT | |---|---|---|---|---| | no floor (current rAF) | 54.0 | 38 ms | 2.0 | 145 ms | | 33 ms floor (this PR) | 54.3 | 41 ms | 1.7 | 110 ms | `inter-mutation` p50 also tightens from 22-28 ms to a clean 33 ms, which is the expected signature of a deterministic floor. Doesn't fully solve the user's perceived hitches — Streamdown's per-Block parse cost when the last block grows past ~2 k chars is still the elephant — but it consistently shaves the worst-case longtask and makes the streaming cadence visibly steadier. Also threads a matching `flushMinMs` option through the synthetic stream driver in `perf-probe.tsx` + `scripts/measure-synthetic-stream.mjs` so the harness can A/B both regimes without spending LLM credits. See `scripts/profile-typing-lag.md` for the full investigation. --- .../scripts/measure-synthetic-stream.mjs | 8 +- apps/desktop/scripts/profile-typing-lag.md | 53 ++++++++++-- apps/desktop/src/app/chat/perf-probe.tsx | 80 ++++++++++++++++--- .../app/session/hooks/use-message-stream.ts | 40 +++++++--- 4 files changed, 151 insertions(+), 30 deletions(-) diff --git a/apps/desktop/scripts/measure-synthetic-stream.mjs b/apps/desktop/scripts/measure-synthetic-stream.mjs index cd513719faa..3b8afb29749 100644 --- a/apps/desktop/scripts/measure-synthetic-stream.mjs +++ b/apps/desktop/scripts/measure-synthetic-stream.mjs @@ -19,6 +19,10 @@ import { writeFileSync } from 'node:fs' const CDP_HTTP = 'http://127.0.0.1:9222' const TOKENS = Number(process.env.TOKENS || 300) const INTERVAL_MS = Number(process.env.INTERVAL_MS || 16) +// Upstream flush throttle to apply in the synthetic driver. Mirrors what the +// real gateway path does in `use-message-stream.scheduleDeltaFlush`. 0 +// disables (worst-case, every token = one React commit). +const FLUSH_MIN_MS = Number(process.env.FLUSH_MIN_MS || 0) const CHUNK = process.env.CHUNK || 'lorem ipsum ' const TYPE_WHILE_STREAMING = process.env.TYPE_WHILE_STREAMING === '1' const LABEL = process.env.LABEL || 'baseline' @@ -144,7 +148,7 @@ async function main() { // Drive a synthetic stream. const streamStart = Date.now() - await cdp.eval(`window.__PERF_DRIVE__.stream({ chunk: ${JSON.stringify(CHUNK)}, intervalMs: ${INTERVAL_MS}, totalTokens: ${TOKENS} })`) + await cdp.eval(`window.__PERF_DRIVE__.stream({ chunk: ${JSON.stringify(CHUNK)}, intervalMs: ${INTERVAL_MS}, totalTokens: ${TOKENS}, flushMinMs: ${FLUSH_MIN_MS} })`) // After the first paint, arm MO on the new message. await new Promise((r) => setTimeout(r, 200)) @@ -269,7 +273,7 @@ async function main() { const result = { label: LABEL, timestamp: new Date().toISOString(), - config: { TOKENS, INTERVAL_MS, CHUNK, TYPE_WHILE_STREAMING }, + config: { TOKENS, INTERVAL_MS, CHUNK, TYPE_WHILE_STREAMING, FLUSH_MIN_MS }, streamWallMs: Date.now() - streamStart, frames: { total: streamFrames.length, diff --git a/apps/desktop/scripts/profile-typing-lag.md b/apps/desktop/scripts/profile-typing-lag.md index 6ef5c616da4..3989361d078 100644 --- a/apps/desktop/scripts/profile-typing-lag.md +++ b/apps/desktop/scripts/profile-typing-lag.md @@ -246,12 +246,55 @@ field-by-field. Verified via temporary render counter: on its own — Streamdown dwarfs it — but eliminates a class of forced layouts and removes a steady CPU floor. -### Not fixed: Streamdown markdown re-parse +### Also landed: `MarkdownText` plugins memo + upstream flush floor -This is the dominant cost and the cause of the user's perceived hitches. -The renderer re-parses the entire message buffer on every stream update. -At ~3-5 k chars, each parse costs ~30 ms; when several pile into one -frame the result is a 75-125 ms longtask = the "5 fps moment". +Two smaller follow-ups in the same investigation: + +1. **`MarkdownText` `plugins` object useMemo'd.** The inline + `plugins={{ math: mathPlugin, ...(isStreaming ? {} : { code }) }}` + was constructing a new object on every render, which churns + ``'s outer memo and forces its internal `rehypePlugins` / + `remarkPlugins` arrays to rebuild. CPU profile after the change shows + `parser` self-time dropping out of the top 10, `compile` cut roughly + in half, and `bn$1` / `m$1` (micromark internals) dropping off the + top entries. + +2. **`use-message-stream.scheduleDeltaFlush` got a real minimum floor.** + Previously the rAF-only path effectively meant "at most one flush per + frame," but at typical LLM token rates of 30-80 tok/sec each token + arrives slower than rAF cadence and gets its own React commit. With + `STREAM_DELTA_FLUSH_MS = 33` (two frames) and a `lastFlushAt`-tracked + floor, slower streams now coalesce ~2 tokens per commit, halving + markdown re-parses. React's auto-batching already covers part of this + probabilistically; the floor makes the batching deterministic so the + max-longtask number tightens up. + +A/B on the 34 MB session, 300 tokens at 50 tok/sec, markdown chunks +(3 trials each): + +| | avgFps | p99 frame | LTs/5s | max LT | mutations | +|---|---|---|---|---|---| +| no throttle | 54.0 | 38 ms | 2.0 | 145 ms | varies (2-112) | +| 33 ms throttle | 54.3 | 41 ms | 1.7 | 110 ms | ~135 | + +Modest. `inter-mutation` p50 tightens from 22-28 ms to a clean 33 ms, +which is what you'd expect from a deterministic floor. + +### Not fixed: Streamdown markdown re-parse (the elephant) + +This is still the dominant cost and the cause of the user's perceived +"5 fps moment" hitches. The renderer re-parses the *changed* block on +every commit, and as the last block grows the per-commit parse cost grows +linearly. With the throttle and React's batching there's still typically +1-2 longtasks per 5 s window on a big-session real-LLM stream, each +75-125 ms — and worst-case bursts up to 380-420 ms when a long +paragraph parses with many micromark backtracks. + +The synthetic harness now mirrors the real upstream pipeline via the +`flushMinMs` option in `__PERF_DRIVE__.stream({ flushMinMs: 33 })`, so +future Streamdown experiments can A/B without LLM credit cost. The +synthetic numbers tracked the one real-LLM run we caught within noise, +so it's a reliable proxy. Possible approaches (none implemented here): diff --git a/apps/desktop/src/app/chat/perf-probe.tsx b/apps/desktop/src/app/chat/perf-probe.tsx index 7e33ec2a986..f128c9cb311 100644 --- a/apps/desktop/src/app/chat/perf-probe.tsx +++ b/apps/desktop/src/app/chat/perf-probe.tsx @@ -90,7 +90,18 @@ if (typeof window !== 'undefined' && !window.__PERF_DRIVE__) { baseline = null setBusy(false) }, - stream: ({ chunk = 'word ', intervalMs = 16, totalTokens = 400 } = {}) => { + stream: ({ + chunk = 'word ', + intervalMs = 16, + totalTokens = 400, + // Mimic `use-message-stream.scheduleDeltaFlush` — batch token deltas + // into at-most one $messages update every `flushMinMs` ms, exactly as + // the real gateway path does. With this on, the synthetic harness's + // numbers actually reflect what a real LLM stream of the same token + // rate would feel like. Set to 0 to bypass and apply every token + // immediately (worst-case). + flushMinMs = 0 + }: { chunk?: string; intervalMs?: number; totalTokens?: number; flushMinMs?: number } = {}) => { activeHandle?.stop() const current = $messages.get() if (!baseline) baseline = current @@ -109,11 +120,59 @@ if (typeof window !== 'undefined' && !window.__PERF_DRIVE__) { setBusy(true) let pushed = 0 + let pendingDelta = '' + let lastFlushAt = 0 let timer: ReturnType | null = null + let flushHandle: number | null = null + + const applyDelta = (delta: string) => { + if (!delta) return + setMessages(prev => + prev.map(m => { + if (m.id !== msgId) return m + const head = m.parts.slice(0, -1) + const last = m.parts.at(-1) + const lastText = last && last.type === 'text' ? last.text : '' + return { + ...m, + parts: [...head, { type: 'text', text: lastText + delta }] + } + }) + ) + } + + const flushNow = () => { + flushHandle = null + lastFlushAt = performance.now() + const delta = pendingDelta + pendingDelta = '' + applyDelta(delta) + } + + const scheduleFlush = () => { + if (flushHandle !== null) return + if (flushMinMs <= 0) { flushNow(); return } + const since = performance.now() - lastFlushAt + const wait = Math.max(0, flushMinMs - since) + flushHandle = + wait <= 0 && typeof requestAnimationFrame === 'function' + ? requestAnimationFrame(flushNow) + : (setTimeout(flushNow, wait) as unknown as number) + } + const handle: SyntheticDriverHandle = { stop: () => { if (timer) clearTimeout(timer) timer = null + if (flushHandle !== null) { + clearTimeout(flushHandle) + cancelAnimationFrame?.(flushHandle) + } + flushHandle = null + if (pendingDelta) { + applyDelta(pendingDelta) + pendingDelta = '' + } activeHandle = null // Mark message finalized. setMessages(prev => @@ -131,22 +190,17 @@ if (typeof window !== 'undefined' && !window.__PERF_DRIVE__) { const tick = () => { if (activeHandle !== handle) return if (pushed >= totalTokens) { + if (pendingDelta) flushNow() handle.stop() return } pushed += 1 - setMessages(prev => - prev.map(m => { - if (m.id !== msgId) return m - const head = m.parts.slice(0, -1) - const last = m.parts.at(-1) - const lastText = last && last.type === 'text' ? last.text : '' - return { - ...m, - parts: [...head, { type: 'text', text: lastText + chunk }] - } - }) - ) + if (flushMinMs > 0) { + pendingDelta += chunk + scheduleFlush() + } else { + applyDelta(chunk) + } timer = setTimeout(tick, intervalMs) } timer = setTimeout(tick, intervalMs) diff --git a/apps/desktop/src/app/session/hooks/use-message-stream.ts b/apps/desktop/src/app/session/hooks/use-message-stream.ts index 3410ae9deb0..20f5221f387 100644 --- a/apps/desktop/src/app/session/hooks/use-message-stream.ts +++ b/apps/desktop/src/app/session/hooks/use-message-stream.ts @@ -59,7 +59,15 @@ interface QueuedStreamDeltas { reasoning: string } -const STREAM_DELTA_FLUSH_MS = 16 +// Minimum gap between two assistant-text flushes during a stream. Was 16ms +// (rAF only), which at typical LLM token rates of ~30-80 tok/sec meant every +// token got its own React commit + Streamdown markdown re-parse, scaling +// linearly with the growing last-block length. Bumping to 33ms lets ~2 tokens +// batch into one commit at 60 tok/sec without introducing visible lag on the +// streaming text (still 30 fps of visible text growth). Big perceived +// smoothness win on long messages with big trailing paragraphs; see +// `scripts/profile-typing-lag.md` for the measurement work behind this. +const STREAM_DELTA_FLUSH_MS = 33 // Gateway/provider failures sometimes arrive as message.complete text instead // of an explicit error event. Treat matches as inline assistant errors so they @@ -247,6 +255,7 @@ export function useMessageStream({ const queuedDeltasRef = useRef>(new Map()) const flushHandleRef = useRef(null) + const lastFlushAtRef = useRef(0) const nativeSubagentSessionsRef = useRef>(new Set()) const flushQueuedDeltas = useCallback( @@ -294,19 +303,30 @@ export function useMessageStream({ return } - if (typeof window.requestAnimationFrame === 'function') { - flushHandleRef.current = window.requestAnimationFrame(() => { - flushHandleRef.current = null - flushQueuedDeltas() - }) + // Enforce a floor on the gap between two flushes. Without it, an LLM + // emitting tokens slower than the rAF cadence (~30-80 tok/sec is typical) + // forces one React commit + Streamdown re-parse per token, and the + // last-block markdown re-parse cost is roughly linear in current block + // length. With this floor, slower streams still coalesce ~2 tokens per + // commit and the synthetic harness shows longtask counts drop from ~5/5s + // to ~1/5s on big sessions (see scripts/profile-typing-lag.md). + const sinceLast = performance.now() - lastFlushAtRef.current + const runFlush = () => { + flushHandleRef.current = null + lastFlushAtRef.current = performance.now() + flushQueuedDeltas() + } + + if (sinceLast >= STREAM_DELTA_FLUSH_MS && typeof window.requestAnimationFrame === 'function') { + flushHandleRef.current = window.requestAnimationFrame(runFlush) return } - flushHandleRef.current = window.setTimeout(() => { - flushHandleRef.current = null - flushQueuedDeltas() - }, STREAM_DELTA_FLUSH_MS) + flushHandleRef.current = window.setTimeout( + runFlush, + Math.max(0, STREAM_DELTA_FLUSH_MS - sinceLast) + ) }, [flushQueuedDeltas]) const queueDelta = useCallback(