perf(desktop): rate-limit thread auto-pin during streaming

Follow-up to the Enter-jump fix. The first version did a synchronous re-pin loop inside the on-scroll handler when the browser clamped our `scrollTop = scrollHeight` write short of the new bottom; that gave a tight 4 px visible jump on Enter, but during streaming the ResizeObserver fires many times per second as content grows, and each RO callback re-entered the pin loop. CPU profile showed `Virtualizer.getMaxScrollOffset` climbing to 22 ms self over a typing- during-streaming window — the sync re-pin path was paying tanstack- virtual's recompute cost ~3× per token. Re-architect: - RO callback coalesces to one pin per animation frame. Streaming-rate RO bursts now cost the same as a single per-frame pin. - The on-scroll programmatic-counter guard remains (it's what prevents the false-disarm bug when the browser clamps a write). It no longer does sync re-pins; the next RO/rAF will catch up. - The useLayoutEffect on groupCount (the path that fires on user submit / new turn arrival) ALSO schedules one rAF pin in addition to the synchronous pin. This catches the case where React mounts the new message in a second commit (after our layout effect ran), which grows scrollHeight again. Two pins instead of a tight loop, paid only once per turn change. Net effect on the Cloud Shadows long thread: enter-jump transient: 12–20 px for 1 frame (was 49 px permanent) CPU during stream+type: `getMaxScrollOffset` dropped out of top-5 self-time list typing-during-stream: p50 ~10 ms paint, p99 ~20 ms (1 frame), occasional 40 ms+ outliers during burst token arrivals Also adds scripts/profile-long-stream.mjs: 20-second streaming profile with per-500ms FPS histogram + content-length tracking, so we can see whether streaming render cost grows with message length (it doesn't — sustained 60 fps).
2026-06-09 08:21:50 +00:00 · 2026-05-21 18:02:26 -05:00 · 2026-05-21 18:02:26 -05:00 · e529694919
commit e529694919
parent a7e6a4fc0b
2 changed files with 231 additions and 39 deletions
--- a/apps/desktop/scripts/profile-long-stream.mjs
+++ b/apps/desktop/scripts/profile-long-stream.mjs
@ -0,0 +1,191 @@
+#!/usr/bin/env node
+// Long-running stream profile + frame-rate timeline. Submits a prompt that
+// asks for ~30 paragraphs of output, then captures both a CPU profile and
+// a per-100ms frame counter so we can see if FPS sags as the message grows.
+
+import { writeFileSync } from 'node:fs'
+
+const args = Object.fromEntries(
+  process.argv.slice(2).flatMap(s => {
+    const m = s.match(/^--([^=]+)(?:=(.*))?$/)
+    return m ? [[m[1], m[2] ?? true]] : []
+  })
+)
+const PORT = Number(args.port ?? 9222)
+const OUT = String(args.out ?? `/tmp/hermes-long-stream-${Date.now()}`)
+const STREAM_SEC = Number(args.seconds ?? 25)
+
+async function pickRenderer() {
+  const list = await (await fetch(`http://127.0.0.1:${PORT}/json/list`)).json()
+  return list.find(t => t.type === 'page' && t.url.startsWith('http'))
+}
+
+function connect(url) {
+  return new Promise((resolve, reject) => {
+    const ws = new WebSocket(url)
+    let id = 0
+    const pending = new Map()
+    ws.addEventListener('open', () =>
+      resolve({
+        send(method, params = {}) {
+          const myId = ++id
+          ws.send(JSON.stringify({ id: myId, method, params }))
+          return new Promise((res, rej) => pending.set(myId, { res, rej }))
+        },
+        close: () => ws.close()
+      })
+    )
+    ws.addEventListener('error', reject)
+    ws.addEventListener('message', ev => {
+      const m = JSON.parse(typeof ev.data === 'string' ? ev.data : ev.data.toString('utf8'))
+      if (m.id != null) {
+        const p = pending.get(m.id)
+        if (!p) return
+        pending.delete(m.id)
+        m.error ? p.rej(new Error(m.error.message)) : p.res(m.result)
+      }
+    })
+  })
+}
+
+async function evalP(cdp, expr) {
+  const r = await cdp.send('Runtime.evaluate', { expression: expr, returnByValue: true })
+  if (r.exceptionDetails) throw new Error(r.exceptionDetails.text)
+  return r.result.value
+}
+
+async function main() {
+  const tgt = await pickRenderer()
+  console.log('target', tgt.url)
+  const cdp = await connect(tgt.webSocketDebuggerUrl)
+  await cdp.send('Runtime.enable')
+  await cdp.send('Profiler.enable')
+  await cdp.send('Performance.enable')
+
+  // Submit a long-form prompt
+  await evalP(
+    cdp,
+    `(() => {
+      const el = document.querySelector('[data-slot="composer-rich-input"]')
+      el.focus()
+      const r = document.createRange(); r.selectNodeContents(el); r.collapse(false)
+      window.getSelection().removeAllRanges(); window.getSelection().addRange(r)
+    })()`
+  )
+  const prompt = 'write 15 paragraphs about gpu memory bandwidth, memory hierarchies, roofline model, and how modern transformer inference benefits from these. include diagrams in ascii where relevant. no code. fully detailed.'
+  for (const c of prompt) {
+    await cdp.send('Input.dispatchKeyEvent', { type: 'char', text: c, unmodifiedText: c })
+    await new Promise(r => setTimeout(r, 5))
+  }
+  await new Promise(r => setTimeout(r, 200))
+  await cdp.send('Input.dispatchKeyEvent', {
+    type: 'rawKeyDown', windowsVirtualKeyCode: 13, key: 'Enter', code: 'Enter', text: '\r', unmodifiedText: '\r'
+  })
+  await cdp.send('Input.dispatchKeyEvent', { type: 'keyUp', windowsVirtualKeyCode: 13, key: 'Enter', code: 'Enter' })
+
+  console.log('waiting for assistant…')
+  let streaming = false
+  for (let i = 0; i < 100; i++) {
+    const c = await evalP(cdp, `document.querySelectorAll('[data-slot="aui_assistant-message-root"]').length`)
+    if (c > 0) { streaming = true; break }
+    await new Promise(r => setTimeout(r, 100))
+  }
+  if (!streaming) {
+    console.error('no assistant message')
+    cdp.close()
+    return
+  }
+
+  // Install a per-rAF frame counter
+  await evalP(
+    cdp,
+    `(() => {
+      window.__fpsSamples = []
+      window.__fpsT0 = performance.now()
+      window.__fpsLast = performance.now()
+      window.__fpsFrameCount = 0
+      window.__fpsHistogram = []  // {t, fps, contentLen}
+      const tick = () => {
+        const now = performance.now()
+        const dt = now - window.__fpsLast
+        window.__fpsLast = now
+        window.__fpsFrameCount++
+        window.__fpsSamples.push({ t: now - window.__fpsT0, dt })
+        if (performance.now() - window.__fpsT0 < ${STREAM_SEC * 1000}) {
+          requestAnimationFrame(tick)
+        }
+      }
+      requestAnimationFrame(tick)
+      // Bucket fps every 500ms
+      window.__fpsBucket = setInterval(() => {
+        const now = performance.now()
+        const recentCount = window.__fpsSamples.filter(s => now - window.__fpsT0 - s.t < 500).length
+        const root = document.querySelector('[data-slot="aui_thread-content"]')
+        const len = root ? root.innerText.length : 0
+        const v = document.querySelector('[data-slot="aui_thread-viewport"]')
+        window.__fpsHistogram.push({
+          t: now - window.__fpsT0,
+          frames500ms: recentCount,
+          fps: recentCount * 2,
+          contentLen: len,
+          scrollTop: v?.scrollTop ?? 0,
+          scrollHeight: v?.scrollHeight ?? 0
+        })
+      }, 500)
+    })()`
+  )
+
+  // Start CPU profile
+  await cdp.send('Profiler.setSamplingInterval', { interval: 1000 })
+  await cdp.send('Profiler.start')
+
+  await new Promise(r => setTimeout(r, STREAM_SEC * 1000))
+
+  const { profile } = await cdp.send('Profiler.stop')
+  await evalP(cdp, `clearInterval(window.__fpsBucket)`)
+
+  writeFileSync(`${OUT}.cpuprofile`, JSON.stringify(profile))
+  console.log(`cpu profile → ${OUT}.cpuprofile`)
+
+  // Pull fps histogram
+  const hist = JSON.parse(await evalP(cdp, `JSON.stringify(window.__fpsHistogram || [])`))
+  writeFileSync(`${OUT}.fps.json`, JSON.stringify(hist, null, 2))
+
+  console.log(`\n=== FPS over time ===`)
+  console.log(`  t(s)   fps   contentLen   scrollTop/scrollHeight`)
+  for (const h of hist) {
+    const bar = '█'.repeat(Math.min(40, Math.max(0, Math.round(h.fps / 2))))
+    console.log(`  ${(h.t / 1000).toFixed(1).padStart(5)}  ${String(h.fps).padStart(3)}  ${String(h.contentLen).padStart(10)}   ${h.scrollTop}/${h.scrollHeight}  ${bar}`)
+  }
+
+  // Top self frames
+  const total = (profile.endTime - profile.startTime) / 1000
+  const intMs = total / Math.max(1, profile.samples?.length ?? 1)
+  const counts = new Map()
+  for (const s of profile.samples ?? []) counts.set(s, (counts.get(s) ?? 0) + 1)
+  const rows = profile.nodes
+    .map(n => ({ id: n.id, fn: n.callFrame.functionName || '(anon)', url: n.callFrame.url || '', line: n.callFrame.lineNumber, self: counts.get(n.id) ?? 0 }))
+    .sort((a, b) => b.self - a.self)
+    .slice(0, 25)
+  console.log(`\n=== ${total.toFixed(0)}ms wall, ${profile.samples?.length ?? 0} samples (${intMs.toFixed(2)}ms each) ===`)
+  for (const r of rows) {
+    if (r.self === 0) break
+    const url = r.url.replace(/^.*\/src\//, 'src/').replace(/\?.*$/, '').slice(0, 70)
+    console.log(`  ${(r.self * intMs).toFixed(1).padStart(7)}ms  (${String(r.self).padStart(4)} samp)  ${r.fn.padEnd(45)} ${url}:${r.line}`)
+  }
+
+  await evalP(cdp, `
+    (() => {
+      for (const b of document.querySelectorAll('button')) {
+        if ((b.getAttribute('aria-label') || '').toLowerCase().includes('stop')) { b.click(); return }
+      }
+    })()
+  `)
+
+  cdp.close()
+}
+
+main().catch(e => {
+  console.error('fatal:', e.stack ?? e.message)
+  process.exit(1)
+})
--- a/apps/desktop/src/components/assistant-ui/thread-virtualizer.tsx
+++ b/apps/desktop/src/components/assistant-ui/thread-virtualizer.tsx
@ -195,12 +195,6 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
  const prevSessionKeyRef = useRef(sessionKey)
  const prevGroupCountRef = useRef(0)

-  // Track repins-in-a-row to break runaway loops during rapid layout churn.
-  // In healthy paths this drains to zero between frames; we only need the
-  // ceiling for pathological streaming bursts where content height keeps
-  // growing every frame.
-  const inFlightPinDepthRef = useRef(0)
-
  const pinToBottom = useCallback(() => {
    const el = scrollerRef.current

@ -247,41 +241,20 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
      const top = el.scrollTop

      // If this scroll event is the consequence of `pinToBottom` writing
-      // `el.scrollTop`, treat it as ours: never disarm, just consume the
-      // gate. If we landed short of bottom (because content also grew in
-      // the same frame and the browser clamped our scrollTop = scrollHeight
-      // write to the now-stale scrollHeight - clientHeight), schedule
-      // another pin on the next frame. Without this the post-pin scrollTop
-      // gets misread as the user scrolling up, disarming sticky-bottom
-      // permanently and leaving the just-submitted message below the fold.
+      // `el.scrollTop`, treat it as ours: don't disarm. The RO + rAF pin
+      // loop will re-pin on the next frame if the browser clamped us
+      // short of bottom (because content grew in the same frame).
+      // Without this guard the post-pin scrollTop gets misread as the
+      // user scrolling up, disarming sticky-bottom permanently and
+      // leaving the just-submitted message below the fold.
      if (programmaticScrollPendingRef.current > 0) {
        programmaticScrollPendingRef.current -= 1
        lastTopRef.current = top
-        // Stay armed regardless — sticky-bottom should hold through clamp
-        // races.
+        // Always re-arm — sticky-bottom should hold through clamp races.
        armedRef.current = true
        const atBottom = el.scrollHeight - (top + el.clientHeight) <= AT_BOTTOM_THRESHOLD
        setThreadScrolledUp(!atBottom)

-        if (atBottom) {
-          inFlightPinDepthRef.current = 0
-        } else if (inFlightPinDepthRef.current < 8) {
-          // Re-pin synchronously: the browser already laid out for this
-          // scroll event, so reading scrollHeight now gives us the up-to-date
-          // value and writing scrollTop lands us at the actual bottom in the
-          // same frame. Doing this in a rAF causes a 1-frame visual flicker
-          // (distFromBottom briefly nonzero), so we accept one extra
-          // synchronous pin cycle (which goes back through this very
-          // handler with the counter incremented and arm preserved). The
-          // depth guard prevents pathological runaway loops if content
-          // height keeps growing every frame; 8 is generous for any
-          // realistic rendering pattern.
-          inFlightPinDepthRef.current += 1
-          pinToBottom()
-        } else {
-          inFlightPinDepthRef.current = 0
-        }
-
        return
      }

@ -318,7 +291,11 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
  }, [scrollerRef])

  // Follow content growth (streaming, item measurements, loading indicator)
-  // while armed.
+  // while armed. During fast streaming the ResizeObserver can fire many
+  // times per frame as Streamdown re-tokenizes; coalesce to one pin per
+  // animation frame so we don't run the scroll-event/re-pin chain
+  // (~20+ ms self in `Virtualizer.getMaxScrollOffset`) several times per
+  // token.
  useEffect(() => {
    if (!enabled) {
      return undefined
@ -330,11 +307,21 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
      return undefined
    }

-    const observer = new ResizeObserver(() => {
-      if (armedRef.current) {
-        pinToBottom()
+    let pinRafScheduled = false
+    const schedulePin = () => {
+      if (pinRafScheduled || !armedRef.current) {
+        return
      }
-    })
+      pinRafScheduled = true
+      requestAnimationFrame(() => {
+        pinRafScheduled = false
+        if (armedRef.current) {
+          pinToBottom()
+        }
+      })
+    }
+
+    const observer = new ResizeObserver(schedulePin)

    observer.observe(el)

@ -366,6 +353,15 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
  // mutation but before the browser paints. Without this, there's a ~50ms
  // visual window where the new message sits below the fold while we wait
  // for the ResizeObserver / scroll event chain to fire and re-pin.
+  //
+  // We pin TWICE in this critical path — once synchronously, then once on
+  // the next rAF. The second pin catches the case where React mounts the
+  // new message in the second commit (after our layout effect ran), which
+  // grows scrollHeight again; without the rAF pin the user briefly sees a
+  // ~15 px gap below the new message until the RO catches up. Streaming
+  // tokens use the rate-limited RO path only; only the group-count change
+  // (which fires once per user submit / new turn arrival) pays for the
+  // extra pin.
  const prevGroupCountForLayoutRef = useRef(groupCount)
  useLayoutEffect(() => {
    if (!enabled) {
@ -373,6 +369,11 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
    }
    if (groupCount > prevGroupCountForLayoutRef.current && armedRef.current) {
      pinToBottom()
+      requestAnimationFrame(() => {
+        if (armedRef.current) {
+          pinToBottom()
+        }
+      })
    }
    prevGroupCountForLayoutRef.current = groupCount
  }, [enabled, groupCount, pinToBottom])