perf(desktop): rate-limit thread auto-pin during streaming

Follow-up to the Enter-jump fix. The first version did a synchronous
re-pin loop inside the on-scroll handler when the browser clamped our
`scrollTop = scrollHeight` write short of the new bottom; that gave a
tight 4 px visible jump on Enter, but during streaming the
ResizeObserver fires many times per second as content grows, and each
RO callback re-entered the pin loop. CPU profile showed
`Virtualizer.getMaxScrollOffset` climbing to 22 ms self over a typing-
during-streaming window — the sync re-pin path was paying tanstack-
virtual's recompute cost ~3× per token.

Re-architect:

- RO callback coalesces to one pin per animation frame. Streaming-rate
  RO bursts now cost the same as a single per-frame pin.
- The on-scroll programmatic-counter guard remains (it's what prevents
  the false-disarm bug when the browser clamps a write). It no longer
  does sync re-pins; the next RO/rAF will catch up.
- The useLayoutEffect on groupCount (the path that fires on user
  submit / new turn arrival) ALSO schedules one rAF pin in addition to
  the synchronous pin. This catches the case where React mounts the
  new message in a second commit (after our layout effect ran), which
  grows scrollHeight again. Two pins instead of a tight loop, paid only
  once per turn change.

Net effect on the Cloud Shadows long thread:

  enter-jump transient:   12–20 px for 1 frame (was 49 px permanent)
  CPU during stream+type: `getMaxScrollOffset` dropped out of top-5
                          self-time list
  typing-during-stream:   p50 ~10 ms paint, p99 ~20 ms (1 frame),
                          occasional 40 ms+ outliers during burst
                          token arrivals

Also adds scripts/profile-long-stream.mjs: 20-second streaming profile
with per-500ms FPS histogram + content-length tracking, so we can see
whether streaming render cost grows with message length (it doesn't —
sustained 60 fps).
This commit is contained in:
Brooklyn Nicholson 2026-05-21 18:02:26 -05:00
parent a7e6a4fc0b
commit e529694919
2 changed files with 231 additions and 39 deletions

View file

@ -0,0 +1,191 @@
#!/usr/bin/env node
// Long-running stream profile + frame-rate timeline. Submits a prompt that
// asks for ~30 paragraphs of output, then captures both a CPU profile and
// a per-100ms frame counter so we can see if FPS sags as the message grows.
import { writeFileSync } from 'node:fs'
const args = Object.fromEntries(
process.argv.slice(2).flatMap(s => {
const m = s.match(/^--([^=]+)(?:=(.*))?$/)
return m ? [[m[1], m[2] ?? true]] : []
})
)
const PORT = Number(args.port ?? 9222)
const OUT = String(args.out ?? `/tmp/hermes-long-stream-${Date.now()}`)
const STREAM_SEC = Number(args.seconds ?? 25)
async function pickRenderer() {
const list = await (await fetch(`http://127.0.0.1:${PORT}/json/list`)).json()
return list.find(t => t.type === 'page' && t.url.startsWith('http'))
}
function connect(url) {
return new Promise((resolve, reject) => {
const ws = new WebSocket(url)
let id = 0
const pending = new Map()
ws.addEventListener('open', () =>
resolve({
send(method, params = {}) {
const myId = ++id
ws.send(JSON.stringify({ id: myId, method, params }))
return new Promise((res, rej) => pending.set(myId, { res, rej }))
},
close: () => ws.close()
})
)
ws.addEventListener('error', reject)
ws.addEventListener('message', ev => {
const m = JSON.parse(typeof ev.data === 'string' ? ev.data : ev.data.toString('utf8'))
if (m.id != null) {
const p = pending.get(m.id)
if (!p) return
pending.delete(m.id)
m.error ? p.rej(new Error(m.error.message)) : p.res(m.result)
}
})
})
}
async function evalP(cdp, expr) {
const r = await cdp.send('Runtime.evaluate', { expression: expr, returnByValue: true })
if (r.exceptionDetails) throw new Error(r.exceptionDetails.text)
return r.result.value
}
async function main() {
const tgt = await pickRenderer()
console.log('target', tgt.url)
const cdp = await connect(tgt.webSocketDebuggerUrl)
await cdp.send('Runtime.enable')
await cdp.send('Profiler.enable')
await cdp.send('Performance.enable')
// Submit a long-form prompt
await evalP(
cdp,
`(() => {
const el = document.querySelector('[data-slot="composer-rich-input"]')
el.focus()
const r = document.createRange(); r.selectNodeContents(el); r.collapse(false)
window.getSelection().removeAllRanges(); window.getSelection().addRange(r)
})()`
)
const prompt = 'write 15 paragraphs about gpu memory bandwidth, memory hierarchies, roofline model, and how modern transformer inference benefits from these. include diagrams in ascii where relevant. no code. fully detailed.'
for (const c of prompt) {
await cdp.send('Input.dispatchKeyEvent', { type: 'char', text: c, unmodifiedText: c })
await new Promise(r => setTimeout(r, 5))
}
await new Promise(r => setTimeout(r, 200))
await cdp.send('Input.dispatchKeyEvent', {
type: 'rawKeyDown', windowsVirtualKeyCode: 13, key: 'Enter', code: 'Enter', text: '\r', unmodifiedText: '\r'
})
await cdp.send('Input.dispatchKeyEvent', { type: 'keyUp', windowsVirtualKeyCode: 13, key: 'Enter', code: 'Enter' })
console.log('waiting for assistant…')
let streaming = false
for (let i = 0; i < 100; i++) {
const c = await evalP(cdp, `document.querySelectorAll('[data-slot="aui_assistant-message-root"]').length`)
if (c > 0) { streaming = true; break }
await new Promise(r => setTimeout(r, 100))
}
if (!streaming) {
console.error('no assistant message')
cdp.close()
return
}
// Install a per-rAF frame counter
await evalP(
cdp,
`(() => {
window.__fpsSamples = []
window.__fpsT0 = performance.now()
window.__fpsLast = performance.now()
window.__fpsFrameCount = 0
window.__fpsHistogram = [] // {t, fps, contentLen}
const tick = () => {
const now = performance.now()
const dt = now - window.__fpsLast
window.__fpsLast = now
window.__fpsFrameCount++
window.__fpsSamples.push({ t: now - window.__fpsT0, dt })
if (performance.now() - window.__fpsT0 < ${STREAM_SEC * 1000}) {
requestAnimationFrame(tick)
}
}
requestAnimationFrame(tick)
// Bucket fps every 500ms
window.__fpsBucket = setInterval(() => {
const now = performance.now()
const recentCount = window.__fpsSamples.filter(s => now - window.__fpsT0 - s.t < 500).length
const root = document.querySelector('[data-slot="aui_thread-content"]')
const len = root ? root.innerText.length : 0
const v = document.querySelector('[data-slot="aui_thread-viewport"]')
window.__fpsHistogram.push({
t: now - window.__fpsT0,
frames500ms: recentCount,
fps: recentCount * 2,
contentLen: len,
scrollTop: v?.scrollTop ?? 0,
scrollHeight: v?.scrollHeight ?? 0
})
}, 500)
})()`
)
// Start CPU profile
await cdp.send('Profiler.setSamplingInterval', { interval: 1000 })
await cdp.send('Profiler.start')
await new Promise(r => setTimeout(r, STREAM_SEC * 1000))
const { profile } = await cdp.send('Profiler.stop')
await evalP(cdp, `clearInterval(window.__fpsBucket)`)
writeFileSync(`${OUT}.cpuprofile`, JSON.stringify(profile))
console.log(`cpu profile → ${OUT}.cpuprofile`)
// Pull fps histogram
const hist = JSON.parse(await evalP(cdp, `JSON.stringify(window.__fpsHistogram || [])`))
writeFileSync(`${OUT}.fps.json`, JSON.stringify(hist, null, 2))
console.log(`\n=== FPS over time ===`)
console.log(` t(s) fps contentLen scrollTop/scrollHeight`)
for (const h of hist) {
const bar = '█'.repeat(Math.min(40, Math.max(0, Math.round(h.fps / 2))))
console.log(` ${(h.t / 1000).toFixed(1).padStart(5)} ${String(h.fps).padStart(3)} ${String(h.contentLen).padStart(10)} ${h.scrollTop}/${h.scrollHeight} ${bar}`)
}
// Top self frames
const total = (profile.endTime - profile.startTime) / 1000
const intMs = total / Math.max(1, profile.samples?.length ?? 1)
const counts = new Map()
for (const s of profile.samples ?? []) counts.set(s, (counts.get(s) ?? 0) + 1)
const rows = profile.nodes
.map(n => ({ id: n.id, fn: n.callFrame.functionName || '(anon)', url: n.callFrame.url || '', line: n.callFrame.lineNumber, self: counts.get(n.id) ?? 0 }))
.sort((a, b) => b.self - a.self)
.slice(0, 25)
console.log(`\n=== ${total.toFixed(0)}ms wall, ${profile.samples?.length ?? 0} samples (${intMs.toFixed(2)}ms each) ===`)
for (const r of rows) {
if (r.self === 0) break
const url = r.url.replace(/^.*\/src\//, 'src/').replace(/\?.*$/, '').slice(0, 70)
console.log(` ${(r.self * intMs).toFixed(1).padStart(7)}ms (${String(r.self).padStart(4)} samp) ${r.fn.padEnd(45)} ${url}:${r.line}`)
}
await evalP(cdp, `
(() => {
for (const b of document.querySelectorAll('button')) {
if ((b.getAttribute('aria-label') || '').toLowerCase().includes('stop')) { b.click(); return }
}
})()
`)
cdp.close()
}
main().catch(e => {
console.error('fatal:', e.stack ?? e.message)
process.exit(1)
})

View file

@ -195,12 +195,6 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
const prevSessionKeyRef = useRef(sessionKey)
const prevGroupCountRef = useRef(0)
// Track repins-in-a-row to break runaway loops during rapid layout churn.
// In healthy paths this drains to zero between frames; we only need the
// ceiling for pathological streaming bursts where content height keeps
// growing every frame.
const inFlightPinDepthRef = useRef(0)
const pinToBottom = useCallback(() => {
const el = scrollerRef.current
@ -247,41 +241,20 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
const top = el.scrollTop
// If this scroll event is the consequence of `pinToBottom` writing
// `el.scrollTop`, treat it as ours: never disarm, just consume the
// gate. If we landed short of bottom (because content also grew in
// the same frame and the browser clamped our scrollTop = scrollHeight
// write to the now-stale scrollHeight - clientHeight), schedule
// another pin on the next frame. Without this the post-pin scrollTop
// gets misread as the user scrolling up, disarming sticky-bottom
// permanently and leaving the just-submitted message below the fold.
// `el.scrollTop`, treat it as ours: don't disarm. The RO + rAF pin
// loop will re-pin on the next frame if the browser clamped us
// short of bottom (because content grew in the same frame).
// Without this guard the post-pin scrollTop gets misread as the
// user scrolling up, disarming sticky-bottom permanently and
// leaving the just-submitted message below the fold.
if (programmaticScrollPendingRef.current > 0) {
programmaticScrollPendingRef.current -= 1
lastTopRef.current = top
// Stay armed regardless — sticky-bottom should hold through clamp
// races.
// Always re-arm — sticky-bottom should hold through clamp races.
armedRef.current = true
const atBottom = el.scrollHeight - (top + el.clientHeight) <= AT_BOTTOM_THRESHOLD
setThreadScrolledUp(!atBottom)
if (atBottom) {
inFlightPinDepthRef.current = 0
} else if (inFlightPinDepthRef.current < 8) {
// Re-pin synchronously: the browser already laid out for this
// scroll event, so reading scrollHeight now gives us the up-to-date
// value and writing scrollTop lands us at the actual bottom in the
// same frame. Doing this in a rAF causes a 1-frame visual flicker
// (distFromBottom briefly nonzero), so we accept one extra
// synchronous pin cycle (which goes back through this very
// handler with the counter incremented and arm preserved). The
// depth guard prevents pathological runaway loops if content
// height keeps growing every frame; 8 is generous for any
// realistic rendering pattern.
inFlightPinDepthRef.current += 1
pinToBottom()
} else {
inFlightPinDepthRef.current = 0
}
return
}
@ -318,7 +291,11 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
}, [scrollerRef])
// Follow content growth (streaming, item measurements, loading indicator)
// while armed.
// while armed. During fast streaming the ResizeObserver can fire many
// times per frame as Streamdown re-tokenizes; coalesce to one pin per
// animation frame so we don't run the scroll-event/re-pin chain
// (~20+ ms self in `Virtualizer.getMaxScrollOffset`) several times per
// token.
useEffect(() => {
if (!enabled) {
return undefined
@ -330,11 +307,21 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
return undefined
}
const observer = new ResizeObserver(() => {
if (armedRef.current) {
pinToBottom()
let pinRafScheduled = false
const schedulePin = () => {
if (pinRafScheduled || !armedRef.current) {
return
}
})
pinRafScheduled = true
requestAnimationFrame(() => {
pinRafScheduled = false
if (armedRef.current) {
pinToBottom()
}
})
}
const observer = new ResizeObserver(schedulePin)
observer.observe(el)
@ -366,6 +353,15 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
// mutation but before the browser paints. Without this, there's a ~50ms
// visual window where the new message sits below the fold while we wait
// for the ResizeObserver / scroll event chain to fire and re-pin.
//
// We pin TWICE in this critical path — once synchronously, then once on
// the next rAF. The second pin catches the case where React mounts the
// new message in the second commit (after our layout effect ran), which
// grows scrollHeight again; without the rAF pin the user briefly sees a
// ~15 px gap below the new message until the RO catches up. Streaming
// tokens use the rate-limited RO path only; only the group-count change
// (which fires once per user submit / new turn arrival) pays for the
// extra pin.
const prevGroupCountForLayoutRef = useRef(groupCount)
useLayoutEffect(() => {
if (!enabled) {
@ -373,6 +369,11 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
}
if (groupCount > prevGroupCountForLayoutRef.current && armedRef.current) {
pinToBottom()
requestAnimationFrame(() => {
if (armedRef.current) {
pinToBottom()
}
})
}
prevGroupCountForLayoutRef.current = groupCount
}, [enabled, groupCount, pinToBottom])