mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
perf(desktop): rate-limit thread auto-pin during streaming
Follow-up to the Enter-jump fix. The first version did a synchronous
re-pin loop inside the on-scroll handler when the browser clamped our
`scrollTop = scrollHeight` write short of the new bottom; that gave a
tight 4 px visible jump on Enter, but during streaming the
ResizeObserver fires many times per second as content grows, and each
RO callback re-entered the pin loop. CPU profile showed
`Virtualizer.getMaxScrollOffset` climbing to 22 ms self over a typing-
during-streaming window — the sync re-pin path was paying tanstack-
virtual's recompute cost ~3× per token.
Re-architect:
- RO callback coalesces to one pin per animation frame. Streaming-rate
RO bursts now cost the same as a single per-frame pin.
- The on-scroll programmatic-counter guard remains (it's what prevents
the false-disarm bug when the browser clamps a write). It no longer
does sync re-pins; the next RO/rAF will catch up.
- The useLayoutEffect on groupCount (the path that fires on user
submit / new turn arrival) ALSO schedules one rAF pin in addition to
the synchronous pin. This catches the case where React mounts the
new message in a second commit (after our layout effect ran), which
grows scrollHeight again. Two pins instead of a tight loop, paid only
once per turn change.
Net effect on the Cloud Shadows long thread:
enter-jump transient: 12–20 px for 1 frame (was 49 px permanent)
CPU during stream+type: `getMaxScrollOffset` dropped out of top-5
self-time list
typing-during-stream: p50 ~10 ms paint, p99 ~20 ms (1 frame),
occasional 40 ms+ outliers during burst
token arrivals
Also adds scripts/profile-long-stream.mjs: a streaming profile (25 s by default, `--seconds=N` to override)
with per-500ms FPS histogram + content-length tracking, so we can see
whether streaming render cost grows with message length (it doesn't —
sustained 60 fps).
This commit is contained in:
parent
a7e6a4fc0b
commit
e529694919
2 changed files with 231 additions and 39 deletions
191
apps/desktop/scripts/profile-long-stream.mjs
Normal file
191
apps/desktop/scripts/profile-long-stream.mjs
Normal file
|
|
@ -0,0 +1,191 @@
|
|||
#!/usr/bin/env node
|
||||
// Long-running stream profile + frame-rate timeline. Submits a prompt that
|
||||
// asks for 15 paragraphs of output, then captures both a CPU profile and
|
||||
// a per-500ms FPS histogram so we can see if FPS sags as the message grows.
|
||||
|
||||
import { writeFileSync } from 'node:fs'
|
||||
|
||||
// Parse `--name=value` / `--name` flags from argv into a plain object.
// A bare `--name` maps to `true`; arguments that do not start with `--`
// are ignored.
const args = Object.fromEntries(
  process.argv.slice(2).flatMap(s => {
    const m = s.match(/^--([^=]+)(?:=(.*))?$/)
    return m ? [[m[1], m[2] ?? true]] : []
  })
)

/**
 * Read a numeric flag from `args`.
 *
 * @param {string} name - flag name without the leading `--`
 * @param {number} fallback - value used when the flag is absent
 * @returns {number} the parsed positive, finite number
 * @throws {Error} when the flag is present but bare (`--seconds`) or not a
 *   positive finite number. Without this check `Number(true)` silently
 *   yields 1 and a typo yields NaN, which then flows into URLs and timers.
 */
function numericArg(name, fallback) {
  const raw = args[name]
  if (raw === undefined) {
    return fallback
  }
  const n = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : NaN
  if (!Number.isFinite(n) || n <= 0) {
    throw new Error(`--${name} expects a positive number, got ${JSON.stringify(raw)}`)
  }
  return n
}

// Remote-debugging port of the running desktop app.
const PORT = numericArg('port', 9222)

// Output path prefix; `.cpuprofile` and `.fps.json` are appended to it.
if (args.out === true) {
  throw new Error('--out expects a path prefix, e.g. --out=/tmp/profile')
}
const OUT = String(args.out ?? `/tmp/hermes-long-stream-${Date.now()}`)

// How long to sample the stream, in seconds.
const STREAM_SEC = numericArg('seconds', 25)
|
||||
|
||||
/**
 * Find the app's renderer page among the DevTools targets exposed on
 * `http://127.0.0.1:<port>/json/list`.
 *
 * @param {number} [port=PORT] - remote-debugging port to query
 * @returns {Promise<object>} the first target of type `page` whose URL
 *   starts with `http`
 * @throws {Error} when the endpoint answers with a non-2xx status or no
 *   matching target exists (previously this returned `undefined` and the
 *   caller crashed with an opaque TypeError)
 */
async function pickRenderer(port = PORT) {
  const res = await fetch(`http://127.0.0.1:${port}/json/list`)
  if (!res.ok) {
    throw new Error(`DevTools endpoint on port ${port} answered HTTP ${res.status}`)
  }
  const list = await res.json()
  const target = list.find(t => t.type === 'page' && typeof t.url === 'string' && t.url.startsWith('http'))
  if (!target) {
    throw new Error(
      `no debuggable http(s) page target on port ${port} — is the app running with --remote-debugging-port=${port}?`
    )
  }
  return target
}
|
||||
|
||||
/**
 * Open a Chrome DevTools Protocol session over WebSocket.
 *
 * @param {string} url - the target's `webSocketDebuggerUrl`
 * @returns {Promise<{send: Function, close: Function}>} resolves once the
 *   socket is open. `send(method, params)` returns a promise for the CDP
 *   result (rejecting with the CDP error message); `close()` closes the
 *   socket.
 *
 * Failure handling: a socket error or close rejects the connect promise (if
 * still pending) and every in-flight `send()` with a real `Error`, so callers
 * never hang on a dead socket and never receive a bare Event object.
 */
function connect(url) {
  return new Promise((resolve, reject) => {
    const ws = new WebSocket(url)
    let id = 0
    let closed = false
    // id -> { res, rej } for requests awaiting a response.
    const pending = new Map()

    const failAll = err => {
      closed = true
      for (const { rej } of pending.values()) {
        rej(err)
      }
      pending.clear()
      // No-op if the connect promise already resolved.
      reject(err)
    }

    ws.addEventListener('open', () =>
      resolve({
        send(method, params = {}) {
          if (closed) {
            return Promise.reject(new Error(`CDP WebSocket closed; cannot send ${method}`))
          }
          const myId = ++id
          ws.send(JSON.stringify({ id: myId, method, params }))
          return new Promise((res, rej) => pending.set(myId, { res, rej }))
        },
        close: () => ws.close()
      })
    )

    ws.addEventListener('error', ev => {
      const detail = ev?.message || ev?.error?.message || 'connection failed'
      failAll(new Error(`CDP WebSocket error: ${detail}`))
    })

    ws.addEventListener('close', () => failAll(new Error('CDP WebSocket closed')))

    ws.addEventListener('message', ev => {
      const m = JSON.parse(typeof ev.data === 'string' ? ev.data : ev.data.toString('utf8'))
      // Messages without an id are CDP events; this client ignores them.
      if (m.id == null) {
        return
      }
      const p = pending.get(m.id)
      if (!p) {
        return
      }
      pending.delete(m.id)
      if (m.error) {
        p.rej(new Error(m.error.message))
      } else {
        p.res(m.result)
      }
    })
  })
}
|
||||
|
||||
/**
 * Evaluate a JavaScript expression in the page via `Runtime.evaluate` and
 * return its value (serialized with `returnByValue`).
 *
 * @param {{send: Function}} cdp - session object exposing `send(method, params)`
 * @param {string} expr - expression source to evaluate in the page
 * @returns {Promise<*>} the expression's value
 * @throws {Error} when the page-side evaluation throws. The message prefers
 *   the thrown exception's description; `exceptionDetails.text` alone is
 *   typically just "Uncaught", which hides the real cause.
 */
async function evalP(cdp, expr) {
  const r = await cdp.send('Runtime.evaluate', { expression: expr, returnByValue: true })
  if (r.exceptionDetails) {
    const { text, exception } = r.exceptionDetails
    throw new Error(exception?.description ?? text)
  }
  return r.result.value
}
|
||||
|
||||
/**
 * Drive one profiling run end to end:
 *   1. attach to the renderer over CDP,
 *   2. type a long-form prompt into the composer and press Enter,
 *   3. wait for an assistant message element to exist,
 *   4. install an in-page rAF frame counter plus a 500 ms FPS bucketer,
 *   5. record a CPU profile for STREAM_SEC seconds,
 *   6. write `${OUT}.cpuprofile` and `${OUT}.fps.json`, print an FPS table
 *      and the top self-time frames,
 *   7. click the first button whose aria-label contains "stop".
 *
 * The CDP socket is closed on every exit path. If no assistant message
 * appears, the process exit code is set to 1.
 */
async function main() {
  const sleep = ms => new Promise(r => setTimeout(r, ms))

  const tgt = await pickRenderer()
  if (!tgt) {
    throw new Error(`no debuggable page target found on port ${PORT}`)
  }
  console.log('target', tgt.url)
  const cdp = await connect(tgt.webSocketDebuggerUrl)

  try {
    await cdp.send('Runtime.enable')
    await cdp.send('Profiler.enable')
    await cdp.send('Performance.enable')

    // Focus the composer and move the caret to the end of its content.
    const focused = await evalP(
      cdp,
      `(() => {
        const el = document.querySelector('[data-slot="composer-rich-input"]')
        if (!el) return false
        el.focus()
        const r = document.createRange(); r.selectNodeContents(el); r.collapse(false)
        window.getSelection().removeAllRanges(); window.getSelection().addRange(r)
        return true
      })()`
    )
    if (!focused) {
      throw new Error('composer input [data-slot="composer-rich-input"] not found')
    }

    // Submit a long-form prompt, typed one character at a time.
    const prompt = 'write 15 paragraphs about gpu memory bandwidth, memory hierarchies, roofline model, and how modern transformer inference benefits from these. include diagrams in ascii where relevant. no code. fully detailed.'
    for (const c of prompt) {
      await cdp.send('Input.dispatchKeyEvent', { type: 'char', text: c, unmodifiedText: c })
      await sleep(5)
    }
    await sleep(200)
    await cdp.send('Input.dispatchKeyEvent', {
      type: 'rawKeyDown', windowsVirtualKeyCode: 13, key: 'Enter', code: 'Enter', text: '\r', unmodifiedText: '\r'
    })
    await cdp.send('Input.dispatchKeyEvent', { type: 'keyUp', windowsVirtualKeyCode: 13, key: 'Enter', code: 'Enter' })

    // Poll up to 100 × 100 ms for an assistant message element.
    // NOTE(review): this checks for ANY assistant message root, so in a
    // thread that already contains assistant messages it passes on the first
    // poll rather than waiting for the new turn — confirm that is intended.
    console.log('waiting for assistant…')
    let streaming = false
    for (let i = 0; i < 100; i++) {
      const c = await evalP(cdp, `document.querySelectorAll('[data-slot="aui_assistant-message-root"]').length`)
      if (c > 0) {
        streaming = true
        break
      }
      await sleep(100)
    }
    if (!streaming) {
      console.error('no assistant message')
      // Signal failure to the shell; the finally block closes the socket.
      process.exitCode = 1
      return
    }

    // Install a per-rAF frame counter and a 500 ms FPS bucketer in the page.
    await evalP(
      cdp,
      `(() => {
        window.__fpsSamples = []
        window.__fpsT0 = performance.now()
        window.__fpsLast = performance.now()
        window.__fpsFrameCount = 0
        window.__fpsHistogram = [] // {t, fps, contentLen}
        const tick = () => {
          const now = performance.now()
          const dt = now - window.__fpsLast
          window.__fpsLast = now
          window.__fpsFrameCount++
          window.__fpsSamples.push({ t: now - window.__fpsT0, dt })
          if (performance.now() - window.__fpsT0 < ${STREAM_SEC * 1000}) {
            requestAnimationFrame(tick)
          }
        }
        requestAnimationFrame(tick)
        // Bucket fps every 500ms
        window.__fpsBucket = setInterval(() => {
          const now = performance.now()
          const recentCount = window.__fpsSamples.filter(s => now - window.__fpsT0 - s.t < 500).length
          const root = document.querySelector('[data-slot="aui_thread-content"]')
          const len = root ? root.innerText.length : 0
          const v = document.querySelector('[data-slot="aui_thread-viewport"]')
          window.__fpsHistogram.push({
            t: now - window.__fpsT0,
            frames500ms: recentCount,
            fps: recentCount * 2,
            contentLen: len,
            scrollTop: v?.scrollTop ?? 0,
            scrollHeight: v?.scrollHeight ?? 0
          })
        }, 500)
      })()`
    )

    // Record the CPU profile for the streaming window.
    let profile
    try {
      await cdp.send('Profiler.setSamplingInterval', { interval: 1000 })
      await cdp.send('Profiler.start')
      await sleep(STREAM_SEC * 1000)
      const stopped = await cdp.send('Profiler.stop')
      profile = stopped.profile
    } finally {
      // Best-effort: stop the in-page bucketer even when profiling failed.
      // A failure here is deliberately ignored so it cannot mask the
      // original error.
      await evalP(cdp, `clearInterval(window.__fpsBucket)`).catch(() => {})
    }

    writeFileSync(`${OUT}.cpuprofile`, JSON.stringify(profile))
    console.log(`cpu profile → ${OUT}.cpuprofile`)

    // Pull fps histogram
    const hist = JSON.parse(await evalP(cdp, `JSON.stringify(window.__fpsHistogram || [])`))
    writeFileSync(`${OUT}.fps.json`, JSON.stringify(hist, null, 2))

    console.log(`\n=== FPS over time ===`)
    console.log(` t(s) fps contentLen scrollTop/scrollHeight`)
    for (const h of hist) {
      // One bar cell per 2 fps, capped at 40 cells.
      const bar = '█'.repeat(Math.min(40, Math.max(0, Math.round(h.fps / 2))))
      console.log(` ${(h.t / 1000).toFixed(1).padStart(5)} ${String(h.fps).padStart(3)} ${String(h.contentLen).padStart(10)} ${h.scrollTop}/${h.scrollHeight} ${bar}`)
    }

    // Top self frames. Self time is approximated as sample count times the
    // average interval between samples (wall time / number of samples).
    const total = (profile.endTime - profile.startTime) / 1000
    const intMs = total / Math.max(1, profile.samples?.length ?? 1)
    const counts = new Map()
    for (const s of profile.samples ?? []) {
      counts.set(s, (counts.get(s) ?? 0) + 1)
    }
    const rows = profile.nodes
      .map(n => ({ id: n.id, fn: n.callFrame.functionName || '(anon)', url: n.callFrame.url || '', line: n.callFrame.lineNumber, self: counts.get(n.id) ?? 0 }))
      .sort((a, b) => b.self - a.self)
      .slice(0, 25)
    console.log(`\n=== ${total.toFixed(0)}ms wall, ${profile.samples?.length ?? 0} samples (${intMs.toFixed(2)}ms each) ===`)
    for (const r of rows) {
      if (r.self === 0) {
        break
      }
      const url = r.url.replace(/^.*\/src\//, 'src/').replace(/\?.*$/, '').slice(0, 70)
      console.log(` ${(r.self * intMs).toFixed(1).padStart(7)}ms (${String(r.self).padStart(4)} samp) ${r.fn.padEnd(45)} ${url}:${r.line}`)
    }

    // Click the first button whose aria-label mentions "stop".
    await evalP(cdp, `
      (() => {
        for (const b of document.querySelectorAll('button')) {
          if ((b.getAttribute('aria-label') || '').toLowerCase().includes('stop')) { b.click(); return }
        }
      })()
    `)
  } finally {
    cdp.close()
  }
}
|
||||
|
||||
// Entry point: run the profile and exit non-zero on any failure.
// The rejection may not be an Error (it can be an Event or a plain value),
// so fall back from stack to message to the raw value instead of printing
// "undefined" or throwing on a nullish rejection.
main().catch(e => {
  console.error('fatal:', e?.stack ?? e?.message ?? e)
  process.exit(1)
})
|
||||
|
|
@ -195,12 +195,6 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
|
|||
const prevSessionKeyRef = useRef(sessionKey)
|
||||
const prevGroupCountRef = useRef(0)
|
||||
|
||||
// Track repins-in-a-row to break runaway loops during rapid layout churn.
|
||||
// In healthy paths this drains to zero between frames; we only need the
|
||||
// ceiling for pathological streaming bursts where content height keeps
|
||||
// growing every frame.
|
||||
const inFlightPinDepthRef = useRef(0)
|
||||
|
||||
const pinToBottom = useCallback(() => {
|
||||
const el = scrollerRef.current
|
||||
|
||||
|
|
@ -247,41 +241,20 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
|
|||
const top = el.scrollTop
|
||||
|
||||
// If this scroll event is the consequence of `pinToBottom` writing
|
||||
// `el.scrollTop`, treat it as ours: never disarm, just consume the
|
||||
// gate. If we landed short of bottom (because content also grew in
|
||||
// the same frame and the browser clamped our scrollTop = scrollHeight
|
||||
// write to the now-stale scrollHeight - clientHeight), schedule
|
||||
// another pin on the next frame. Without this the post-pin scrollTop
|
||||
// gets misread as the user scrolling up, disarming sticky-bottom
|
||||
// permanently and leaving the just-submitted message below the fold.
|
||||
// `el.scrollTop`, treat it as ours: don't disarm. The RO + rAF pin
|
||||
// loop will re-pin on the next frame if the browser clamped us
|
||||
// short of bottom (because content grew in the same frame).
|
||||
// Without this guard the post-pin scrollTop gets misread as the
|
||||
// user scrolling up, disarming sticky-bottom permanently and
|
||||
// leaving the just-submitted message below the fold.
|
||||
if (programmaticScrollPendingRef.current > 0) {
|
||||
programmaticScrollPendingRef.current -= 1
|
||||
lastTopRef.current = top
|
||||
// Stay armed regardless — sticky-bottom should hold through clamp
|
||||
// races.
|
||||
// Always re-arm — sticky-bottom should hold through clamp races.
|
||||
armedRef.current = true
|
||||
const atBottom = el.scrollHeight - (top + el.clientHeight) <= AT_BOTTOM_THRESHOLD
|
||||
setThreadScrolledUp(!atBottom)
|
||||
|
||||
if (atBottom) {
|
||||
inFlightPinDepthRef.current = 0
|
||||
} else if (inFlightPinDepthRef.current < 8) {
|
||||
// Re-pin synchronously: the browser already laid out for this
|
||||
// scroll event, so reading scrollHeight now gives us the up-to-date
|
||||
// value and writing scrollTop lands us at the actual bottom in the
|
||||
// same frame. Doing this in a rAF causes a 1-frame visual flicker
|
||||
// (distFromBottom briefly nonzero), so we accept one extra
|
||||
// synchronous pin cycle (which goes back through this very
|
||||
// handler with the counter incremented and arm preserved). The
|
||||
// depth guard prevents pathological runaway loops if content
|
||||
// height keeps growing every frame; 8 is generous for any
|
||||
// realistic rendering pattern.
|
||||
inFlightPinDepthRef.current += 1
|
||||
pinToBottom()
|
||||
} else {
|
||||
inFlightPinDepthRef.current = 0
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -318,7 +291,11 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
|
|||
}, [scrollerRef])
|
||||
|
||||
// Follow content growth (streaming, item measurements, loading indicator)
|
||||
// while armed.
|
||||
// while armed. During fast streaming the ResizeObserver can fire many
|
||||
// times per frame as Streamdown re-tokenizes; coalesce to one pin per
|
||||
// animation frame so we don't run the scroll-event/re-pin chain
|
||||
// (~20+ ms self in `Virtualizer.getMaxScrollOffset`) several times per
|
||||
// token.
|
||||
useEffect(() => {
|
||||
if (!enabled) {
|
||||
return undefined
|
||||
|
|
@ -330,11 +307,21 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
|
|||
return undefined
|
||||
}
|
||||
|
||||
const observer = new ResizeObserver(() => {
|
||||
if (armedRef.current) {
|
||||
pinToBottom()
|
||||
let pinRafScheduled = false
|
||||
const schedulePin = () => {
|
||||
if (pinRafScheduled || !armedRef.current) {
|
||||
return
|
||||
}
|
||||
})
|
||||
pinRafScheduled = true
|
||||
requestAnimationFrame(() => {
|
||||
pinRafScheduled = false
|
||||
if (armedRef.current) {
|
||||
pinToBottom()
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
const observer = new ResizeObserver(schedulePin)
|
||||
|
||||
observer.observe(el)
|
||||
|
||||
|
|
@ -366,6 +353,15 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
|
|||
// mutation but before the browser paints. Without this, there's a ~50ms
|
||||
// visual window where the new message sits below the fold while we wait
|
||||
// for the ResizeObserver / scroll event chain to fire and re-pin.
|
||||
//
|
||||
// We pin TWICE in this critical path — once synchronously, then once on
|
||||
// the next rAF. The second pin catches the case where React mounts the
|
||||
// new message in the second commit (after our layout effect ran), which
|
||||
// grows scrollHeight again; without the rAF pin the user briefly sees a
|
||||
// ~15 px gap below the new message until the RO catches up. Streaming
|
||||
// tokens use the rate-limited RO path only; only the group-count change
|
||||
// (which fires once per user submit / new turn arrival) pays for the
|
||||
// extra pin.
|
||||
const prevGroupCountForLayoutRef = useRef(groupCount)
|
||||
useLayoutEffect(() => {
|
||||
if (!enabled) {
|
||||
|
|
@ -373,6 +369,11 @@ function useThreadScrollAnchor({ enabled, groupCount, scrollerRef, sessionKey, v
|
|||
}
|
||||
if (groupCount > prevGroupCountForLayoutRef.current && armedRef.current) {
|
||||
pinToBottom()
|
||||
requestAnimationFrame(() => {
|
||||
if (armedRef.current) {
|
||||
pinToBottom()
|
||||
}
|
||||
})
|
||||
}
|
||||
prevGroupCountForLayoutRef.current = groupCount
|
||||
}, [enabled, groupCount, pinToBottom])
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue