mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
Automatic heap dumps from the TUI memory monitor could write multi-GiB
.heapsnapshot files on every threshold cross, growing ~/.hermes/heapdumps
to tens of GiB. Add four layered safeguards:
- Gate auto-high/auto-critical snapshots behind HERMES_AUTO_HEAPDUMP=1;
manual dumps remain unchanged.
- Always write the lightweight diagnostics JSON sidecar so users still
get an actionable artifact when the snapshot is suppressed.
- Cap total bytes in the dump dir (HERMES_HEAPDUMP_MAX_BYTES, default
2 GiB), evicting oldest first, retaining the newest.
- Add a cooldown between auto dumps (HERMES_AUTO_HEAPDUMP_COOLDOWN_MS,
default 10 min) so an oscillating heap can't re-trigger.
Closes #21767
184 lines
7 KiB
TypeScript
184 lines
7 KiB
TypeScript
import { getHeapStatistics } from 'node:v8'
|
|
|
|
import { type HeapDumpResult, performHeapDump } from './memory.js'
|
|
|
|
/**
 * Heap-pressure band assigned to a memory reading by the monitor:
 * `critical` at or above the critical threshold, `high` at or above the high
 * threshold, `normal` otherwise.
 */
export type MemoryLevel = 'critical' | 'high' | 'normal'
|
|
|
|
/**
 * Point-in-time memory reading handed to the monitor callbacks.
 */
export interface MemorySnapshot {
  /** `heapUsed` as reported by `process.memoryUsage()`, in bytes. */
  heapUsed: number
  /** Pressure band the monitor assigned to this reading. */
  level: MemoryLevel
  /** `rss` as reported by `process.memoryUsage()`, in bytes. */
  rss: number
}
|
|
|
|
/**
 * Options accepted by `startMemoryMonitor`. Every field is optional.
 */
export interface MemoryMonitorOptions {
  /** Heap size in bytes at which a reading is classified `critical`. Derived from the V8 heap limit when omitted. */
  criticalBytes?: number
  /** Heap size in bytes at which a reading is classified `high`. Derived from the V8 heap limit when omitted. */
  highBytes?: number
  /** Sampling period in milliseconds (defaults to 10 000). */
  intervalMs?: number
  /** Called when a `critical` reading is handled; `dump` is `null` when no heap dump result is available. */
  onCritical?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
  /** Called when a `high` reading is handled; `dump` is `null` when no heap dump result is available. */
  onHigh?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
  // Fired ONCE when heap growth looks abnormal while still far below the
  // critical exit threshold — the regime where the TUI used to die silently
  // (#34095: Node OOMs from an Ink render-tree blowup at a few hundred MB,
  // well under criticalBytes, so onCritical never fired and the gateway death
  // showed up only as a bare `stdin EOF`). A visible warning here makes that
  // class of death diagnosable instead of silent.
  onWarn?: (snap: MemorySnapshot) => void
  /** Absolute heap floor in bytes that must be reached before `onWarn` can fire (defaults to 600 MiB). */
  warnBytes?: number
}
|
|
|
|
const GB = 1024 ** 3
const MB = 1024 ** 2

// Resolve the exit / dump thresholds RELATIVE to the actual V8 heap ceiling
// (--max-old-space-size, 8GB for the TUI) instead of hardcoding 2.5GB. The old
// constant killed the process — and silently closed the gateway's stdin — at
// ~31% of an 8GB ceiling, treating a normal long-session heap as an OOM. We now
// exit only when genuinely near the ceiling (critical ~88%, high ~70%), and
// clamp to sane floors/ceilings so a tiny --max-old-space-size can't drive the
// thresholds below the warn watermark. Callers may still override explicitly.
/**
 * Computes the `critical` and `high` heap thresholds in bytes.
 *
 * @param criticalBytes Explicit critical threshold; used verbatim when given.
 * @param highBytes Explicit high threshold; used verbatim when given.
 * @returns The resolved thresholds. A derived `high` never exceeds `critical`.
 */
function resolveThresholds(criticalBytes?: number, highBytes?: number): { critical: number; high: number } {
  let limit = 0

  try {
    limit = getHeapStatistics().heap_size_limit || 0
  } catch {
    limit = 0
  }

  // Fall back to the historical 8GB ceiling if V8 doesn't report one.
  const ceiling = limit > 0 ? limit : 8 * GB
  const critical = criticalBytes ?? Math.max(2 * GB, Math.round(ceiling * 0.88))

  // The 1GB floor can exceed an explicitly small `criticalBytes`, which would
  // invert the bands; cap the derived value so high <= critical always holds.
  const derivedHigh = Math.max(1 * GB, Math.min(critical - 256 * MB, Math.round(ceiling * 0.7)))
  const high = highBytes ?? Math.min(critical, derivedHigh)

  return { critical, high }
}
|
|
|
|
// Deferred @hermes/ink import: loading `@hermes/ink` at module top-level
// pulls the full ~414KB Ink bundle (React, renderer, components, hooks) onto
// the critical path before the Python gateway can even be spawned. That
// serialised roughly 150ms of Node work in front of gw.start() on every
// cold `hermes --tui` launch.
//
// evictInkCaches only runs inside `tick()`, which fires on a 10s timer and
// only when heap pressure crosses the high-water mark — by then Ink has
// long since been loaded by the app entry. This dynamic import is a no-op
// on the hot path (module is already in the ESM cache); when a startup
// spike somehow trips the threshold before the app registers its own Ink
// import, we pay the load cost exactly once, inside the tick that needs it.
let _evictInkCaches: ((level: 'all' | 'half') => unknown) | null = null
let _evictInkCachesPromise: Promise<(level: 'all' | 'half') => unknown> | null = null

/**
 * Resolves `evictInkCaches` from `@hermes/ink`, importing the module lazily.
 *
 * The resolved function is memoised, and concurrent callers share one pending
 * import. A failed import clears the pending promise so a later call retries.
 *
 * @returns The `evictInkCaches` export of `@hermes/ink`.
 * @throws Whatever the dynamic import rejects with.
 */
async function _ensureEvictInkCaches(): Promise<(level: 'all' | 'half') => unknown> {
  if (_evictInkCaches) {
    return _evictInkCaches
  }

  _evictInkCachesPromise ??= import('@hermes/ink')
    .then(mod => {
      _evictInkCaches = mod.evictInkCaches as (level: 'all' | 'half') => unknown

      return _evictInkCaches
    })
    .catch(err => {
      // Drop the failed promise so the next caller starts a fresh import.
      _evictInkCachesPromise = null

      throw err
    })

  return _evictInkCachesPromise
}
|
|
|
|
/**
 * Starts a periodic heap monitor.
 *
 * Each tick samples `process.memoryUsage()`, optionally fires the one-shot
 * `onWarn` growth warning, classifies the reading, and for a `high` or
 * `critical` reading prunes Ink caches, attempts a heap dump and invokes the
 * matching callback. The timer is unref'd so it does not keep the process alive.
 *
 * @param options Thresholds, sampling period and callbacks; all optional.
 * @returns A function that stops the monitor by clearing its interval.
 */
export function startMemoryMonitor({
  criticalBytes,
  highBytes,
  intervalMs = 10_000,
  onCritical,
  onHigh,
  onWarn,
  warnBytes = 600 * MB
}: MemoryMonitorOptions = {}): () => void {
  const { critical, high } = resolveThresholds(criticalBytes, highBytes)
  // Levels already handled since the heap last returned to `normal`.
  const dumped = new Set<Exclude<MemoryLevel, 'normal'>>()
  // Levels whose handling is still awaiting the eviction / dump promises.
  const inFlight = new Set<Exclude<MemoryLevel, 'normal'>>()

  // Early-warning state (#34095): the silent-death regime is BELOW `high`, so
  // the level machine above never sees it. Track the previous sample and fire
  // onWarn at most once when heap both crosses a modest absolute floor AND is
  // climbing steeply (≥150MB between 10s ticks) — the signature of a render-
  // tree blowup — so the user gets a visible heads-up before Node OOMs under
  // the exit threshold. Re-armed only after heap falls back below the floor.
  // `lastHeap < 0` marks the un-seeded first sample so a cold start that opens
  // already-high can't be mistaken for sudden growth (growth = current - last).
  let lastHeap = -1
  let warned = false
  const WARN_GROWTH_STEP = 150 * MB

  // Cooldown prevents repeated auto dumps when heap oscillates around the
  // threshold (issue #21767). `dumped` alone is not enough — it clears on
  // every transition back to `normal`.
  const cooldownRaw = process.env.HERMES_AUTO_HEAPDUMP_COOLDOWN_MS?.trim()
  const cooldownParsed = cooldownRaw ? Number(cooldownRaw) : NaN
  // Missing, non-numeric or negative values fall back to 10 minutes.
  const cooldownMs = Number.isFinite(cooldownParsed) && cooldownParsed >= 0 ? cooldownParsed : 600_000
  let lastAutoDumpAt = 0

  const tick = async () => {
    const { heapUsed, rss } = process.memoryUsage()

    // Sub-threshold abnormal-growth warning. Skip on the first (un-seeded)
    // sample — we need a prior reading to measure a delta against.
    if (heapUsed < high && lastHeap >= 0) {
      if (!warned && heapUsed >= warnBytes && heapUsed - lastHeap >= WARN_GROWTH_STEP) {
        warned = true
        onWarn?.({ heapUsed, level: 'normal', rss })
      } else if (heapUsed < warnBytes) {
        warned = false
      }
    }

    lastHeap = heapUsed

    const level: MemoryLevel = heapUsed >= critical ? 'critical' : heapUsed >= high ? 'high' : 'normal'

    if (level === 'normal') {
      dumped.clear()

      return
    }

    if (dumped.has(level) || inFlight.has(level)) {
      return
    }

    const coolingDown = Date.now() - lastAutoDumpAt < cooldownMs

    // Inside the cooldown window a `high` reading is skipped entirely and
    // retried on a later tick. A `critical` reading must still be handled —
    // otherwise an escalation shortly after a `high` dump would get no cache
    // eviction and no onCritical for up to `cooldownMs` — so for `critical`
    // the cooldown suppresses only the heap dump itself.
    if (coolingDown && level === 'high') {
      return
    }

    inFlight.add(level)

    // Stamp synchronously, before any await, so an overlapping tick for the
    // other level already observes the cooldown.
    if (!coolingDown) {
      lastAutoDumpAt = Date.now()
    }

    // Prune Ink content caches before dump/exit — half on 'high' (recoverable),
    // full on 'critical' (post-dump RSS reduction, keeps user running).
    // Deferred import keeps `@hermes/ink` off the cold-start critical path;
    // by the time a tick fires 10s after launch the app has already loaded
    // the same module, so this resolves instantly from the ESM cache.
    try {
      try {
        const evictInkCaches = await _ensureEvictInkCaches()

        evictInkCaches(level === 'critical' ? 'all' : 'half')
      } catch {
        // Best-effort: if the dynamic import fails for any reason we still
        // continue to the heap dump below so the user gets diagnostics.
      }

      dumped.add(level)

      // A rejected dump is reported to the callback as `null`, as is a dump
      // skipped because of the cooldown.
      const dump = coolingDown
        ? null
        : await performHeapDump(level === 'critical' ? 'auto-critical' : 'auto-high').catch(() => null)

      const snap: MemorySnapshot = { heapUsed, level, rss }

      ;(level === 'critical' ? onCritical : onHigh)?.(snap, dump)
    } finally {
      inFlight.delete(level)
    }
  }

  const handle = setInterval(() => void tick(), intervalMs)

  // Do not let the monitor timer alone keep the process alive.
  handle.unref?.()

  return () => clearInterval(handle)
}
|