mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-25 11:02:03 +00:00
Automatic heap dumps from the TUI memory monitor could write multi-GiB
.heapsnapshot files on every threshold cross, growing ~/.hermes/heapdumps
to tens of GiB. Add four layered safeguards:
- Gate auto-high/auto-critical snapshots behind HERMES_AUTO_HEAPDUMP=1;
manual dumps remain unchanged.
- Always write the lightweight diagnostics JSON sidecar so users still
get an actionable artifact when the snapshot is suppressed.
- Cap total bytes in the dump dir (HERMES_HEAPDUMP_MAX_BYTES, default
2 GiB), evicting oldest first, retaining the newest.
- Add a cooldown between auto dumps (HERMES_AUTO_HEAPDUMP_COOLDOWN_MS,
default 10 min) so an oscillating heap can't re-trigger.
Closes #21767
246 lines
8.5 KiB
TypeScript
246 lines
8.5 KiB
TypeScript
import { createWriteStream } from 'node:fs'
|
|
import { mkdir, readdir, readFile, stat, unlink, writeFile } from 'node:fs/promises'
|
|
import { homedir, tmpdir } from 'node:os'
|
|
import { join } from 'node:path'
|
|
import { pipeline } from 'node:stream/promises'
|
|
import { getHeapSnapshot, getHeapSpaceStatistics, getHeapStatistics } from 'node:v8'
|
|
|
|
export type MemoryTrigger = 'auto-critical' | 'auto-high' | 'manual'
|
|
|
|
export interface MemoryDiagnostics {
|
|
activeHandles: number
|
|
activeRequests: number
|
|
analysis: {
|
|
potentialLeaks: string[]
|
|
recommendation: string
|
|
}
|
|
memoryGrowthRate: {
|
|
bytesPerSecond: number
|
|
mbPerHour: number
|
|
}
|
|
memoryUsage: {
|
|
arrayBuffers: number
|
|
external: number
|
|
heapTotal: number
|
|
heapUsed: number
|
|
rss: number
|
|
}
|
|
nodeVersion: string
|
|
openFileDescriptors?: number
|
|
platform: string
|
|
resourceUsage: {
|
|
maxRSS: number
|
|
systemCPUTime: number
|
|
userCPUTime: number
|
|
}
|
|
smapsRollup?: string
|
|
timestamp: string
|
|
trigger: MemoryTrigger
|
|
uptimeSeconds: number
|
|
v8HeapSpaces?: { available: number; name: string; size: number; used: number }[]
|
|
v8HeapStats: {
|
|
detachedContexts: number
|
|
heapSizeLimit: number
|
|
mallocedMemory: number
|
|
nativeContexts: number
|
|
peakMallocedMemory: number
|
|
}
|
|
}
|
|
|
|
export interface HeapDumpResult {
|
|
diagPath?: string
|
|
error?: string
|
|
heapPath?: string
|
|
// True when an auto trigger wrote diagnostics only and intentionally skipped
|
|
// the heavy snapshot because HERMES_AUTO_HEAPDUMP was not enabled (#21767).
|
|
suppressed?: boolean
|
|
success: boolean
|
|
}
|
|
|
|
export async function captureMemoryDiagnostics(trigger: MemoryTrigger): Promise<MemoryDiagnostics> {
|
|
const usage = process.memoryUsage()
|
|
const heapStats = getHeapStatistics()
|
|
const resourceUsage = process.resourceUsage()
|
|
const uptimeSeconds = process.uptime()
|
|
|
|
// Not available on Bun / older Node.
|
|
let heapSpaces: ReturnType<typeof getHeapSpaceStatistics> | undefined
|
|
|
|
try {
|
|
heapSpaces = getHeapSpaceStatistics()
|
|
} catch {
|
|
/* noop */
|
|
}
|
|
|
|
const internals = process as unknown as {
|
|
_getActiveHandles: () => unknown[]
|
|
_getActiveRequests: () => unknown[]
|
|
}
|
|
|
|
const activeHandles = internals._getActiveHandles().length
|
|
const activeRequests = internals._getActiveRequests().length
|
|
const openFileDescriptors = await swallow(async () => (await readdir('/proc/self/fd')).length)
|
|
const smapsRollup = await swallow(() => readFile('/proc/self/smaps_rollup', 'utf8'))
|
|
|
|
const nativeMemory = usage.rss - usage.heapUsed
|
|
// Real growth rate since STARTED_AT (captured at module load) — NOT a lifetime
|
|
// average of rss/uptime, which would report phantom "growth" for a stable process.
|
|
const elapsed = Math.max(0, uptimeSeconds - STARTED_AT.uptime)
|
|
const bytesPerSecond = elapsed > 0 ? (usage.rss - STARTED_AT.rss) / elapsed : 0
|
|
const mbPerHour = (bytesPerSecond * 3600) / (1024 * 1024)
|
|
|
|
const potentialLeaks = [
|
|
heapStats.number_of_detached_contexts > 0 &&
|
|
`${heapStats.number_of_detached_contexts} detached context(s) — possible component/closure leak`,
|
|
activeHandles > 100 && `${activeHandles} active handles — possible timer/socket leak`,
|
|
nativeMemory > usage.heapUsed && 'Native memory > heap — leak may be in native addons',
|
|
mbPerHour > 100 && `High memory growth rate: ${mbPerHour.toFixed(1)} MB/hour`,
|
|
openFileDescriptors && openFileDescriptors > 500 && `${openFileDescriptors} open FDs — possible file/socket leak`
|
|
].filter((s): s is string => typeof s === 'string')
|
|
|
|
return {
|
|
activeHandles,
|
|
activeRequests,
|
|
analysis: {
|
|
potentialLeaks,
|
|
recommendation: potentialLeaks.length
|
|
? `WARNING: ${potentialLeaks.length} potential leak indicator(s). See potentialLeaks.`
|
|
: 'No obvious leak indicators. Inspect heap snapshot for retained objects.'
|
|
},
|
|
memoryGrowthRate: { bytesPerSecond, mbPerHour },
|
|
memoryUsage: {
|
|
arrayBuffers: usage.arrayBuffers,
|
|
external: usage.external,
|
|
heapTotal: usage.heapTotal,
|
|
heapUsed: usage.heapUsed,
|
|
rss: usage.rss
|
|
},
|
|
nodeVersion: process.version,
|
|
openFileDescriptors,
|
|
platform: process.platform,
|
|
resourceUsage: {
|
|
maxRSS: resourceUsage.maxRSS * 1024,
|
|
systemCPUTime: resourceUsage.systemCPUTime,
|
|
userCPUTime: resourceUsage.userCPUTime
|
|
},
|
|
smapsRollup,
|
|
timestamp: new Date().toISOString(),
|
|
trigger,
|
|
uptimeSeconds,
|
|
v8HeapSpaces: heapSpaces?.map(s => ({
|
|
available: s.space_available_size,
|
|
name: s.space_name,
|
|
size: s.space_size,
|
|
used: s.space_used_size
|
|
})),
|
|
v8HeapStats: {
|
|
detachedContexts: heapStats.number_of_detached_contexts,
|
|
heapSizeLimit: heapStats.heap_size_limit,
|
|
mallocedMemory: heapStats.malloced_memory,
|
|
nativeContexts: heapStats.number_of_native_contexts,
|
|
peakMallocedMemory: heapStats.peak_malloced_memory
|
|
}
|
|
}
|
|
}
|
|
|
|
export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promise<HeapDumpResult> {
|
|
try {
|
|
// Diagnostics first — heap-snapshot serialization can crash on very large
|
|
// heaps, and the JSON sidecar is the most actionable artifact if so.
|
|
const diagnostics = await captureMemoryDiagnostics(trigger)
|
|
const dir = process.env.HERMES_HEAPDUMP_DIR?.trim() || join(homedir() || tmpdir(), '.hermes', 'heapdumps')
|
|
|
|
await mkdir(dir, { recursive: true })
|
|
|
|
const base = `hermes-${new Date().toISOString().replace(/[:.]/g, '-')}-${process.pid}-${trigger}`
|
|
const heapPath = join(dir, `${base}.heapsnapshot`)
|
|
const diagPath = join(dir, `${base}.diagnostics.json`)
|
|
|
|
// The diagnostics JSON is KB-sized and the most useful artifact when a
|
|
// full snapshot is suppressed by the auto-heapdump opt-in gate below.
|
|
await writeFile(diagPath, JSON.stringify(diagnostics, null, 2), { mode: 0o600 })
|
|
|
|
// Auto triggers require explicit opt-in: multi-GiB snapshots written on
|
|
// every threshold cross can fill the user's disk (issue #21767).
|
|
const isAuto = trigger === 'auto-critical' || trigger === 'auto-high'
|
|
const autoEnabled = /^(?:1|true|yes|on)$/i.test((process.env.HERMES_AUTO_HEAPDUMP ?? '').trim())
|
|
|
|
if (isAuto && !autoEnabled) {
|
|
await pruneHeapdumps(dir).catch(() => undefined)
|
|
|
|
// Not an error: the dump did its job — it wrote the lightweight
|
|
// diagnostics sidecar and intentionally skipped the heavy snapshot.
|
|
// `heapPath` is omitted so callers/notices report diagnostics-only.
|
|
return { diagPath, suppressed: true, success: true }
|
|
}
|
|
|
|
await pipeline(getHeapSnapshot(), createWriteStream(heapPath, { mode: 0o600 }))
|
|
await pruneHeapdumps(dir).catch(() => undefined)
|
|
|
|
return { diagPath, heapPath, success: true }
|
|
} catch (e) {
|
|
return { error: e instanceof Error ? e.message : String(e), success: false }
|
|
}
|
|
}
|
|
|
|
// Cap total bytes of files in `dir`, deleting oldest first. Covers both
|
|
// `.heapsnapshot` and `.diagnostics.json` artifacts so orphan sidecars from
|
|
// gated auto-triggers cannot accumulate without bound. The newest file is
|
|
// always retained even if it alone exceeds the cap.
|
|
async function pruneHeapdumps(dir: string): Promise<void> {
|
|
const raw = process.env.HERMES_HEAPDUMP_MAX_BYTES?.trim()
|
|
const parsed = raw ? Number(raw) : NaN
|
|
const cap = Number.isFinite(parsed) && parsed > 0 ? parsed : 2 * 1024 ** 3
|
|
|
|
const names = await readdir(dir)
|
|
|
|
const stats = await Promise.all(
|
|
names.map(async name => {
|
|
const path = join(dir, name)
|
|
const s = await stat(path).catch(() => null)
|
|
|
|
return s && s.isFile() ? { mtimeMs: s.mtimeMs, path, size: s.size } : null
|
|
})
|
|
)
|
|
|
|
const valid = stats.filter((s): s is { mtimeMs: number; path: string; size: number } => s !== null)
|
|
|
|
valid.sort((a, b) => b.mtimeMs - a.mtimeMs)
|
|
|
|
let total = valid.reduce((acc, s) => acc + s.size, 0)
|
|
|
|
while (total > cap && valid.length > 1) {
|
|
const oldest = valid.pop()
|
|
|
|
if (!oldest) {
|
|
break
|
|
}
|
|
|
|
await unlink(oldest.path).catch(() => undefined)
|
|
total -= oldest.size
|
|
}
|
|
}
|
|
|
|
export function formatBytes(bytes: number): string {
|
|
if (!Number.isFinite(bytes) || bytes <= 0) {
|
|
return '0B'
|
|
}
|
|
|
|
const exp = Math.min(UNITS.length - 1, Math.floor(Math.log10(bytes) / 3))
|
|
const value = bytes / 1024 ** exp
|
|
|
|
return `${value >= 100 ? value.toFixed(0) : value.toFixed(1)}${UNITS[exp]}`
|
|
}
|
|
|
|
const UNITS = ['B', 'KB', 'MB', 'GB', 'TB']
|
|
|
|
const STARTED_AT = { rss: process.memoryUsage().rss, uptime: process.uptime() }
|
|
|
|
// Returns undefined when the probe isn't available (non-Linux paths, sandboxed FS).
|
|
const swallow = async <T>(fn: () => Promise<T>): Promise<T | undefined> => {
|
|
try {
|
|
return await fn()
|
|
} catch {
|
|
return undefined
|
|
}
|
|
}
|