fix(tui): guard automatic heap dumps against disk fill

Automatic heap dumps from the TUI memory monitor could write multi-GiB
  .heapsnapshot files on every threshold crossing, growing
  ~/.hermes/heapdumps to tens of GiB. Add the following four layered
  safeguards:

  - Gate auto-high/auto-critical snapshots behind HERMES_AUTO_HEAPDUMP=1;
    manual dumps remain unchanged.
  - Always write the lightweight diagnostics JSON sidecar so users still
    get an actionable artifact when the snapshot is suppressed.
  - Cap total bytes in the dump dir (HERMES_HEAPDUMP_MAX_BYTES, default
    2 GiB), evicting oldest first, retaining the newest.
  - Add a cooldown between auto dumps (HERMES_AUTO_HEAPDUMP_COOLDOWN_MS,
    default 10 min) so an oscillating heap can't re-trigger.

  Closes #21767
This commit is contained in:
alarcritty 2026-05-08 15:22:26 +05:30 committed by Teknium
parent dd0d1222a2
commit 8ae0d054f4
3 changed files with 74 additions and 2 deletions

View file

@ -37,7 +37,7 @@ const gw = new GatewayClient()
gw.start()
const dumpNotice = (snap: MemorySnapshot, dump: HeapDumpResult | null) =>
`hermes-tui: ${snap.level} memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? '(failed)'}\n`
`hermes-tui: ${snap.level} memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? dump?.diagPath ?? '(failed)'}\n`
setupGracefulExit({
cleanups: [

View file

@ -1,5 +1,5 @@
import { createWriteStream } from 'node:fs'
import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises'
import { mkdir, readdir, readFile, stat, unlink, writeFile } from 'node:fs/promises'
import { homedir, tmpdir } from 'node:os'
import { join } from 'node:path'
import { pipeline } from 'node:stream/promises'
@ -51,6 +51,9 @@ export interface HeapDumpResult {
diagPath?: string
error?: string
heapPath?: string
// True when an auto trigger wrote diagnostics only and intentionally skipped
// the heavy snapshot because HERMES_AUTO_HEAPDUMP was not enabled (#21767).
suppressed?: boolean
success: boolean
}
@ -153,8 +156,26 @@ export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promis
const heapPath = join(dir, `${base}.heapsnapshot`)
const diagPath = join(dir, `${base}.diagnostics.json`)
// The diagnostics JSON is KB-sized and the most useful artifact when a
// full snapshot is suppressed by the auto-heapdump opt-in gate below.
await writeFile(diagPath, JSON.stringify(diagnostics, null, 2), { mode: 0o600 })
// Auto triggers require explicit opt-in: multi-GiB snapshots written on
// every threshold cross can fill the user's disk (issue #21767).
const isAuto = trigger === 'auto-critical' || trigger === 'auto-high'
const autoEnabled = /^(?:1|true|yes|on)$/i.test((process.env.HERMES_AUTO_HEAPDUMP ?? '').trim())
if (isAuto && !autoEnabled) {
await pruneHeapdumps(dir).catch(() => undefined)
// Not an error: the dump did its job — it wrote the lightweight
// diagnostics sidecar and intentionally skipped the heavy snapshot.
// `heapPath` is omitted so callers/notices report diagnostics-only.
return { diagPath, suppressed: true, success: true }
}
await pipeline(getHeapSnapshot(), createWriteStream(heapPath, { mode: 0o600 }))
await pruneHeapdumps(dir).catch(() => undefined)
return { diagPath, heapPath, success: true }
} catch (e) {
@ -162,6 +183,44 @@ export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promis
}
}
// Cap the total bytes of heap-dump artifacts in `dir`, deleting oldest first.
// Only `.heapsnapshot` and `.diagnostics.json` files are counted and eligible
// for deletion, so orphan sidecars from gated auto-triggers cannot accumulate
// without bound, while any unrelated file that happens to live in `dir` is
// never touched. The newest artifact is always retained even if it alone
// exceeds the cap.
//
// The cap is read from HERMES_HEAPDUMP_MAX_BYTES (a positive, finite number of
// bytes); any other value falls back to 2 GiB. Per-file stat/unlink failures
// are ignored (best effort); a failing readdir rejects the returned promise.
async function pruneHeapdumps(dir: string): Promise<void> {
  const artifactSuffixes = ['.heapsnapshot', '.diagnostics.json']

  const raw = process.env.HERMES_HEAPDUMP_MAX_BYTES?.trim()
  const parsed = raw ? Number(raw) : NaN
  const cap = Number.isFinite(parsed) && parsed > 0 ? parsed : 2 * 1024 ** 3

  // Restrict pruning to files this module writes; everything else in the
  // directory is left alone and does not count against the cap.
  const names = (await readdir(dir)).filter(name => artifactSuffixes.some(suffix => name.endsWith(suffix)))

  const stats = await Promise.all(
    names.map(async name => {
      const path = join(dir, name)
      // A file may vanish between readdir and stat; treat it as absent.
      const s = await stat(path).catch(() => null)

      return s && s.isFile() ? { mtimeMs: s.mtimeMs, path, size: s.size } : null
    })
  )

  const valid = stats.filter((s): s is { mtimeMs: number; path: string; size: number } => s !== null)

  // Newest first, so `pop()` always yields the oldest remaining artifact.
  valid.sort((a, b) => b.mtimeMs - a.mtimeMs)

  let total = valid.reduce((acc, s) => acc + s.size, 0)

  // `length > 1` keeps the newest artifact even when it alone exceeds the cap.
  while (total > cap && valid.length > 1) {
    const oldest = valid.pop()

    if (!oldest) {
      break
    }

    // Best effort: a failed unlink must not abort the rest of the prune.
    await unlink(oldest.path).catch(() => undefined)
    total -= oldest.size
  }
}
export function formatBytes(bytes: number): string {
if (!Number.isFinite(bytes) || bytes <= 0) {
return '0B'

View file

@ -111,6 +111,14 @@ export function startMemoryMonitor({
let warned = false
const WARN_GROWTH_STEP = 150 * MB
// Cooldown prevents repeated auto dumps when heap oscillates around the
// threshold (issue #21767). `dumped` alone is not enough — it clears on
// every transition back to `normal`.
const cooldownRaw = process.env.HERMES_AUTO_HEAPDUMP_COOLDOWN_MS?.trim()
const cooldownParsed = cooldownRaw ? Number(cooldownRaw) : NaN
const cooldownMs = Number.isFinite(cooldownParsed) && cooldownParsed >= 0 ? cooldownParsed : 600_000
let lastAutoDumpAt = 0
const tick = async () => {
const { heapUsed, rss } = process.memoryUsage()
@ -137,7 +145,12 @@ export function startMemoryMonitor({
return
}
if (Date.now() - lastAutoDumpAt < cooldownMs) {
return
}
inFlight.add(level)
lastAutoDumpAt = Date.now()
// Prune Ink content caches before dump/exit — half on 'high' (recoverable),
// full on 'critical' (post-dump RSS reduction, keeps user running).