From 8ae0d054f4d3bb5dd1f376365c5ebb0091b6f9e7 Mon Sep 17 00:00:00 2001 From: alarcritty Date: Fri, 8 May 2026 15:22:26 +0530 Subject: [PATCH] fix(tui): guard automatic heap dumps against disk fill Automatic heap dumps from the TUI memory monitor could write multi-GiB .heapsnapshot files on every threshold cross, growing ~/.hermes/heapdumps to tens of GiB. Add four layered safeguards: - Gate auto-high/auto-critical snapshots behind HERMES_AUTO_HEAPDUMP=1; manual dumps remain unchanged. - Always write the lightweight diagnostics JSON sidecar so users still get an actionable artifact when the snapshot is suppressed. - Cap total bytes in the dump dir (HERMES_HEAPDUMP_MAX_BYTES, default 2 GiB), evicting oldest first, retaining the newest. - Add a cooldown between auto dumps (HERMES_AUTO_HEAPDUMP_COOLDOWN_MS, default 10 min) so an oscillating heap can't re-trigger. Closes #21767 --- ui-tui/src/entry.tsx | 2 +- ui-tui/src/lib/memory.ts | 61 ++++++++++++++++++++++++++++++++- ui-tui/src/lib/memoryMonitor.ts | 13 +++++++ 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/ui-tui/src/entry.tsx b/ui-tui/src/entry.tsx index d1b7b9b4963..a4125981216 100644 --- a/ui-tui/src/entry.tsx +++ b/ui-tui/src/entry.tsx @@ -37,7 +37,7 @@ const gw = new GatewayClient() gw.start() const dumpNotice = (snap: MemorySnapshot, dump: HeapDumpResult | null) => - `hermes-tui: ${snap.level} memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? '(failed)'}\n` + `hermes-tui: ${snap.level} memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? dump?.diagPath ?? '(failed)'}\n` setupGracefulExit({ cleanups: [ diff --git a/ui-tui/src/lib/memory.ts b/ui-tui/src/lib/memory.ts index 9f157adffc8..664b0560ef5 100644 --- a/ui-tui/src/lib/memory.ts +++ b/ui-tui/src/lib/memory.ts @@ -1,5 +1,5 @@ import { createWriteStream } from 'node:fs' -import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises' +import { mkdir, readdir, readFile, stat, unlink, writeFile } from 'node:fs/promises' import { homedir, tmpdir } from 'node:os' import { join } from 'node:path' import { pipeline } from 'node:stream/promises' @@ -51,6 +51,9 @@ export interface HeapDumpResult { diagPath?: string error?: string heapPath?: string + // True when an auto trigger wrote diagnostics only and intentionally skipped + // the heavy snapshot because HERMES_AUTO_HEAPDUMP was not enabled (#21767). + suppressed?: boolean success: boolean } @@ -153,8 +156,26 @@ export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promis const heapPath = join(dir, `${base}.heapsnapshot`) const diagPath = join(dir, `${base}.diagnostics.json`) + // The diagnostics JSON is KB-sized and the most useful artifact when a + // full snapshot is suppressed by the auto-heapdump opt-in gate below. await writeFile(diagPath, JSON.stringify(diagnostics, null, 2), { mode: 0o600 }) + + // Auto triggers require explicit opt-in: multi-GiB snapshots written on + // every threshold cross can fill the user's disk (issue #21767). + const isAuto = trigger === 'auto-critical' || trigger === 'auto-high' + const autoEnabled = /^(?:1|true|yes|on)$/i.test((process.env.HERMES_AUTO_HEAPDUMP ?? '').trim()) + + if (isAuto && !autoEnabled) { + await pruneHeapdumps(dir).catch(() => undefined) + + // Not an error: the dump did its job — it wrote the lightweight + // diagnostics sidecar and intentionally skipped the heavy snapshot. + // `heapPath` is omitted so callers/notices report diagnostics-only. + return { diagPath, suppressed: true, success: true } + } + await pipeline(getHeapSnapshot(), createWriteStream(heapPath, { mode: 0o600 })) + await pruneHeapdumps(dir).catch(() => undefined) return { diagPath, heapPath, success: true } } catch (e) { @@ -162,6 +183,44 @@ export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promis } } +// Cap total bytes of files in `dir`, deleting oldest first. Covers both +// `.heapsnapshot` and `.diagnostics.json` artifacts so orphan sidecars from +// gated auto-triggers cannot accumulate without bound. The newest file is +// always retained even if it alone exceeds the cap. +async function pruneHeapdumps(dir: string): Promise { + const raw = process.env.HERMES_HEAPDUMP_MAX_BYTES?.trim() + const parsed = raw ? Number(raw) : NaN + const cap = Number.isFinite(parsed) && parsed > 0 ? parsed : 2 * 1024 ** 3 + + const names = await readdir(dir) + + const stats = await Promise.all( + names.map(async name => { + const path = join(dir, name) + const s = await stat(path).catch(() => null) + + return s && s.isFile() ? { mtimeMs: s.mtimeMs, path, size: s.size } : null + }) + ) + + const valid = stats.filter((s): s is { mtimeMs: number; path: string; size: number } => s !== null) + + valid.sort((a, b) => b.mtimeMs - a.mtimeMs) + + let total = valid.reduce((acc, s) => acc + s.size, 0) + + while (total > cap && valid.length > 1) { + const oldest = valid.pop() + + if (!oldest) { + break + } + + await unlink(oldest.path).catch(() => undefined) + total -= oldest.size + } +} + export function formatBytes(bytes: number): string { if (!Number.isFinite(bytes) || bytes <= 0) { return '0B' diff --git a/ui-tui/src/lib/memoryMonitor.ts b/ui-tui/src/lib/memoryMonitor.ts index 512421f9433..1cb25390609 100644 --- a/ui-tui/src/lib/memoryMonitor.ts +++ b/ui-tui/src/lib/memoryMonitor.ts @@ -111,6 +111,14 @@ export function startMemoryMonitor({ let warned = false const WARN_GROWTH_STEP = 150 * MB + // Cooldown prevents repeated auto dumps when heap oscillates around the + // threshold (issue #21767). `dumped` alone is not enough — it clears on + // every transition back to `normal`. + const cooldownRaw = process.env.HERMES_AUTO_HEAPDUMP_COOLDOWN_MS?.trim() + const cooldownParsed = cooldownRaw ? Number(cooldownRaw) : NaN + const cooldownMs = Number.isFinite(cooldownParsed) && cooldownParsed >= 0 ? cooldownParsed : 600_000 + let lastAutoDumpAt = 0 + const tick = async () => { const { heapUsed, rss } = process.memoryUsage() @@ -137,7 +145,12 @@ export function startMemoryMonitor({ return } + if (Date.now() - lastAutoDumpAt < cooldownMs) { + return + } + inFlight.add(level) + lastAutoDumpAt = Date.now() // Prune Ink content caches before dump/exit — half on 'high' (recoverable), // full on 'critical' (post-dump RSS reduction, keeps user running).