mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(tui): guard automatic heap dumps against disk fill
Automatic heap dumps from the TUI memory monitor could write multi-GiB
.heapsnapshot files on every threshold cross, growing ~/.hermes/heapdumps
to tens of GiB. Add four layered safeguards:
- Gate auto-high/auto-critical snapshots behind HERMES_AUTO_HEAPDUMP=1;
manual dumps remain unchanged.
- Always write the lightweight diagnostics JSON sidecar so users still
get an actionable artifact when the snapshot is suppressed.
- Cap total bytes in the dump dir (HERMES_HEAPDUMP_MAX_BYTES, default
2 GiB), evicting oldest first, retaining the newest.
- Add a cooldown between auto dumps (HERMES_AUTO_HEAPDUMP_COOLDOWN_MS,
default 10 min) so an oscillating heap can't re-trigger.
Closes #21767
This commit is contained in:
parent
dd0d1222a2
commit
8ae0d054f4
3 changed files with 74 additions and 2 deletions
|
|
@ -37,7 +37,7 @@ const gw = new GatewayClient()
|
|||
gw.start()
|
||||
|
||||
const dumpNotice = (snap: MemorySnapshot, dump: HeapDumpResult | null) =>
|
||||
`hermes-tui: ${snap.level} memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? '(failed)'}\n`
|
||||
`hermes-tui: ${snap.level} memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? dump?.diagPath ?? '(failed)'}\n`
|
||||
|
||||
setupGracefulExit({
|
||||
cleanups: [
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import { createWriteStream } from 'node:fs'
|
||||
import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises'
|
||||
import { mkdir, readdir, readFile, stat, unlink, writeFile } from 'node:fs/promises'
|
||||
import { homedir, tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { pipeline } from 'node:stream/promises'
|
||||
|
|
@ -51,6 +51,9 @@ export interface HeapDumpResult {
|
|||
diagPath?: string
|
||||
error?: string
|
||||
heapPath?: string
|
||||
// True when an auto trigger wrote diagnostics only and intentionally skipped
|
||||
// the heavy snapshot because HERMES_AUTO_HEAPDUMP was not enabled (#21767).
|
||||
suppressed?: boolean
|
||||
success: boolean
|
||||
}
|
||||
|
||||
|
|
@ -153,8 +156,26 @@ export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promis
|
|||
const heapPath = join(dir, `${base}.heapsnapshot`)
|
||||
const diagPath = join(dir, `${base}.diagnostics.json`)
|
||||
|
||||
// The diagnostics JSON is KB-sized and the most useful artifact when a
|
||||
// full snapshot is suppressed by the auto-heapdump opt-in gate below.
|
||||
await writeFile(diagPath, JSON.stringify(diagnostics, null, 2), { mode: 0o600 })
|
||||
|
||||
// Auto triggers require explicit opt-in: multi-GiB snapshots written on
|
||||
// every threshold cross can fill the user's disk (issue #21767).
|
||||
const isAuto = trigger === 'auto-critical' || trigger === 'auto-high'
|
||||
const autoEnabled = /^(?:1|true|yes|on)$/i.test((process.env.HERMES_AUTO_HEAPDUMP ?? '').trim())
|
||||
|
||||
if (isAuto && !autoEnabled) {
|
||||
await pruneHeapdumps(dir).catch(() => undefined)
|
||||
|
||||
// Not an error: the dump did its job — it wrote the lightweight
|
||||
// diagnostics sidecar and intentionally skipped the heavy snapshot.
|
||||
// `heapPath` is omitted so callers/notices report diagnostics-only.
|
||||
return { diagPath, suppressed: true, success: true }
|
||||
}
|
||||
|
||||
await pipeline(getHeapSnapshot(), createWriteStream(heapPath, { mode: 0o600 }))
|
||||
await pruneHeapdumps(dir).catch(() => undefined)
|
||||
|
||||
return { diagPath, heapPath, success: true }
|
||||
} catch (e) {
|
||||
|
|
@ -162,6 +183,44 @@ export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promis
|
|||
}
|
||||
}
|
||||
|
||||
// Cap total bytes of files in `dir`, deleting oldest first. Covers both
|
||||
// `.heapsnapshot` and `.diagnostics.json` artifacts so orphan sidecars from
|
||||
// gated auto-triggers cannot accumulate without bound. The newest file is
|
||||
// always retained even if it alone exceeds the cap.
|
||||
async function pruneHeapdumps(dir: string): Promise<void> {
|
||||
const raw = process.env.HERMES_HEAPDUMP_MAX_BYTES?.trim()
|
||||
const parsed = raw ? Number(raw) : NaN
|
||||
const cap = Number.isFinite(parsed) && parsed > 0 ? parsed : 2 * 1024 ** 3
|
||||
|
||||
const names = await readdir(dir)
|
||||
|
||||
const stats = await Promise.all(
|
||||
names.map(async name => {
|
||||
const path = join(dir, name)
|
||||
const s = await stat(path).catch(() => null)
|
||||
|
||||
return s && s.isFile() ? { mtimeMs: s.mtimeMs, path, size: s.size } : null
|
||||
})
|
||||
)
|
||||
|
||||
const valid = stats.filter((s): s is { mtimeMs: number; path: string; size: number } => s !== null)
|
||||
|
||||
valid.sort((a, b) => b.mtimeMs - a.mtimeMs)
|
||||
|
||||
let total = valid.reduce((acc, s) => acc + s.size, 0)
|
||||
|
||||
while (total > cap && valid.length > 1) {
|
||||
const oldest = valid.pop()
|
||||
|
||||
if (!oldest) {
|
||||
break
|
||||
}
|
||||
|
||||
await unlink(oldest.path).catch(() => undefined)
|
||||
total -= oldest.size
|
||||
}
|
||||
}
|
||||
|
||||
export function formatBytes(bytes: number): string {
|
||||
if (!Number.isFinite(bytes) || bytes <= 0) {
|
||||
return '0B'
|
||||
|
|
|
|||
|
|
@ -111,6 +111,14 @@ export function startMemoryMonitor({
|
|||
let warned = false
|
||||
const WARN_GROWTH_STEP = 150 * MB
|
||||
|
||||
// Cooldown prevents repeated auto dumps when heap oscillates around the
|
||||
// threshold (issue #21767). `dumped` alone is not enough — it clears on
|
||||
// every transition back to `normal`.
|
||||
const cooldownRaw = process.env.HERMES_AUTO_HEAPDUMP_COOLDOWN_MS?.trim()
|
||||
const cooldownParsed = cooldownRaw ? Number(cooldownRaw) : NaN
|
||||
const cooldownMs = Number.isFinite(cooldownParsed) && cooldownParsed >= 0 ? cooldownParsed : 600_000
|
||||
let lastAutoDumpAt = 0
|
||||
|
||||
const tick = async () => {
|
||||
const { heapUsed, rss } = process.memoryUsage()
|
||||
|
||||
|
|
@ -137,7 +145,12 @@ export function startMemoryMonitor({
|
|||
return
|
||||
}
|
||||
|
||||
if (Date.now() - lastAutoDumpAt < cooldownMs) {
|
||||
return
|
||||
}
|
||||
|
||||
inFlight.add(level)
|
||||
lastAutoDumpAt = Date.now()
|
||||
|
||||
// Prune Ink content caches before dump/exit — half on 'high' (recoverable),
|
||||
// full on 'critical' (post-dump RSS reduction, keeps user running).
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue