mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-02 02:01:47 +00:00
fix(tui): harden against Node V8 OOM + GatewayClient memory leaks
Long TUI sessions were crashing Node via V8 fatal-OOM once transcripts +
reasoning blobs crossed the default 1.5–4GB heap cap. This adds defense
in depth: a bigger heap, leak-proofing the RPC hot path, bounded
diagnostic buffers, automatic heap dumps at high-water marks, and
graceful signal / uncaught handlers.
## Changes
### Heap budget
- hermes_cli/main.py: `_launch_tui` now injects `NODE_OPTIONS=
--max-old-space-size=8192 --expose-gc` (appended — does not clobber
user-supplied NODE_OPTIONS). Covers both `node dist/entry.js` and
`tsx src/entry.tsx` launch paths.
- ui-tui/src/entry.tsx: shebang rewritten to
`#!/usr/bin/env -S node --max-old-space-size=8192 --expose-gc` as a
fallback when the binary is invoked directly.
### GatewayClient (ui-tui/src/gatewayClient.ts)
- `setMaxListeners(0)` — silences spurious warnings from React hook
subscribers.
- `logs` and `bufferedEvents` replaced with fixed-capacity
CircularBuffer — O(1) push, no splice(0, …) copies under load.
- RPC timeout refactor: `setTimeout(this.onTimeout.bind(this), …, id)`
replaces the inline arrow closure that captured `method`/`params`/
`resolve`/`reject` for the full 120 s request timeout. Each Pending
record now stores its own timeout handle, `.unref()`'d so stuck
timers never keep the event loop alive, and `rejectPending()` clears
them (previously leaked the timer itself).
### Memory diagnostics (new)
- ui-tui/src/lib/memory.ts: `performHeapDump()` +
`captureMemoryDiagnostics()`. Writes heap snapshot + JSON diag
sidecar to `~/.hermes/heapdumps/` (override via
`HERMES_HEAPDUMP_DIR`). Diagnostics are written first so we still get
useful data if the snapshot crashes on very large heaps.
Captures: detached V8 contexts (closure-leak signal), active
handles/requests (`process._getActiveHandles/_getActiveRequests`),
Linux `/proc/self/fd` count + `/proc/self/smaps_rollup`, heap growth
rate (MB/hr), and auto-classifies likely leak sources.
- ui-tui/src/lib/memoryMonitor.ts: 10 s interval polling heapUsed. At
1.5 GB writes an auto heap dump (trigger=`auto-high`); at 2.5 GB
writes a final dump and exits with code 137 before V8 hits a fatal OOM so the user
can restart cleanly. Handle is `.unref()`'d so it never holds the
process open.
### Graceful exit (new)
- ui-tui/src/lib/gracefulExit.ts: SIGINT/SIGTERM/SIGHUP run registered
cleanups through a 4 s failsafe `setTimeout` that hard-exits if
cleanup hangs.
`uncaughtException` / `unhandledRejection` are logged to stderr
instead of crashing — a transient TUI render error should not kill
an in-flight agent turn.
### Slash commands (new)
- ui-tui/src/app/slash/commands/debug.ts:
- `/heapdump` — manual snapshot + diagnostics.
- `/mem` — live heap / rss / external / array-buffer / uptime panel.
- Registered in `ui-tui/src/app/slash/registry.ts`.
### Utility (new)
- ui-tui/src/lib/circularBuffer.ts: small fixed-capacity ring buffer
with `push` / `tail(n)` / `drain()` / `clear()`. Replaces the ad-hoc
`array.splice(0, len - MAX)` pattern.
## Validation
- tsc `--noEmit` clean
- `vitest run`: 15 files, 102 tests passing
- eslint clean on all touched/new files
- build produces executable `dist/entry.js` with preserved shebang
- smoke-tested: `HERMES_HEAPDUMP_DIR=… performHeapDump('manual')`
writes both a valid `.heapsnapshot` and a `.diagnostics.json`
containing detached-contexts, active-handles, smaps_rollup.
## Env knobs
- `HERMES_HEAPDUMP_DIR` — override snapshot output dir
- `HERMES_HEAPDUMP_ON_START=1` — dump once at boot
- existing `NODE_OPTIONS` is respected and appended, not replaced
This commit is contained in:
parent
36e8435d3e
commit
0785aec444
9 changed files with 569 additions and 40 deletions
58
ui-tui/src/lib/circularBuffer.ts
Normal file
58
ui-tui/src/lib/circularBuffer.ts
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
export class CircularBuffer<T> {
|
||||
private buf: T[]
|
||||
private head = 0
|
||||
private len = 0
|
||||
|
||||
constructor(private capacity: number) {
|
||||
this.buf = new Array<T>(capacity)
|
||||
}
|
||||
|
||||
push(item: T) {
|
||||
this.buf[this.head] = item
|
||||
this.head = (this.head + 1) % this.capacity
|
||||
|
||||
if (this.len < this.capacity) {
|
||||
this.len++
|
||||
}
|
||||
}
|
||||
|
||||
pushAll(items: readonly T[]) {
|
||||
for (const item of items) {
|
||||
this.push(item)
|
||||
}
|
||||
}
|
||||
|
||||
tail(n = this.len): T[] {
|
||||
const take = Math.min(Math.max(0, n), this.len)
|
||||
const start = this.len < this.capacity ? 0 : this.head
|
||||
const out: T[] = new Array<T>(take)
|
||||
|
||||
for (let i = 0; i < take; i++) {
|
||||
out[i] = this.buf[(start + this.len - take + i) % this.capacity]!
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
toArray(): T[] {
|
||||
return this.tail(this.len)
|
||||
}
|
||||
|
||||
drain(): T[] {
|
||||
const out = this.toArray()
|
||||
|
||||
this.clear()
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.buf = new Array<T>(this.capacity)
|
||||
this.head = 0
|
||||
this.len = 0
|
||||
}
|
||||
|
||||
get size() {
|
||||
return this.len
|
||||
}
|
||||
}
|
||||
63
ui-tui/src/lib/gracefulExit.ts
Normal file
63
ui-tui/src/lib/gracefulExit.ts
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
// A shutdown hook; may be synchronous or return a promise.
type Cleanup = () => Promise<void> | void

interface SetupOptions {
  // Hooks executed (best-effort, all settled) before the process exits.
  cleanups?: Cleanup[]
  // How long to wait for cleanups before hard-exiting anyway.
  failsafeMs?: number
  // Invoked instead of crashing when an uncaught error/rejection surfaces.
  onError?: (scope: 'uncaughtException' | 'unhandledRejection', err: unknown) => void
  // Observer called with the signal that initiated shutdown.
  onSignal?: (signal: NodeJS.Signals) => void
}

// Hard-exit deadline for hung cleanups, in milliseconds.
const DEFAULT_FAILSAFE_MS = 4000

// Module-level latch: setupGracefulExit wires process handlers at most once.
let wired = false
|
||||
|
||||
export function setupGracefulExit({
|
||||
cleanups = [],
|
||||
failsafeMs = DEFAULT_FAILSAFE_MS,
|
||||
onError,
|
||||
onSignal
|
||||
}: SetupOptions = {}) {
|
||||
if (wired) {
|
||||
return
|
||||
}
|
||||
|
||||
wired = true
|
||||
|
||||
let shuttingDown = false
|
||||
|
||||
const exit = (code: number, signal?: NodeJS.Signals) => {
|
||||
if (shuttingDown) {
|
||||
return
|
||||
}
|
||||
|
||||
shuttingDown = true
|
||||
|
||||
if (signal) {
|
||||
onSignal?.(signal)
|
||||
}
|
||||
|
||||
const failsafe = setTimeout(() => process.exit(code), failsafeMs)
|
||||
|
||||
failsafe.unref?.()
|
||||
|
||||
void Promise.allSettled(cleanups.map(fn => Promise.resolve().then(fn)))
|
||||
.catch(() => {})
|
||||
.finally(() => process.exit(code))
|
||||
}
|
||||
|
||||
for (const sig of ['SIGINT', 'SIGTERM', 'SIGHUP'] as const) {
|
||||
process.on(sig, () => exit(sig === 'SIGINT' ? 130 : sig === 'SIGTERM' ? 143 : 129, sig))
|
||||
}
|
||||
|
||||
process.on('uncaughtException', err => {
|
||||
onError?.('uncaughtException', err)
|
||||
})
|
||||
|
||||
process.on('unhandledRejection', reason => {
|
||||
onError?.('unhandledRejection', reason)
|
||||
})
|
||||
}
|
||||
|
||||
/** Immediately terminate the process with `code`, bypassing registered cleanups. */
export function forceExit(code = 0) {
  process.exit(code)
}
|
||||
208
ui-tui/src/lib/memory.ts
Normal file
208
ui-tui/src/lib/memory.ts
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
import { createWriteStream } from 'node:fs'
|
||||
import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises'
|
||||
import { homedir, tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { pipeline } from 'node:stream/promises'
|
||||
import { getHeapSnapshot, getHeapSpaceStatistics, getHeapStatistics } from 'node:v8'
|
||||
|
||||
// What initiated a dump: automatic threshold crossings, or an explicit request.
export type MemoryTrigger = 'auto-high' | 'auto-critical' | 'manual'

// Shape of the JSON sidecar written alongside each heap snapshot.
export interface MemoryDiagnostics {
  activeHandles: number
  activeRequests: number
  analysis: {
    // Human-readable leak indicators derived from the heuristics below.
    potentialLeaks: string[]
    recommendation: string
  }
  // Average since process start (rss / uptime) — not an instantaneous rate.
  memoryGrowthRate: {
    bytesPerSecond: number
    mbPerHour: number
  }
  memoryUsage: {
    arrayBuffers: number
    external: number
    heapTotal: number
    heapUsed: number
    rss: number
  }
  nodeVersion: string
  // Linux only (counted from /proc/self/fd); absent elsewhere.
  openFileDescriptors?: number
  platform: string
  resourceUsage: {
    // Bytes — the producer multiplies ru_maxrss by 1024.
    // NOTE(review): ru_maxrss is KiB on Linux but bytes on macOS; confirm.
    maxRSS: number
    systemCPUTime: number
    userCPUTime: number
  }
  // Raw /proc/self/smaps_rollup contents; Linux only.
  smapsRollup?: string
  timestamp: string
  trigger: MemoryTrigger
  uptimeSeconds: number
  // Per-space V8 stats; absent on runtimes without getHeapSpaceStatistics.
  v8HeapSpaces?: { available: number; name: string; size: number; used: number }[]
  v8HeapStats: {
    // > 0 is a strong closure/component-leak signal.
    detachedContexts: number
    heapSizeLimit: number
    mallocedMemory: number
    nativeContexts: number
    peakMallocedMemory: number
  }
}

// Outcome of performHeapDump; never throws, so errors surface here.
export interface HeapDumpResult {
  diagPath?: string
  error?: string
  heapPath?: string
  success: boolean
}

// Output directory: HERMES_HEAPDUMP_DIR overrides ~/.hermes/heapdumps
// (tmpdir-based fallback if homedir() is empty).
const heapDumpRoot = () =>
  process.env.HERMES_HEAPDUMP_DIR?.trim() || join(homedir() || tmpdir(), '.hermes', 'heapdumps')

// Undocumented Node internals, used best-effort for handle/request counts.
const processInternals = process as unknown as {
  _getActiveHandles: () => unknown[]
  _getActiveRequests: () => unknown[]
}
|
||||
|
||||
export async function captureMemoryDiagnostics(trigger: MemoryTrigger): Promise<MemoryDiagnostics> {
|
||||
const usage = process.memoryUsage()
|
||||
const heapStats = getHeapStatistics()
|
||||
const resourceUsage = process.resourceUsage()
|
||||
const uptimeSeconds = process.uptime()
|
||||
|
||||
let heapSpaces: ReturnType<typeof getHeapSpaceStatistics> | undefined
|
||||
|
||||
try {
|
||||
heapSpaces = getHeapSpaceStatistics()
|
||||
} catch {
|
||||
/* Bun / older Node — ignore */
|
||||
}
|
||||
|
||||
const activeHandles = processInternals._getActiveHandles().length
|
||||
const activeRequests = processInternals._getActiveRequests().length
|
||||
|
||||
let openFileDescriptors: number | undefined
|
||||
|
||||
try {
|
||||
openFileDescriptors = (await readdir('/proc/self/fd')).length
|
||||
} catch {
|
||||
/* non-Linux */
|
||||
}
|
||||
|
||||
let smapsRollup: string | undefined
|
||||
|
||||
try {
|
||||
smapsRollup = await readFile('/proc/self/smaps_rollup', 'utf8')
|
||||
} catch {
|
||||
/* non-Linux / no access */
|
||||
}
|
||||
|
||||
const nativeMemory = usage.rss - usage.heapUsed
|
||||
const bytesPerSecond = uptimeSeconds > 0 ? usage.rss / uptimeSeconds : 0
|
||||
const mbPerHour = (bytesPerSecond * 3600) / (1024 * 1024)
|
||||
|
||||
const potentialLeaks: string[] = []
|
||||
|
||||
if (heapStats.number_of_detached_contexts > 0) {
|
||||
potentialLeaks.push(
|
||||
`${heapStats.number_of_detached_contexts} detached context(s) — possible component/closure leak`
|
||||
)
|
||||
}
|
||||
|
||||
if (activeHandles > 100) {
|
||||
potentialLeaks.push(`${activeHandles} active handles — possible timer/socket leak`)
|
||||
}
|
||||
|
||||
if (nativeMemory > usage.heapUsed) {
|
||||
potentialLeaks.push('Native memory > heap — leak may be in native addons')
|
||||
}
|
||||
|
||||
if (mbPerHour > 100) {
|
||||
potentialLeaks.push(`High memory growth rate: ${mbPerHour.toFixed(1)} MB/hour`)
|
||||
}
|
||||
|
||||
if (openFileDescriptors && openFileDescriptors > 500) {
|
||||
potentialLeaks.push(`${openFileDescriptors} open FDs — possible file/socket leak`)
|
||||
}
|
||||
|
||||
return {
|
||||
activeHandles,
|
||||
activeRequests,
|
||||
analysis: {
|
||||
potentialLeaks,
|
||||
recommendation: potentialLeaks.length
|
||||
? `WARNING: ${potentialLeaks.length} potential leak indicator(s). See potentialLeaks.`
|
||||
: 'No obvious leak indicators. Inspect heap snapshot for retained objects.'
|
||||
},
|
||||
memoryGrowthRate: { bytesPerSecond, mbPerHour },
|
||||
memoryUsage: {
|
||||
arrayBuffers: usage.arrayBuffers,
|
||||
external: usage.external,
|
||||
heapTotal: usage.heapTotal,
|
||||
heapUsed: usage.heapUsed,
|
||||
rss: usage.rss
|
||||
},
|
||||
nodeVersion: process.version,
|
||||
openFileDescriptors,
|
||||
platform: process.platform,
|
||||
resourceUsage: {
|
||||
maxRSS: resourceUsage.maxRSS * 1024,
|
||||
systemCPUTime: resourceUsage.systemCPUTime,
|
||||
userCPUTime: resourceUsage.userCPUTime
|
||||
},
|
||||
smapsRollup,
|
||||
timestamp: new Date().toISOString(),
|
||||
trigger,
|
||||
uptimeSeconds,
|
||||
v8HeapSpaces: heapSpaces?.map(s => ({
|
||||
available: s.space_available_size,
|
||||
name: s.space_name,
|
||||
size: s.space_size,
|
||||
used: s.space_used_size
|
||||
})),
|
||||
v8HeapStats: {
|
||||
detachedContexts: heapStats.number_of_detached_contexts,
|
||||
heapSizeLimit: heapStats.heap_size_limit,
|
||||
mallocedMemory: heapStats.malloced_memory,
|
||||
nativeContexts: heapStats.number_of_native_contexts,
|
||||
peakMallocedMemory: heapStats.peak_malloced_memory
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promise<HeapDumpResult> {
|
||||
try {
|
||||
const diagnostics = await captureMemoryDiagnostics(trigger)
|
||||
const dir = heapDumpRoot()
|
||||
|
||||
await mkdir(dir, { recursive: true })
|
||||
|
||||
const stamp = new Date().toISOString().replace(/[:.]/g, '-')
|
||||
const base = `hermes-${stamp}-${process.pid}-${trigger}`
|
||||
const heapPath = join(dir, `${base}.heapsnapshot`)
|
||||
const diagPath = join(dir, `${base}.diagnostics.json`)
|
||||
|
||||
await writeFile(diagPath, JSON.stringify(diagnostics, null, 2), { mode: 0o600 })
|
||||
await writeSnapshot(heapPath)
|
||||
|
||||
return { diagPath, heapPath, success: true }
|
||||
} catch (e) {
|
||||
return { error: e instanceof Error ? e.message : String(e), success: false }
|
||||
}
|
||||
}
|
||||
|
||||
export function formatBytes(bytes: number): string {
|
||||
if (!Number.isFinite(bytes) || bytes <= 0) {
|
||||
return '0B'
|
||||
}
|
||||
|
||||
const units = ['B', 'KB', 'MB', 'GB', 'TB']
|
||||
const exp = Math.min(units.length - 1, Math.floor(Math.log10(bytes) / 3))
|
||||
const value = bytes / 1024 ** exp
|
||||
|
||||
return `${value >= 100 ? value.toFixed(0) : value.toFixed(1)}${units[exp]}`
|
||||
}
|
||||
|
||||
// Stream a V8 heap snapshot to `filepath`; 0o600 keeps the dump
// owner-readable only (snapshots contain raw in-memory data).
async function writeSnapshot(filepath: string) {
  const stream = createWriteStream(filepath, { mode: 0o600 })

  await pipeline(getHeapSnapshot(), stream)
}
|
||||
75
ui-tui/src/lib/memoryMonitor.ts
Normal file
75
ui-tui/src/lib/memoryMonitor.ts
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
import { type HeapDumpResult, performHeapDump } from './memory.js'
|
||||
|
||||
// Severity of current heap usage relative to the configured thresholds.
export type MemoryLevel = 'critical' | 'high' | 'normal'

// Point-in-time reading delivered to the onSnapshot callback on every poll.
export interface MemorySnapshot {
  heapUsed: number
  level: MemoryLevel
  rss: number
}

export interface MemoryMonitorOptions {
  // heapUsed at/above this fires the one-shot critical dump (default 2.5 GB).
  criticalBytes?: number
  // heapUsed at/above this fires the one-shot high dump (default 1.5 GB).
  highBytes?: number
  // Polling period in milliseconds (default 10 s).
  intervalMs?: number
  // Fired once per critical excursion, after the dump attempt (dump is null if it failed).
  onCritical?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
  // Fired once per high excursion, after the dump attempt (dump is null if it failed).
  onHigh?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
  // Fired on every poll regardless of level.
  onSnapshot?: (snap: MemorySnapshot) => void
}

const GB = 1024 ** 3

// Fallbacks applied when the corresponding option is omitted.
const DEFAULTS = {
  criticalBytes: 2.5 * GB,
  highBytes: 1.5 * GB,
  intervalMs: 10_000
}
|
||||
|
||||
export function startMemoryMonitor({
|
||||
criticalBytes = DEFAULTS.criticalBytes,
|
||||
highBytes = DEFAULTS.highBytes,
|
||||
intervalMs = DEFAULTS.intervalMs,
|
||||
onCritical,
|
||||
onHigh,
|
||||
onSnapshot
|
||||
}: MemoryMonitorOptions = {}): () => void {
|
||||
let dumpedHigh = false
|
||||
let dumpedCritical = false
|
||||
|
||||
const tick = async () => {
|
||||
const { heapUsed, rss } = process.memoryUsage()
|
||||
const level: MemoryLevel = heapUsed >= criticalBytes ? 'critical' : heapUsed >= highBytes ? 'high' : 'normal'
|
||||
const snap: MemorySnapshot = { heapUsed, level, rss }
|
||||
|
||||
onSnapshot?.(snap)
|
||||
|
||||
if (level === 'normal') {
|
||||
dumpedHigh = false
|
||||
dumpedCritical = false
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
if (level === 'high' && !dumpedHigh) {
|
||||
dumpedHigh = true
|
||||
const dump = await performHeapDump('auto-high').catch(() => null)
|
||||
|
||||
onHigh?.(snap, dump)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
if (level === 'critical' && !dumpedCritical) {
|
||||
dumpedCritical = true
|
||||
const dump = await performHeapDump('auto-critical').catch(() => null)
|
||||
|
||||
onCritical?.(snap, dump)
|
||||
}
|
||||
}
|
||||
|
||||
const handle = setInterval(() => void tick(), intervalMs)
|
||||
|
||||
handle.unref?.()
|
||||
|
||||
return () => clearInterval(handle)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue