mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-30 01:41:43 +00:00
fix(tui): harden against Node V8 OOM + GatewayClient memory leaks
Long TUI sessions were crashing Node via V8 fatal-OOM once transcripts +
reasoning blobs crossed the default 1.5–4GB heap cap. This adds defense
in depth: a bigger heap, leak-proofing the RPC hot path, bounded
diagnostic buffers, automatic heap dumps at high-water marks, and
graceful signal / uncaught handlers.
## Changes
### Heap budget
- hermes_cli/main.py: `_launch_tui` now injects `NODE_OPTIONS=
--max-old-space-size=8192 --expose-gc` (appended — does not clobber
user-supplied NODE_OPTIONS). Covers both `node dist/entry.js` and
`tsx src/entry.tsx` launch paths.
- ui-tui/src/entry.tsx: shebang rewritten to
`#!/usr/bin/env -S node --max-old-space-size=8192 --expose-gc` as a
fallback when the binary is invoked directly.
### GatewayClient (ui-tui/src/gatewayClient.ts)
- `setMaxListeners(0)` — silences spurious warnings from React hook
subscribers.
- `logs` and `bufferedEvents` replaced with fixed-capacity
CircularBuffer — O(1) push, no splice(0, …) copies under load.
- RPC timeout refactor: `setTimeout(this.onTimeout.bind(this), …, id)`
replaces the inline arrow closure that captured `method`/`params`/
`resolve`/`reject` for the full 120 s request timeout. Each Pending
record now stores its own timeout handle, `.unref()`'d so stuck
timers never keep the event loop alive, and `rejectPending()` clears
them (previously leaked the timer itself).
### Memory diagnostics (new)
- ui-tui/src/lib/memory.ts: `performHeapDump()` +
`captureMemoryDiagnostics()`. Writes heap snapshot + JSON diag
sidecar to `~/.hermes/heapdumps/` (override via
`HERMES_HEAPDUMP_DIR`). Diagnostics are written first so we still get
useful data if the snapshot crashes on very large heaps.
Captures: detached V8 contexts (closure-leak signal), active
handles/requests (`process._getActiveHandles/_getActiveRequests`),
Linux `/proc/self/fd` count + `/proc/self/smaps_rollup`, heap growth
rate (MB/hr), and auto-classifies likely leak sources.
- ui-tui/src/lib/memoryMonitor.ts: 10 s interval polling heapUsed. At
1.5 GB writes an auto heap dump (trigger=`auto-high`); at 2.5 GB
writes a final dump and exits with code 137 before V8's fatal OOM kills the process, so the user
can restart cleanly. Handle is `.unref()`'d so it never holds the
process open.
### Graceful exit (new)
- ui-tui/src/lib/gracefulExit.ts: SIGINT/SIGTERM/SIGHUP run registered
cleanups through a 4 s failsafe `setTimeout` that hard-exits if
cleanup hangs.
`uncaughtException` / `unhandledRejection` are logged to stderr
instead of crashing — a transient TUI render error should not kill
an in-flight agent turn.
### Slash commands (new)
- ui-tui/src/app/slash/commands/debug.ts:
- `/heapdump` — manual snapshot + diagnostics.
- `/mem` — live heap / rss / external / array-buffer / uptime panel.
- Registered in `ui-tui/src/app/slash/registry.ts`.
### Utility (new)
- ui-tui/src/lib/circularBuffer.ts: small fixed-capacity ring buffer
with `push` / `tail(n)` / `drain()` / `clear()`. Replaces the ad-hoc
`array.splice(0, len - MAX)` pattern.
## Validation
- `tsc --noEmit` clean
- `vitest run`: 15 files, 102 tests passing
- eslint clean on all touched/new files
- build produces executable `dist/entry.js` with preserved shebang
- smoke-tested: `HERMES_HEAPDUMP_DIR=… performHeapDump('manual')`
writes both a valid `.heapsnapshot` and a `.diagnostics.json`
containing detached-contexts, active-handles, smaps_rollup.
## Env knobs
- `HERMES_HEAPDUMP_DIR` — override snapshot output dir
- `HERMES_HEAPDUMP_ON_START=1` — dump once at boot
- existing `NODE_OPTIONS` is respected and appended, not replaced
This commit is contained in:
parent
36e8435d3e
commit
0785aec444
9 changed files with 569 additions and 40 deletions
|
|
@ -5,6 +5,7 @@ import { delimiter, resolve } from 'node:path'
|
|||
import { createInterface } from 'node:readline'
|
||||
|
||||
import type { GatewayEvent } from './gatewayTypes.js'
|
||||
import { CircularBuffer } from './lib/circularBuffer.js'
|
||||
|
||||
const MAX_GATEWAY_LOG_LINES = 200
|
||||
const MAX_LOG_LINE_BYTES = 4096
|
||||
|
|
@ -43,16 +44,19 @@ const asGatewayEvent = (value: unknown): GatewayEvent | null =>
|
|||
: null
|
||||
|
||||
interface Pending {
|
||||
id: string
|
||||
method: string
|
||||
reject: (e: Error) => void
|
||||
resolve: (v: unknown) => void
|
||||
timeout: ReturnType<typeof setTimeout>
|
||||
}
|
||||
|
||||
export class GatewayClient extends EventEmitter {
|
||||
private proc: ChildProcess | null = null
|
||||
private reqId = 0
|
||||
private logs: string[] = []
|
||||
private logs = new CircularBuffer<string>(MAX_GATEWAY_LOG_LINES)
|
||||
private pending = new Map<string, Pending>()
|
||||
private bufferedEvents: GatewayEvent[] = []
|
||||
private bufferedEvents = new CircularBuffer<GatewayEvent>(MAX_BUFFERED_EVENTS)
|
||||
private pendingExit: number | null | undefined
|
||||
private ready = false
|
||||
private readyTimer: ReturnType<typeof setTimeout> | null = null
|
||||
|
|
@ -60,6 +64,13 @@ export class GatewayClient extends EventEmitter {
|
|||
private stdoutRl: ReturnType<typeof createInterface> | null = null
|
||||
private stderrRl: ReturnType<typeof createInterface> | null = null
|
||||
|
||||
constructor() {
|
||||
super()
|
||||
// useInput / createGatewayEventHandler can legitimately attach many
|
||||
// listeners. Default 10-cap triggers spurious warnings.
|
||||
this.setMaxListeners(0)
|
||||
}
|
||||
|
||||
private publish(ev: GatewayEvent) {
|
||||
if (ev.type === 'gateway.ready') {
|
||||
this.ready = true
|
||||
|
|
@ -74,9 +85,7 @@ export class GatewayClient extends EventEmitter {
|
|||
return void this.emit('event', ev)
|
||||
}
|
||||
|
||||
if (this.bufferedEvents.push(ev) > MAX_BUFFERED_EVENTS) {
|
||||
this.bufferedEvents.splice(0, this.bufferedEvents.length - MAX_BUFFERED_EVENTS)
|
||||
}
|
||||
this.bufferedEvents.push(ev)
|
||||
}
|
||||
|
||||
start() {
|
||||
|
|
@ -88,7 +97,7 @@ export class GatewayClient extends EventEmitter {
|
|||
env.PYTHONPATH = pyPath ? `${root}${delimiter}${pyPath}` : root
|
||||
|
||||
this.ready = false
|
||||
this.bufferedEvents = []
|
||||
this.bufferedEvents.clear()
|
||||
this.pendingExit = undefined
|
||||
this.stdoutRl?.close()
|
||||
this.stderrRl?.close()
|
||||
|
|
@ -165,15 +174,7 @@ export class GatewayClient extends EventEmitter {
|
|||
const p = id ? this.pending.get(id) : undefined
|
||||
|
||||
if (p) {
|
||||
this.pending.delete(id!)
|
||||
|
||||
if (msg.error) {
|
||||
const err = msg.error as { message?: unknown } | null | undefined
|
||||
|
||||
p.reject(new Error(typeof err?.message === 'string' ? err.message : 'request failed'))
|
||||
} else {
|
||||
p.resolve(msg.result)
|
||||
}
|
||||
this.settle(p, msg.error ? this.toError(msg.error) : null, msg.result)
|
||||
|
||||
return
|
||||
}
|
||||
|
|
@ -187,24 +188,49 @@ export class GatewayClient extends EventEmitter {
|
|||
}
|
||||
}
|
||||
|
||||
private pushLog(line: string) {
|
||||
if (this.logs.push(truncateLine(line)) > MAX_GATEWAY_LOG_LINES) {
|
||||
this.logs.splice(0, this.logs.length - MAX_GATEWAY_LOG_LINES)
|
||||
private toError(raw: unknown): Error {
|
||||
const err = raw as { message?: unknown } | null | undefined
|
||||
|
||||
return new Error(typeof err?.message === 'string' ? err.message : 'request failed')
|
||||
}
|
||||
|
||||
private settle(p: Pending, err: Error | null, result: unknown) {
|
||||
clearTimeout(p.timeout)
|
||||
this.pending.delete(p.id)
|
||||
|
||||
if (err) {
|
||||
p.reject(err)
|
||||
} else {
|
||||
p.resolve(result)
|
||||
}
|
||||
}
|
||||
|
||||
private pushLog(line: string) {
|
||||
this.logs.push(truncateLine(line))
|
||||
}
|
||||
|
||||
private rejectPending(err: Error) {
|
||||
for (const p of this.pending.values()) {
|
||||
clearTimeout(p.timeout)
|
||||
p.reject(err)
|
||||
}
|
||||
|
||||
this.pending.clear()
|
||||
}
|
||||
|
||||
private onTimeout(id: string) {
|
||||
const p = this.pending.get(id)
|
||||
|
||||
if (p) {
|
||||
this.pending.delete(id)
|
||||
p.reject(new Error(`timeout: ${p.method}`))
|
||||
}
|
||||
}
|
||||
|
||||
drain() {
|
||||
this.subscribed = true
|
||||
|
||||
for (const ev of this.bufferedEvents.splice(0)) {
|
||||
for (const ev of this.bufferedEvents.drain()) {
|
||||
this.emit('event', ev)
|
||||
}
|
||||
|
||||
|
|
@ -217,7 +243,7 @@ export class GatewayClient extends EventEmitter {
|
|||
}
|
||||
|
||||
getLogTail(limit = 20): string {
|
||||
return this.logs.slice(-Math.max(1, limit)).join('\n')
|
||||
return this.logs.tail(Math.max(1, limit)).join('\n')
|
||||
}
|
||||
|
||||
request<T = unknown>(method: string, params: Record<string, unknown> = {}): Promise<T> {
|
||||
|
|
@ -231,29 +257,29 @@ export class GatewayClient extends EventEmitter {
|
|||
|
||||
const id = `r${++this.reqId}`
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
const timeout = setTimeout(() => {
|
||||
if (this.pending.delete(id)) {
|
||||
reject(new Error(`timeout: ${method}`))
|
||||
}
|
||||
}, REQUEST_TIMEOUT_MS)
|
||||
return new Promise<T>((resolve, reject) => {
|
||||
const timeout = setTimeout(this.onTimeout.bind(this), REQUEST_TIMEOUT_MS, id)
|
||||
|
||||
timeout.unref?.()
|
||||
|
||||
this.pending.set(id, {
|
||||
reject: e => {
|
||||
clearTimeout(timeout)
|
||||
reject(e)
|
||||
},
|
||||
resolve: v => {
|
||||
clearTimeout(timeout)
|
||||
resolve(v as T)
|
||||
}
|
||||
id,
|
||||
method,
|
||||
reject,
|
||||
resolve: v => resolve(v as T),
|
||||
timeout
|
||||
})
|
||||
|
||||
try {
|
||||
this.proc!.stdin!.write(JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n')
|
||||
this.proc!.stdin!.write(JSON.stringify({ id, jsonrpc: '2.0', method, params }) + '\n')
|
||||
} catch (e) {
|
||||
clearTimeout(timeout)
|
||||
this.pending.delete(id)
|
||||
const pending = this.pending.get(id)
|
||||
|
||||
if (pending) {
|
||||
clearTimeout(pending.timeout)
|
||||
this.pending.delete(id)
|
||||
}
|
||||
|
||||
reject(e instanceof Error ? e : new Error(String(e)))
|
||||
}
|
||||
})
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue