diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 714ad82bf2..fc29b848a6 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -1003,6 +1003,25 @@ def _launch_tui(resume_session_id: Optional[str] = None, tui_dev: bool = False):
     )
     env.setdefault("HERMES_PYTHON", sys.executable)
     env.setdefault("HERMES_CWD", os.getcwd())
+    # Default the TUI to an 8GB V8 heap with exposed GC. Node's stock cap is
+    # ~1.5–4GB depending on version and can fatal-OOM on long sessions with
+    # large transcripts / reasoning blobs. Each flag is appended only when the
+    # user's NODE_OPTIONS does not already set it: a blind append would place
+    # our heap size after the user's, and the later flag is expected to win,
+    # silently overriding the user's choice instead of preserving it.
+    # NOTE(review): older Node releases reject --expose-gc in NODE_OPTIONS;
+    # confirm against the minimum supported Node version.
+    _existing_node_opts = env.get("NODE_OPTIONS", "").strip()
+    _existing_node_flags = {
+        _opt.split("=", 1)[0].replace("_", "-")
+        for _opt in _existing_node_opts.split()
+    }
+    _missing_node_opts = [
+        _opt
+        for _opt in ("--max-old-space-size=8192", "--expose-gc")
+        if _opt.split("=", 1)[0] not in _existing_node_flags
+    ]
+    env["NODE_OPTIONS"] = " ".join([_existing_node_opts, *_missing_node_opts]).strip()
 
     if resume_session_id:
         env["HERMES_TUI_RESUME"] = resume_session_id
diff --git a/ui-tui/src/app/slash/commands/debug.ts b/ui-tui/src/app/slash/commands/debug.ts
new file mode 100644
index 0000000000..d44c76f34b
--- /dev/null
+++ b/ui-tui/src/app/slash/commands/debug.ts
@@ -0,0 +1,49 @@
+import { formatBytes, performHeapDump } from '../../../lib/memory.js'
+import type { SlashCommand } from '../types.js'
+
+// Memory-debugging commands: `heapdump` writes a snapshot, `mem` prints live numbers.
+export const debugCommands: SlashCommand[] = [
+  {
+    help: 'write a V8 heap snapshot + memory diagnostics to ~/.hermes/heapdumps',
+    name: 'heapdump',
+    run: (_arg, ctx) => {
+      const { heapUsed, rss } = process.memoryUsage()
+
+      ctx.transcript.sys(`writing heap dump (heap ${formatBytes(heapUsed)} · rss ${formatBytes(rss)})…`)
+
+      void performHeapDump('manual').then(r => {
+        if (ctx.stale()) {
+          return
+        }
+
+        if (!r.success) {
+          return ctx.transcript.sys(`heapdump failed: ${r.error ?? 'unknown error'}`)
+        }
+
+        ctx.transcript.sys(`heapdump: ${r.heapPath}`)
+        ctx.transcript.sys(`diagnostics: ${r.diagPath}`)
+      })
+    }
+  },
+
+  {
+    help: 'print live V8 heap + rss numbers',
+    name: 'mem',
+    run: (_arg, ctx) => {
+      const { arrayBuffers, external, heapTotal, heapUsed, rss } = process.memoryUsage()
+
+      ctx.transcript.panel('Memory', [
+        {
+          rows: [
+            ['heap used', formatBytes(heapUsed)],
+            ['heap total', formatBytes(heapTotal)],
+            ['external', formatBytes(external)],
+            ['array buffers', formatBytes(arrayBuffers)],
+            ['rss', formatBytes(rss)],
+            ['uptime', `${process.uptime().toFixed(0)}s`]
+          ]
+        }
+      ])
+    }
+  }
+]
diff --git a/ui-tui/src/app/slash/registry.ts b/ui-tui/src/app/slash/registry.ts
index ae7d7d50be..353b0a83d1 100644
--- a/ui-tui/src/app/slash/registry.ts
+++ b/ui-tui/src/app/slash/registry.ts
@@ -1,10 +1,17 @@
 import { coreCommands } from './commands/core.js'
+import { debugCommands } from './commands/debug.js'
 import { opsCommands } from './commands/ops.js'
 import { sessionCommands } from './commands/session.js'
 import { setupCommands } from './commands/setup.js'
 import type { SlashCommand } from './types.js'
 
-export const SLASH_COMMANDS: SlashCommand[] = [...coreCommands, ...sessionCommands, ...opsCommands, ...setupCommands]
+export const SLASH_COMMANDS: SlashCommand[] = [
+  ...coreCommands,
+  ...sessionCommands,
+  ...opsCommands,
+  ...setupCommands,
+  ...debugCommands
+]
 
 const byName = new Map(
   SLASH_COMMANDS.flatMap(cmd => [cmd.name, ...(cmd.aliases ?? [])].map(name => [name, cmd] as const))
diff --git a/ui-tui/src/entry.tsx b/ui-tui/src/entry.tsx
index e0a4379342..a9571e1353 100644
--- a/ui-tui/src/entry.tsx
+++ b/ui-tui/src/entry.tsx
@@ -1,7 +1,9 @@
-#!/usr/bin/env node
-// Order matters: paint banner + spawn python before loading @hermes/ink.
+#!/usr/bin/env -S node --max-old-space-size=8192 --expose-gc
 import { bootBanner } from './bootBanner.js'
 import { GatewayClient } from './gatewayClient.js'
+import { setupGracefulExit } from './lib/gracefulExit.js'
+import { formatBytes, performHeapDump } from './lib/memory.js'
+import { startMemoryMonitor } from './lib/memoryMonitor.js'
 
 if (!process.stdin.isTTY) {
   console.log('hermes-tui: no TTY')
@@ -11,8 +13,39 @@ if (!process.stdin.isTTY) {
 process.stdout.write(bootBanner())
 
 const gw = new GatewayClient()
+
 gw.start()
 
+setupGracefulExit({
+  cleanups: [() => gw.kill()],
+  onError: (scope, err) => {
+    const message = err instanceof Error ? `${err.name}: ${err.message}` : String(err)
+
+    process.stderr.write(`hermes-tui ${scope}: ${message.slice(0, 2000)}\n`)
+  },
+  onSignal: signal => process.stderr.write(`hermes-tui: received ${signal}\n`)
+})
+
+const stopMemoryMonitor = startMemoryMonitor({
+  onCritical: (snap, dump) => {
+    process.stderr.write(
+      `hermes-tui: critical memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? '(failed)'}\n`
+    )
+    process.stderr.write('hermes-tui: exiting to avoid OOM; restart to recover\n')
+    process.exit(137)
+  },
+  onHigh: (snap, dump) =>
+    process.stderr.write(
+      `hermes-tui: high memory (${formatBytes(snap.heapUsed)}) — auto heap dump → ${dump?.heapPath ?? '(failed)'}\n`
+    )
+})
+
+if (process.env.HERMES_HEAPDUMP_ON_START === '1') {
+  void performHeapDump('manual')
+}
+
+process.on('beforeExit', () => stopMemoryMonitor())
+
 const [{ render }, { App }] = await Promise.all([import('@hermes/ink'), import('./app.js')])
 
 render(<App gw={gw} />, { exitOnCtrlC: false })
diff --git a/ui-tui/src/gatewayClient.ts b/ui-tui/src/gatewayClient.ts
index a238c7638d..bf5210faa8 100644
--- a/ui-tui/src/gatewayClient.ts
+++ b/ui-tui/src/gatewayClient.ts
@@ -5,6 +5,7 @@ import { delimiter, resolve } from 'node:path'
 import { createInterface } from 'node:readline'
 
 import type { GatewayEvent } from './gatewayTypes.js'
+import { CircularBuffer } from './lib/circularBuffer.js'
 
 const MAX_GATEWAY_LOG_LINES = 200
 const MAX_LOG_LINE_BYTES = 4096
@@ -43,16 +44,19 @@ const asGatewayEvent = (value: unknown): GatewayEvent | null =>
     : null
 
 interface Pending {
+  id: string
+  method: string
   reject: (e: Error) => void
   resolve: (v: unknown) => void
+  timeout: ReturnType<typeof setTimeout>
 }
 
 export class GatewayClient extends EventEmitter {
   private proc: ChildProcess | null = null
   private reqId = 0
-  private logs: string[] = []
+  private logs = new CircularBuffer<string>(MAX_GATEWAY_LOG_LINES)
   private pending = new Map<string, Pending>()
-  private bufferedEvents: GatewayEvent[] = []
+  private bufferedEvents = new CircularBuffer<GatewayEvent>(MAX_BUFFERED_EVENTS)
   private pendingExit: number | null | undefined
   private ready = false
   private readyTimer: ReturnType<typeof setTimeout> | null = null
@@ -60,6 +64,13 @@ export class GatewayClient extends EventEmitter {
   private stdoutRl: ReturnType<typeof createInterface> | null = null
   private stderrRl: ReturnType<typeof createInterface> | null = null
 
+  constructor() {
+    super()
+    // useInput / createGatewayEventHandler can legitimately attach many
+    // listeners. Default 10-cap triggers spurious warnings.
+    this.setMaxListeners(0)
+  }
+
   private publish(ev: GatewayEvent) {
     if (ev.type === 'gateway.ready') {
       this.ready = true
@@ -74,9 +85,7 @@ export class GatewayClient extends EventEmitter {
       return void this.emit('event', ev)
     }
 
-    if (this.bufferedEvents.push(ev) > MAX_BUFFERED_EVENTS) {
-      this.bufferedEvents.splice(0, this.bufferedEvents.length - MAX_BUFFERED_EVENTS)
-    }
+    this.bufferedEvents.push(ev)
   }
 
   start() {
@@ -88,7 +97,7 @@ export class GatewayClient extends EventEmitter {
     env.PYTHONPATH = pyPath ? `${root}${delimiter}${pyPath}` : root
 
     this.ready = false
-    this.bufferedEvents = []
+    this.bufferedEvents.clear()
     this.pendingExit = undefined
     this.stdoutRl?.close()
     this.stderrRl?.close()
@@ -165,15 +174,7 @@ export class GatewayClient extends EventEmitter {
     const p = id ? this.pending.get(id) : undefined
 
     if (p) {
-      this.pending.delete(id!)
-
-      if (msg.error) {
-        const err = msg.error as { message?: unknown } | null | undefined
-
-        p.reject(new Error(typeof err?.message === 'string' ? err.message : 'request failed'))
-      } else {
-        p.resolve(msg.result)
-      }
+      this.settle(p, msg.error ? this.toError(msg.error) : null, msg.result)
 
       return
     }
@@ -187,24 +188,49 @@ export class GatewayClient extends EventEmitter {
     }
   }
 
-  private pushLog(line: string) {
-    if (this.logs.push(truncateLine(line)) > MAX_GATEWAY_LOG_LINES) {
-      this.logs.splice(0, this.logs.length - MAX_GATEWAY_LOG_LINES)
+  private toError(raw: unknown): Error {
+    const err = raw as { message?: unknown } | null | undefined
+
+    return new Error(typeof err?.message === 'string' ? err.message : 'request failed')
+  }
+
+  private settle(p: Pending, err: Error | null, result: unknown) {
+    clearTimeout(p.timeout)
+    this.pending.delete(p.id)
+
+    if (err) {
+      p.reject(err)
+    } else {
+      p.resolve(result)
     }
   }
 
+  private pushLog(line: string) {
+    this.logs.push(truncateLine(line))
+  }
+
   private rejectPending(err: Error) {
     for (const p of this.pending.values()) {
+      clearTimeout(p.timeout)
       p.reject(err)
     }
 
     this.pending.clear()
   }
 
+  private onTimeout(id: string) {
+    const p = this.pending.get(id)
+
+    if (p) {
+      this.pending.delete(id)
+      p.reject(new Error(`timeout: ${p.method}`))
+    }
+  }
+
   drain() {
     this.subscribed = true
 
-    for (const ev of this.bufferedEvents.splice(0)) {
+    for (const ev of this.bufferedEvents.drain()) {
       this.emit('event', ev)
     }
 
@@ -217,7 +243,7 @@ export class GatewayClient extends EventEmitter {
   }
 
   getLogTail(limit = 20): string {
-    return this.logs.slice(-Math.max(1, limit)).join('\n')
+    return this.logs.tail(Math.max(1, limit)).join('\n')
   }
 
   request<T = unknown>(method: string, params: Record<string, unknown> = {}): Promise<T> {
@@ -231,29 +257,29 @@ export class GatewayClient extends EventEmitter {
 
     const id = `r${++this.reqId}`
 
-    return new Promise((resolve, reject) => {
-      const timeout = setTimeout(() => {
-        if (this.pending.delete(id)) {
-          reject(new Error(`timeout: ${method}`))
-        }
-      }, REQUEST_TIMEOUT_MS)
+    return new Promise<T>((resolve, reject) => {
+      const timeout = setTimeout(this.onTimeout.bind(this), REQUEST_TIMEOUT_MS, id)
+
+      timeout.unref?.()
 
       this.pending.set(id, {
-        reject: e => {
-          clearTimeout(timeout)
-          reject(e)
-        },
-        resolve: v => {
-          clearTimeout(timeout)
-          resolve(v as T)
-        }
+        id,
+        method,
+        reject,
+        resolve: v => resolve(v as T),
+        timeout
       })
 
       try {
-        this.proc!.stdin!.write(JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n')
+        this.proc!.stdin!.write(JSON.stringify({ id, jsonrpc: '2.0', method, params }) + '\n')
       } catch (e) {
-        clearTimeout(timeout)
-        this.pending.delete(id)
+        const pending = this.pending.get(id)
+
+        if (pending) {
+          clearTimeout(pending.timeout)
+          this.pending.delete(id)
+        }
+
         reject(e instanceof Error ? e : new Error(String(e)))
       }
     })
diff --git a/ui-tui/src/lib/circularBuffer.ts b/ui-tui/src/lib/circularBuffer.ts
new file mode 100644
index 0000000000..09023fae56
--- /dev/null
+++ b/ui-tui/src/lib/circularBuffer.ts
@@ -0,0 +1,76 @@
+/**
+ * Fixed-capacity ring buffer: once `capacity` items are held, each push
+ * overwrites the oldest item.
+ */
+export class CircularBuffer<T> {
+  private buf: T[]
+  private head = 0
+  private len = 0
+
+  /** @throws RangeError when `capacity` is not a positive integer. */
+  constructor(private capacity: number) {
+    // A zero or NaN capacity would make the modulo in push() yield NaN indices.
+    if (!Number.isInteger(capacity) || capacity <= 0) {
+      throw new RangeError(`CircularBuffer capacity must be a positive integer, got ${capacity}`)
+    }
+
+    this.buf = new Array<T>(capacity)
+  }
+
+  /** Append `item`, overwriting the oldest entry when the buffer is full. */
+  push(item: T) {
+    this.buf[this.head] = item
+    this.head = (this.head + 1) % this.capacity
+
+    if (this.len < this.capacity) {
+      this.len++
+    }
+  }
+
+  /** Push every element of `items` in order. */
+  pushAll(items: readonly T[]) {
+    for (const item of items) {
+      this.push(item)
+    }
+  }
+
+  /** Return the newest `n` items (all items by default), oldest first. */
+  tail(n = this.len): T[] {
+    // Floor and NaN-guard `n`: `new Array(x)` throws on fractional or NaN lengths.
+    const take = Math.min(Math.max(0, Math.floor(n) || 0), this.len)
+    const start = this.len < this.capacity ? 0 : this.head
+    const out: T[] = new Array<T>(take)
+
+    for (let i = 0; i < take; i++) {
+      out[i] = this.buf[(start + this.len - take + i) % this.capacity]!
+    }
+
+    return out
+  }
+
+  /** Copy of every held item, oldest first. */
+  toArray(): T[] {
+    return this.tail(this.len)
+  }
+
+  /** Return every held item, oldest first, and empty the buffer. */
+  drain(): T[] {
+    const out = this.toArray()
+
+    this.clear()
+
+    return out
+  }
+
+  /** Drop all items; the backing array is replaced so old entries are no longer referenced. */
+  clear() {
+    this.buf = new Array<T>(this.capacity)
+    this.head = 0
+    this.len = 0
+  }
+
+  /** Number of items currently held. */
+  get size() {
+    return this.len
+  }
+}
diff --git a/ui-tui/src/lib/gracefulExit.ts b/ui-tui/src/lib/gracefulExit.ts
new file mode 100644
index 0000000000..ae6a23a5e8
--- /dev/null
+++ b/ui-tui/src/lib/gracefulExit.ts
@@ -0,0 +1,73 @@
+type Cleanup = () => Promise<void> | void
+
+interface SetupOptions {
+  cleanups?: Cleanup[]
+  failsafeMs?: number
+  onError?: (scope: 'uncaughtException' | 'unhandledRejection', err: unknown) => void
+  onSignal?: (signal: NodeJS.Signals) => void
+}
+
+const DEFAULT_FAILSAFE_MS = 4000
+
+let wired = false
+
+/**
+ * Install process-wide signal and error handlers; only the first call has any effect.
+ *
+ * SIGINT / SIGTERM / SIGHUP run every cleanup and then exit with 128 + the
+ * signal number. A failsafe timer forces that exit after `failsafeMs` if the
+ * cleanups have not settled. Uncaught exceptions and unhandled rejections are
+ * only reported through `onError`; they do not terminate the process.
+ */
+export function setupGracefulExit({
+  cleanups = [],
+  failsafeMs = DEFAULT_FAILSAFE_MS,
+  onError,
+  onSignal
+}: SetupOptions = {}) {
+  if (wired) {
+    return
+  }
+
+  wired = true
+
+  let shuttingDown = false
+
+  const exit = (code: number, signal?: NodeJS.Signals) => {
+    if (shuttingDown) {
+      return
+    }
+
+    shuttingDown = true
+
+    if (signal) {
+      onSignal?.(signal)
+    }
+
+    const failsafe = setTimeout(() => process.exit(code), failsafeMs)
+
+    failsafe.unref?.()
+
+    // Promise.resolve().then(fn) turns a synchronous throw inside a cleanup into a rejection.
+    void Promise.allSettled(cleanups.map(fn => Promise.resolve().then(fn)))
+      .catch(() => {})
+      .finally(() => process.exit(code))
+  }
+
+  for (const sig of ['SIGINT', 'SIGTERM', 'SIGHUP'] as const) {
+    process.on(sig, () => exit(sig === 'SIGINT' ? 130 : sig === 'SIGTERM' ? 143 : 129, sig))
+  }
+
+  process.on('uncaughtException', err => {
+    onError?.('uncaughtException', err)
+  })
+
+  process.on('unhandledRejection', reason => {
+    onError?.('unhandledRejection', reason)
+  })
+}
+
+/** Exit immediately with `code`, without running the registered cleanups. */
+export function forceExit(code = 0) {
+  process.exit(code)
+}
diff --git a/ui-tui/src/lib/memory.ts b/ui-tui/src/lib/memory.ts
new file mode 100644
index 0000000000..0afbab7729
--- /dev/null
+++ b/ui-tui/src/lib/memory.ts
@@ -0,0 +1,237 @@
+import { createWriteStream } from 'node:fs'
+import { mkdir, readdir, readFile, writeFile } from 'node:fs/promises'
+import { homedir, tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { pipeline } from 'node:stream/promises'
+import { getHeapSnapshot, getHeapSpaceStatistics, getHeapStatistics } from 'node:v8'
+
+export type MemoryTrigger = 'auto-high' | 'auto-critical' | 'manual'
+
+export interface MemoryDiagnostics {
+  activeHandles: number
+  activeRequests: number
+  analysis: {
+    potentialLeaks: string[]
+    recommendation: string
+  }
+  memoryGrowthRate: {
+    bytesPerSecond: number
+    mbPerHour: number
+  }
+  memoryUsage: {
+    arrayBuffers: number
+    external: number
+    heapTotal: number
+    heapUsed: number
+    rss: number
+  }
+  nodeVersion: string
+  openFileDescriptors?: number
+  platform: string
+  resourceUsage: {
+    maxRSS: number
+    systemCPUTime: number
+    userCPUTime: number
+  }
+  smapsRollup?: string
+  timestamp: string
+  trigger: MemoryTrigger
+  uptimeSeconds: number
+  v8HeapSpaces?: { available: number; name: string; size: number; used: number }[]
+  v8HeapStats: {
+    detachedContexts: number
+    heapSizeLimit: number
+    mallocedMemory: number
+    nativeContexts: number
+    peakMallocedMemory: number
+  }
+}
+
+export interface HeapDumpResult {
+  diagPath?: string
+  error?: string
+  heapPath?: string
+  success: boolean
+}
+
+// Dump directory: HERMES_HEAPDUMP_DIR when set, otherwise .hermes/heapdumps under home (or tmpdir).
+const heapDumpRoot = () =>
+  process.env.HERMES_HEAPDUMP_DIR?.trim() || join(homedir() || tmpdir(), '.hermes', 'heapdumps')
+
+// Undocumented Node internals; optional so runtimes lacking them report 0 instead of throwing.
+const processInternals = process as unknown as {
+  _getActiveHandles?: () => unknown[]
+  _getActiveRequests?: () => unknown[]
+}
+
+// RSS when this module loaded. Growth is measured against this baseline; dividing the
+// whole RSS by uptime would report the process's initial footprint as "growth".
+const baseline = { rss: process.memoryUsage().rss, uptimeSeconds: process.uptime() }
+
+/**
+ * Collect a point-in-time memory report: process/V8 statistics, handle and FD
+ * counts, the Linux smaps rollup when readable, and heuristic leak indicators.
+ *
+ * @param trigger What prompted the capture; copied into the report.
+ */
+export async function captureMemoryDiagnostics(trigger: MemoryTrigger): Promise<MemoryDiagnostics> {
+  const usage = process.memoryUsage()
+  const heapStats = getHeapStatistics()
+  const resourceUsage = process.resourceUsage()
+  const uptimeSeconds = process.uptime()
+
+  let heapSpaces: ReturnType<typeof getHeapSpaceStatistics> | undefined
+
+  try {
+    heapSpaces = getHeapSpaceStatistics()
+  } catch {
+    /* Bun / older Node — ignore */
+  }
+
+  const activeHandles = processInternals._getActiveHandles?.().length ?? 0
+  const activeRequests = processInternals._getActiveRequests?.().length ?? 0
+
+  let openFileDescriptors: number | undefined
+
+  try {
+    openFileDescriptors = (await readdir('/proc/self/fd')).length
+  } catch {
+    /* non-Linux */
+  }
+
+  let smapsRollup: string | undefined
+
+  try {
+    smapsRollup = await readFile('/proc/self/smaps_rollup', 'utf8')
+  } catch {
+    /* non-Linux / no access */
+  }
+
+  const nativeMemory = usage.rss - usage.heapUsed
+  const elapsedSeconds = uptimeSeconds - baseline.uptimeSeconds
+  const bytesPerSecond = elapsedSeconds > 0 ? Math.max(0, usage.rss - baseline.rss) / elapsedSeconds : 0
+  const mbPerHour = (bytesPerSecond * 3600) / (1024 * 1024)
+
+  const potentialLeaks: string[] = []
+
+  if (heapStats.number_of_detached_contexts > 0) {
+    potentialLeaks.push(
+      `${heapStats.number_of_detached_contexts} detached context(s) — possible component/closure leak`
+    )
+  }
+
+  if (activeHandles > 100) {
+    potentialLeaks.push(`${activeHandles} active handles — possible timer/socket leak`)
+  }
+
+  if (nativeMemory > usage.heapUsed) {
+    potentialLeaks.push('Native memory > heap — leak may be in native addons')
+  }
+
+  if (mbPerHour > 100) {
+    potentialLeaks.push(`High memory growth rate: ${mbPerHour.toFixed(1)} MB/hour`)
+  }
+
+  if (openFileDescriptors && openFileDescriptors > 500) {
+    potentialLeaks.push(`${openFileDescriptors} open FDs — possible file/socket leak`)
+  }
+
+  return {
+    activeHandles,
+    activeRequests,
+    analysis: {
+      potentialLeaks,
+      recommendation: potentialLeaks.length
+        ? `WARNING: ${potentialLeaks.length} potential leak indicator(s). See potentialLeaks.`
+        : 'No obvious leak indicators. Inspect heap snapshot for retained objects.'
+    },
+    memoryGrowthRate: { bytesPerSecond, mbPerHour },
+    memoryUsage: {
+      arrayBuffers: usage.arrayBuffers,
+      external: usage.external,
+      heapTotal: usage.heapTotal,
+      heapUsed: usage.heapUsed,
+      rss: usage.rss
+    },
+    nodeVersion: process.version,
+    openFileDescriptors,
+    platform: process.platform,
+    resourceUsage: {
+      // Node reports maxRSS in kilobytes; convert to bytes like the other fields.
+      maxRSS: resourceUsage.maxRSS * 1024,
+      systemCPUTime: resourceUsage.systemCPUTime,
+      userCPUTime: resourceUsage.userCPUTime
+    },
+    smapsRollup,
+    timestamp: new Date().toISOString(),
+    trigger,
+    uptimeSeconds,
+    v8HeapSpaces: heapSpaces?.map(s => ({
+      available: s.space_available_size,
+      name: s.space_name,
+      size: s.space_size,
+      used: s.space_used_size
+    })),
+    v8HeapStats: {
+      detachedContexts: heapStats.number_of_detached_contexts,
+      heapSizeLimit: heapStats.heap_size_limit,
+      mallocedMemory: heapStats.malloced_memory,
+      nativeContexts: heapStats.number_of_native_contexts,
+      peakMallocedMemory: heapStats.peak_malloced_memory
+    }
+  }
+}
+
+/**
+ * Write `<base>.diagnostics.json` and `<base>.heapsnapshot` into the dump
+ * directory, both with mode 0600.
+ *
+ * Never rejects: failures come back as `{ success: false, error }`.
+ */
+export async function performHeapDump(trigger: MemoryTrigger = 'manual'): Promise<HeapDumpResult> {
+  try {
+    const diagnostics = await captureMemoryDiagnostics(trigger)
+    const dir = heapDumpRoot()
+
+    await mkdir(dir, { recursive: true })
+
+    const stamp = new Date().toISOString().replace(/[:.]/g, '-')
+    const base = `hermes-${stamp}-${process.pid}-${trigger}`
+    const heapPath = join(dir, `${base}.heapsnapshot`)
+    const diagPath = join(dir, `${base}.diagnostics.json`)
+
+    await writeFile(diagPath, JSON.stringify(diagnostics, null, 2), { mode: 0o600 })
+    await writeSnapshot(heapPath)
+
+    return { diagPath, heapPath, success: true }
+  } catch (e) {
+    return { error: e instanceof Error ? e.message : String(e), success: false }
+  }
+}
+
+/** Format a byte count using 1024-based units, e.g. `1.5GB`; non-positive or non-finite input yields `0B`. */
+export function formatBytes(bytes: number): string {
+  if (!Number.isFinite(bytes) || bytes <= 0) {
+    return '0B'
+  }
+
+  const units = ['B', 'KB', 'MB', 'GB', 'TB']
+  let value = bytes
+  let exp = 0
+
+  // Pick the unit with the same 1024 divisor used for the value. A log10-based
+  // exponent mixes decimal thresholds with binary division (1000 bytes -> "1.0KB").
+  while (value >= 1024 && exp < units.length - 1) {
+    value /= 1024
+    exp++
+  }
+
+  return `${value >= 100 ? value.toFixed(0) : value.toFixed(1)}${units[exp]}`
+}
+
+/** Stream a V8 heap snapshot to `filepath` (created with mode 0600). */
+async function writeSnapshot(filepath: string) {
+  const stream = createWriteStream(filepath, { mode: 0o600 })
+
+  await pipeline(getHeapSnapshot(), stream)
+}
diff --git a/ui-tui/src/lib/memoryMonitor.ts b/ui-tui/src/lib/memoryMonitor.ts
new file mode 100644
index 0000000000..58d7d38783
--- /dev/null
+++ b/ui-tui/src/lib/memoryMonitor.ts
@@ -0,0 +1,87 @@
+import { type HeapDumpResult, performHeapDump } from './memory.js'
+
+export type MemoryLevel = 'critical' | 'high' | 'normal'
+
+export interface MemorySnapshot {
+  heapUsed: number
+  level: MemoryLevel
+  rss: number
+}
+
+export interface MemoryMonitorOptions {
+  criticalBytes?: number
+  highBytes?: number
+  intervalMs?: number
+  onCritical?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
+  onHigh?: (snap: MemorySnapshot, dump: HeapDumpResult | null) => void
+  onSnapshot?: (snap: MemorySnapshot) => void
+}
+
+const GB = 1024 ** 3
+
+const DEFAULTS = {
+  criticalBytes: 2.5 * GB,
+  highBytes: 1.5 * GB,
+  intervalMs: 10_000
+}
+
+/**
+ * Poll `process.memoryUsage()` every `intervalMs` and classify `heapUsed`
+ * against `highBytes` / `criticalBytes`.
+ *
+ * The first tick at each elevated level writes a heap dump and then invokes
+ * `onHigh` / `onCritical`; further dumps at that level are suppressed until
+ * usage drops back to normal. The timer is unref'd, so it does not keep the
+ * process alive.
+ *
+ * @returns A function that stops the monitor.
+ */
+export function startMemoryMonitor({
+  criticalBytes = DEFAULTS.criticalBytes,
+  highBytes = DEFAULTS.highBytes,
+  intervalMs = DEFAULTS.intervalMs,
+  onCritical,
+  onHigh,
+  onSnapshot
+}: MemoryMonitorOptions = {}): () => void {
+  let dumpedHigh = false
+  let dumpedCritical = false
+
+  const tick = async () => {
+    const { heapUsed, rss } = process.memoryUsage()
+    const level: MemoryLevel = heapUsed >= criticalBytes ? 'critical' : heapUsed >= highBytes ? 'high' : 'normal'
+    const snap: MemorySnapshot = { heapUsed, level, rss }
+
+    onSnapshot?.(snap)
+
+    if (level === 'normal') {
+      dumpedHigh = false
+      dumpedCritical = false
+
+      return
+    }
+
+    if (level === 'high' && !dumpedHigh) {
+      // Set the flag before awaiting so ticks that fire during the dump do not start another.
+      dumpedHigh = true
+      const dump = await performHeapDump('auto-high').catch(() => null)
+
+      onHigh?.(snap, dump)
+
+      return
+    }
+
+    if (level === 'critical' && !dumpedCritical) {
+      dumpedCritical = true
+      const dump = await performHeapDump('auto-critical').catch(() => null)
+
+      onCritical?.(snap, dump)
+    }
+  }
+
+  const handle = setInterval(() => void tick(), intervalMs)
+
+  handle.unref?.()
+
+  return () => clearInterval(handle)
+}