From e6ca730a2219c602284f9807d65497f8c697ce5e Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Sat, 23 May 2026 13:28:40 -0500 Subject: [PATCH] fix(tui): log parent gateway lifecycle exits (#31051) * fix(tui): log parent gateway lifecycle exits Add parent-side breadcrumbs for TUI gateway shutdown and transport exits so future backend EOF/SIGTERM reports identify the parent action that caused them. * chore(tui): retrigger lifecycle logging checks Retry transient GitHub checkout failures on the lifecycle logging PR. --- ui-tui/src/__tests__/gatewayClient.test.ts | 10 +++++- ui-tui/src/app/useMainApp.ts | 9 +++-- ui-tui/src/entry.tsx | 9 ++--- ui-tui/src/gatewayClient.ts | 38 +++++++++++++++++++--- 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/ui-tui/src/__tests__/gatewayClient.test.ts b/ui-tui/src/__tests__/gatewayClient.test.ts index eac96c20780..f1228e56fbe 100644 --- a/ui-tui/src/__tests__/gatewayClient.test.ts +++ b/ui-tui/src/__tests__/gatewayClient.test.ts @@ -34,6 +34,7 @@ class FakeWebSocket { options !== null && 'once' in options && Boolean((options as { once?: unknown }).once) + const entries = this.listeners.get(type) ?? [] entries.push({ callback, once }) @@ -84,6 +85,7 @@ class FakeWebSocket { for (const entry of entries) { entry.callback(event) + if (entry.once) { this.removeEventListener(type, entry.callback) } @@ -170,6 +172,7 @@ describe('GatewayClient websocket attach mode', () => { method: 'event', params: { type: 'tool.start', payload: { tool_id: 't1' } } }) + gatewaySocket.message(eventFrame) expect(seen).toContain('tool.start') @@ -193,6 +196,8 @@ describe('GatewayClient websocket attach mode', () => { gatewaySocket.close(1011) expect(exits).toEqual([1011]) + expect(gw.getLogTail(20)).toContain('[lifecycle] websocket close code=1011') + expect(gw.getLogTail(20)).toContain('[lifecycle] transport exit code=1011') }) it('rejects pending RPCs with websocket wording when the attached socket closes', async () => { @@ -226,9 +231,10 @@ describe('GatewayClient websocket attach mode', () => { const req = gw.request('session.create', {}) await vi.waitFor(() => expect(gatewaySocket.sent.length).toBeGreaterThan(0)) - gw.kill() + gw.kill('test.shutdown') await expect(req).rejects.toThrow(/gateway closed/) + expect(gw.getLogTail(20)).toContain('[lifecycle] GatewayClient.kill reason=test.shutdown') }) it('reattaches when HERMES_TUI_GATEWAY_URL rotates between requests', async () => { @@ -279,6 +285,7 @@ describe('GatewayClient websocket attach mode', () => { gw.drain() expect(stderrLines.length).toBeGreaterThan(0) + for (const line of stderrLines) { expect(line).not.toContain('hunter2') expect(line).not.toContain('channel=secret') @@ -370,6 +377,7 @@ describe('GatewayClient websocket attach mode', () => { gw.drain() expect(stderrLines.length).toBeGreaterThan(0) + for (const line of stderrLines) { expect(line).not.toContain('alice') expect(line).not.toContain('hunter2') diff --git a/ui-tui/src/app/useMainApp.ts b/ui-tui/src/app/useMainApp.ts index 7996c7b910b..71768bc2b0a 100644 --- a/ui-tui/src/app/useMainApp.ts +++ b/ui-tui/src/app/useMainApp.ts @@ -1,4 +1,4 @@ -import { useApp, useHasSelection, useSelection, useStdout, useTerminalTitle, type ScrollBoxHandle } from '@hermes/ink' +import { type ScrollBoxHandle, useApp, useHasSelection, useSelection, useStdout, useTerminalTitle } from '@hermes/ink' import { useStore } from '@nanostores/react' import { useCallback, useEffect, useMemo, useRef, useState } from 'react' @@ -365,7 +365,7 @@ export function useMainApp(gw: GatewayClient) { const gateway = useMemo(() => ({ gw, rpc }), [gw, rpc]) const die = useCallback(() => { - gw.kill() + gw.kill('app.die') exit() // Ink's exit() calls unmount() which resets terminal modes but does NOT // call process.exit(). Without an explicit exit the Node process stays @@ -377,7 +377,7 @@ export function useMainApp(gw: GatewayClient) { }, [exit, gw]) const dieWithCode = useCallback((code: number) => { - gw.kill() + gw.kill(`app.dieWithCode:${code}`) exit() process.exit(code) }, [exit, gw]) @@ -736,10 +736,13 @@ export function useMainApp(gw: GatewayClient) { const anyPanelVisible = SECTION_NAMES.some( s => sectionMode(s, ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden' ) + const thinkingPanelVisible = sectionMode('thinking', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden' + const toolsPanelVisible = sectionMode('tools', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden' + const activityPanelVisible = sectionMode('activity', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden' diff --git a/ui-tui/src/entry.tsx b/ui-tui/src/entry.tsx index 690caf0cc95..effde40fef9 100644 --- a/ui-tui/src/entry.tsx +++ b/ui-tui/src/entry.tsx @@ -43,23 +43,24 @@ setupGracefulExit({ () => { resetTerminalModes() - return gw.kill() + return gw.kill('graceful-exit-cleanup') } ], onError: (scope, err) => { - const message = err instanceof Error ? `${err.name}: ${err.message}` : String(err) + const message = err instanceof Error ? `${err.name}: ${err.message}\n${err.stack ?? ''}` : String(err) - process.stderr.write(`hermes-tui ${scope}: ${message.slice(0, 2000)}\n`) + process.stderr.write(`hermes-tui lifecycle ${scope}: ${message.slice(0, 2000)}\n`) }, onSignal: signal => { resetTerminalModes() - process.stderr.write(`hermes-tui: received ${signal}\n`) + process.stderr.write(`hermes-tui lifecycle: received ${signal}\n`) } }) const stopMemoryMonitor = startMemoryMonitor({ onCritical: (snap, dump) => { resetTerminalModes() + process.stderr.write(`hermes-tui lifecycle: memory critical exit heap=${formatBytes(snap.heapUsed)} rss=${formatBytes(snap.rss)}\n`) process.stderr.write(dumpNotice(snap, dump)) process.stderr.write('hermes-tui: exiting to avoid OOM; restart to recover\n') process.exit(137) diff --git a/ui-tui/src/gatewayClient.ts b/ui-tui/src/gatewayClient.ts index 9590b386aa6..f3121152c90 100644 --- a/ui-tui/src/gatewayClient.ts +++ b/ui-tui/src/gatewayClient.ts @@ -21,6 +21,14 @@ const WS_CLOSED = 3 const truncateLine = (line: string) => line.length > MAX_LOG_LINE_BYTES ? `${line.slice(0, MAX_LOG_LINE_BYTES)}… [truncated ${line.length} bytes]` : line +const describeChild = (proc: ChildProcess | null) => { + if (!proc) { + return 'pid=none' + } + + return `pid=${proc.pid ?? 'unknown'} killed=${proc.killed} exitCode=${proc.exitCode ?? 'null'} signal=${proc.signalCode ?? 'null'}` +} + const resolveGatewayAttachUrl = () => { const raw = process.env.HERMES_TUI_GATEWAY_URL?.trim() @@ -85,7 +93,7 @@ const asWireText = (raw: unknown): string | null => { // otherwise-malformed URLs that the WHATWG `URL` parser can't accept. // Used by the `redactUrl` fallback so embedded credentials are // scrubbed from log lines even when the URL is unparseable. -const _USERINFO_FALLBACK_RE = /^([a-z][a-z0-9+.\-]*:\/\/)[^/?#@]*@/i +const _USERINFO_FALLBACK_RE = /^([a-z][a-z0-9+.-]*:\/\/)[^/?#@]*@/i // Connection URLs (gateway, sidecar) often carry bearer tokens in the query // string. We surface them in user-facing log lines and the @@ -191,6 +199,7 @@ export class GatewayClient extends EventEmitter { const ws = this.ws this.ws = null this.wsConnectPromise = null + try { ws?.close() } catch { @@ -239,6 +248,7 @@ export class GatewayClient extends EventEmitter { private handleTransportExit(code: null | number, reason?: string) { this.clearReadyTimer() this.closeSidecarSocket() + this.pushLog(`[lifecycle] transport exit code=${code ?? 'null'} reason=${reason ?? 'none'}`) this.rejectPending(new Error(reason || `gateway exited${code === null ? '' : ` (${code})`}`)) if (this.subscribed) { @@ -257,6 +267,7 @@ export class GatewayClient extends EventEmitter { if (typeof WebSocket === 'undefined') { this.pushLog(`[sidecar] WebSocket unavailable; skipping mirror to ${redactUrl(this.sidecarUrl)}`) + return } @@ -324,6 +335,7 @@ export class GatewayClient extends EventEmitter { env.PYTHONPATH = pyPath ? `${root}${delimiter}${pyPath}` : root this.startReadyTimer(python, cwd) this.proc = spawn(python, ['-m', 'tui_gateway.entry'], { cwd, env, stdio: ['pipe', 'pipe', 'pipe'] }) + this.pushLog(`[lifecycle] spawned gateway child ${describeChild(this.proc)} python=${python} cwd=${cwd}`) this.stdoutRl = createInterface({ input: this.proc.stdout! }) this.stdoutRl.on('line', raw => { @@ -353,11 +365,14 @@ export class GatewayClient extends EventEmitter { this.proc.on('error', err => { // Skip stale errors on an already-replaced child. if (this.proc !== ownedProc) { + this.pushLog(`[lifecycle] stale child error ignored ${describeChild(ownedProc)} message=${err.message}`) + return } const line = `[spawn] ${err.message}` + this.pushLog(`[lifecycle] child error ${describeChild(ownedProc)} message=${err.message}`) this.pushLog(line) this.publish({ type: 'gateway.stderr', payload: { line } }) // Detach the reference up front so the late `exit` event for @@ -369,14 +384,19 @@ export class GatewayClient extends EventEmitter { this.proc = null this.handleTransportExit(1, `gateway error: ${err.message}`) }) - this.proc.on('exit', code => { + this.proc.on('exit', (code, signal) => { // start() can replace `this.proc` while an old child is still // tearing down. Skip stale exits so we don't clear the new // startup timer or reject newly-issued pending requests. if (this.proc !== ownedProc) { + this.pushLog( + `[lifecycle] stale child exit ignored ${describeChild(ownedProc)} code=${code ?? 'null'} signal=${signal ?? 'null'}` + ) + return } + this.pushLog(`[lifecycle] child exit ${describeChild(ownedProc)} code=${code ?? 'null'} signal=${signal ?? 'null'}`) this.handleTransportExit(code) }) } @@ -400,6 +420,7 @@ export class GatewayClient extends EventEmitter { let settled = false this.ws = ws + const connectPromise = new Promise((resolve, reject) => { ws.addEventListener( 'open', @@ -454,9 +475,12 @@ export class GatewayClient extends EventEmitter { // new ready timer or reject the new pending requests on behalf // of a stale socket. if (this.ws !== ws) { + this.pushLog(`[lifecycle] stale websocket close ignored code=${ev.code}`) + return } + this.pushLog(`[lifecycle] websocket close code=${ev.code}`) this.ws = null this.wsConnectPromise = null this.handleTransportExit(ev.code, `gateway websocket closed${ev.code ? ` (${ev.code})` : ''}`) @@ -483,14 +507,17 @@ export class GatewayClient extends EventEmitter { this.resetStartupState() if (this.proc && !this.proc.killed && this.proc.exitCode === null) { + this.pushLog(`[lifecycle] replacing live gateway child ${describeChild(this.proc)}`) this.proc.kill() } + this.proc = null this.closeGatewaySocket() this.closeSidecarSocket() if (attachUrl) { this.startAttachedGateway(attachUrl) + return } @@ -686,8 +713,11 @@ export class GatewayClient extends EventEmitter { }) } - kill() { - this.proc?.kill() + kill(reason = 'requested') { + const proc = this.proc + const killed = proc?.kill() + + this.pushLog(`[lifecycle] GatewayClient.kill reason=${reason} ${describeChild(proc)} killResult=${killed ?? 'none'}`) this.closeGatewaySocket() this.closeSidecarSocket() this.clearReadyTimer()