fix(tui): log parent gateway lifecycle exits (#31051)

* fix(tui): log parent gateway lifecycle exits

Add parent-side breadcrumbs for TUI gateway shutdown and transport exits so future backend EOF/SIGTERM reports identify the parent action that caused them.

* chore(tui): retrigger lifecycle logging checks

Retry transient GitHub checkout failures on the lifecycle logging PR.
This commit is contained in:
brooklyn! 2026-05-23 13:28:40 -05:00 committed by GitHub
parent 026f64f8e0
commit e6ca730a22
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 54 additions and 12 deletions

View file

@ -34,6 +34,7 @@ class FakeWebSocket {
options !== null &&
'once' in options &&
Boolean((options as { once?: unknown }).once)
const entries = this.listeners.get(type) ?? []
entries.push({ callback, once })
@ -84,6 +85,7 @@ class FakeWebSocket {
for (const entry of entries) {
entry.callback(event)
if (entry.once) {
this.removeEventListener(type, entry.callback)
}
@ -170,6 +172,7 @@ describe('GatewayClient websocket attach mode', () => {
method: 'event',
params: { type: 'tool.start', payload: { tool_id: 't1' } }
})
gatewaySocket.message(eventFrame)
expect(seen).toContain('tool.start')
@ -193,6 +196,8 @@ describe('GatewayClient websocket attach mode', () => {
gatewaySocket.close(1011)
expect(exits).toEqual([1011])
expect(gw.getLogTail(20)).toContain('[lifecycle] websocket close code=1011')
expect(gw.getLogTail(20)).toContain('[lifecycle] transport exit code=1011')
})
it('rejects pending RPCs with websocket wording when the attached socket closes', async () => {
@ -226,9 +231,10 @@ describe('GatewayClient websocket attach mode', () => {
const req = gw.request('session.create', {})
await vi.waitFor(() => expect(gatewaySocket.sent.length).toBeGreaterThan(0))
gw.kill()
gw.kill('test.shutdown')
await expect(req).rejects.toThrow(/gateway closed/)
expect(gw.getLogTail(20)).toContain('[lifecycle] GatewayClient.kill reason=test.shutdown')
})
it('reattaches when HERMES_TUI_GATEWAY_URL rotates between requests', async () => {
@ -279,6 +285,7 @@ describe('GatewayClient websocket attach mode', () => {
gw.drain()
expect(stderrLines.length).toBeGreaterThan(0)
for (const line of stderrLines) {
expect(line).not.toContain('hunter2')
expect(line).not.toContain('channel=secret')
@ -370,6 +377,7 @@ describe('GatewayClient websocket attach mode', () => {
gw.drain()
expect(stderrLines.length).toBeGreaterThan(0)
for (const line of stderrLines) {
expect(line).not.toContain('alice')
expect(line).not.toContain('hunter2')

View file

@ -1,4 +1,4 @@
import { useApp, useHasSelection, useSelection, useStdout, useTerminalTitle, type ScrollBoxHandle } from '@hermes/ink'
import { type ScrollBoxHandle, useApp, useHasSelection, useSelection, useStdout, useTerminalTitle } from '@hermes/ink'
import { useStore } from '@nanostores/react'
import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
@ -365,7 +365,7 @@ export function useMainApp(gw: GatewayClient) {
const gateway = useMemo(() => ({ gw, rpc }), [gw, rpc])
const die = useCallback(() => {
gw.kill()
gw.kill('app.die')
exit()
// Ink's exit() calls unmount() which resets terminal modes but does NOT
// call process.exit(). Without an explicit exit the Node process stays
@ -377,7 +377,7 @@ export function useMainApp(gw: GatewayClient) {
}, [exit, gw])
const dieWithCode = useCallback((code: number) => {
gw.kill()
gw.kill(`app.dieWithCode:${code}`)
exit()
process.exit(code)
}, [exit, gw])
@ -736,10 +736,13 @@ export function useMainApp(gw: GatewayClient) {
const anyPanelVisible = SECTION_NAMES.some(
s => sectionMode(s, ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'
)
const thinkingPanelVisible =
sectionMode('thinking', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'
const toolsPanelVisible =
sectionMode('tools', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'
const activityPanelVisible =
sectionMode('activity', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'

View file

@ -43,23 +43,24 @@ setupGracefulExit({
() => {
resetTerminalModes()
return gw.kill()
return gw.kill('graceful-exit-cleanup')
}
],
onError: (scope, err) => {
const message = err instanceof Error ? `${err.name}: ${err.message}` : String(err)
const message = err instanceof Error ? `${err.name}: ${err.message}\n${err.stack ?? ''}` : String(err)
process.stderr.write(`hermes-tui ${scope}: ${message.slice(0, 2000)}\n`)
process.stderr.write(`hermes-tui lifecycle ${scope}: ${message.slice(0, 2000)}\n`)
},
onSignal: signal => {
resetTerminalModes()
process.stderr.write(`hermes-tui: received ${signal}\n`)
process.stderr.write(`hermes-tui lifecycle: received ${signal}\n`)
}
})
const stopMemoryMonitor = startMemoryMonitor({
onCritical: (snap, dump) => {
resetTerminalModes()
process.stderr.write(`hermes-tui lifecycle: memory critical exit heap=${formatBytes(snap.heapUsed)} rss=${formatBytes(snap.rss)}\n`)
process.stderr.write(dumpNotice(snap, dump))
process.stderr.write('hermes-tui: exiting to avoid OOM; restart to recover\n')
process.exit(137)

View file

@ -21,6 +21,14 @@ const WS_CLOSED = 3
/**
 * Cap a log line at `MAX_LOG_LINE_BYTES`.
 *
 * Lines whose `length` is within the limit are returned unchanged. Longer
 * lines are cut to the first `MAX_LOG_LINE_BYTES` characters and suffixed
 * with a marker that reports the full original `line.length`.
 */
const truncateLine = (line: string) => {
  if (line.length <= MAX_LOG_LINE_BYTES) {
    return line
  }

  const head = line.slice(0, MAX_LOG_LINE_BYTES)

  return `${head}… [truncated ${line.length} bytes]`
}
/**
 * Summarise a child process for lifecycle log lines.
 *
 * Returns `pid=none` when no process is given. Otherwise returns a
 * space-separated list of the process's pid, killed flag, exit code and
 * signal code, rendering a missing pid as `unknown` and a missing exit
 * code or signal as `null`.
 */
const describeChild = (proc: ChildProcess | null) => {
  const fields = proc
    ? [
        `pid=${proc.pid ?? 'unknown'}`,
        `killed=${proc.killed}`,
        `exitCode=${proc.exitCode ?? 'null'}`,
        `signal=${proc.signalCode ?? 'null'}`
      ]
    : ['pid=none']

  return fields.join(' ')
}
const resolveGatewayAttachUrl = () => {
const raw = process.env.HERMES_TUI_GATEWAY_URL?.trim()
@ -85,7 +93,7 @@ const asWireText = (raw: unknown): string | null => {
// otherwise-malformed URLs that the WHATWG `URL` parser can't accept.
// Used by the `redactUrl` fallback so embedded credentials are
// scrubbed from log lines even when the URL is unparseable.
const _USERINFO_FALLBACK_RE = /^([a-z][a-z0-9+.\-]*:\/\/)[^/?#@]*@/i
const _USERINFO_FALLBACK_RE = /^([a-z][a-z0-9+.-]*:\/\/)[^/?#@]*@/i
// Connection URLs (gateway, sidecar) often carry bearer tokens in the query
// string. We surface them in user-facing log lines and the
@ -191,6 +199,7 @@ export class GatewayClient extends EventEmitter {
const ws = this.ws
this.ws = null
this.wsConnectPromise = null
try {
ws?.close()
} catch {
@ -239,6 +248,7 @@ export class GatewayClient extends EventEmitter {
private handleTransportExit(code: null | number, reason?: string) {
this.clearReadyTimer()
this.closeSidecarSocket()
this.pushLog(`[lifecycle] transport exit code=${code ?? 'null'} reason=${reason ?? 'none'}`)
this.rejectPending(new Error(reason || `gateway exited${code === null ? '' : ` (${code})`}`))
if (this.subscribed) {
@ -257,6 +267,7 @@ export class GatewayClient extends EventEmitter {
if (typeof WebSocket === 'undefined') {
this.pushLog(`[sidecar] WebSocket unavailable; skipping mirror to ${redactUrl(this.sidecarUrl)}`)
return
}
@ -324,6 +335,7 @@ export class GatewayClient extends EventEmitter {
env.PYTHONPATH = pyPath ? `${root}${delimiter}${pyPath}` : root
this.startReadyTimer(python, cwd)
this.proc = spawn(python, ['-m', 'tui_gateway.entry'], { cwd, env, stdio: ['pipe', 'pipe', 'pipe'] })
this.pushLog(`[lifecycle] spawned gateway child ${describeChild(this.proc)} python=${python} cwd=${cwd}`)
this.stdoutRl = createInterface({ input: this.proc.stdout! })
this.stdoutRl.on('line', raw => {
@ -353,11 +365,14 @@ export class GatewayClient extends EventEmitter {
this.proc.on('error', err => {
// Skip stale errors on an already-replaced child.
if (this.proc !== ownedProc) {
this.pushLog(`[lifecycle] stale child error ignored ${describeChild(ownedProc)} message=${err.message}`)
return
}
const line = `[spawn] ${err.message}`
this.pushLog(`[lifecycle] child error ${describeChild(ownedProc)} message=${err.message}`)
this.pushLog(line)
this.publish({ type: 'gateway.stderr', payload: { line } })
// Detach the reference up front so the late `exit` event for
@ -369,14 +384,19 @@ export class GatewayClient extends EventEmitter {
this.proc = null
this.handleTransportExit(1, `gateway error: ${err.message}`)
})
this.proc.on('exit', code => {
this.proc.on('exit', (code, signal) => {
// start() can replace `this.proc` while an old child is still
// tearing down. Skip stale exits so we don't clear the new
// startup timer or reject newly-issued pending requests.
if (this.proc !== ownedProc) {
this.pushLog(
`[lifecycle] stale child exit ignored ${describeChild(ownedProc)} code=${code ?? 'null'} signal=${signal ?? 'null'}`
)
return
}
this.pushLog(`[lifecycle] child exit ${describeChild(ownedProc)} code=${code ?? 'null'} signal=${signal ?? 'null'}`)
this.handleTransportExit(code)
})
}
@ -400,6 +420,7 @@ export class GatewayClient extends EventEmitter {
let settled = false
this.ws = ws
const connectPromise = new Promise<void>((resolve, reject) => {
ws.addEventListener(
'open',
@ -454,9 +475,12 @@ export class GatewayClient extends EventEmitter {
// new ready timer or reject the new pending requests on behalf
// of a stale socket.
if (this.ws !== ws) {
this.pushLog(`[lifecycle] stale websocket close ignored code=${ev.code}`)
return
}
this.pushLog(`[lifecycle] websocket close code=${ev.code}`)
this.ws = null
this.wsConnectPromise = null
this.handleTransportExit(ev.code, `gateway websocket closed${ev.code ? ` (${ev.code})` : ''}`)
@ -483,14 +507,17 @@ export class GatewayClient extends EventEmitter {
this.resetStartupState()
if (this.proc && !this.proc.killed && this.proc.exitCode === null) {
this.pushLog(`[lifecycle] replacing live gateway child ${describeChild(this.proc)}`)
this.proc.kill()
}
this.proc = null
this.closeGatewaySocket()
this.closeSidecarSocket()
if (attachUrl) {
this.startAttachedGateway(attachUrl)
return
}
@ -686,8 +713,11 @@ export class GatewayClient extends EventEmitter {
})
}
kill() {
this.proc?.kill()
kill(reason = 'requested') {
const proc = this.proc
const killed = proc?.kill()
this.pushLog(`[lifecycle] GatewayClient.kill reason=${reason} ${describeChild(proc)} killResult=${killed ?? 'none'}`)
this.closeGatewaySocket()
this.closeSidecarSocket()
this.clearReadyTimer()