mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-01 07:01:41 +00:00
fix(tui): log parent gateway lifecycle exits (#31051)
* fix(tui): log parent gateway lifecycle exits

  Add parent-side breadcrumbs for TUI gateway shutdown and transport exits so that future backend EOF/SIGTERM reports identify the parent action that caused them.

* chore(tui): retrigger lifecycle logging checks

  Retry transient GitHub checkout failures on the lifecycle logging PR.
This commit is contained in:
Parent: 026f64f8e0
Commit: e6ca730a22
4 changed files with 54 additions and 12 deletions
|
|
@ -34,6 +34,7 @@ class FakeWebSocket {
|
|||
options !== null &&
|
||||
'once' in options &&
|
||||
Boolean((options as { once?: unknown }).once)
|
||||
|
||||
const entries = this.listeners.get(type) ?? []
|
||||
|
||||
entries.push({ callback, once })
|
||||
|
|
@ -84,6 +85,7 @@ class FakeWebSocket {
|
|||
|
||||
for (const entry of entries) {
|
||||
entry.callback(event)
|
||||
|
||||
if (entry.once) {
|
||||
this.removeEventListener(type, entry.callback)
|
||||
}
|
||||
|
|
@ -170,6 +172,7 @@ describe('GatewayClient websocket attach mode', () => {
|
|||
method: 'event',
|
||||
params: { type: 'tool.start', payload: { tool_id: 't1' } }
|
||||
})
|
||||
|
||||
gatewaySocket.message(eventFrame)
|
||||
|
||||
expect(seen).toContain('tool.start')
|
||||
|
|
@ -193,6 +196,8 @@ describe('GatewayClient websocket attach mode', () => {
|
|||
gatewaySocket.close(1011)
|
||||
|
||||
expect(exits).toEqual([1011])
|
||||
expect(gw.getLogTail(20)).toContain('[lifecycle] websocket close code=1011')
|
||||
expect(gw.getLogTail(20)).toContain('[lifecycle] transport exit code=1011')
|
||||
})
|
||||
|
||||
it('rejects pending RPCs with websocket wording when the attached socket closes', async () => {
|
||||
|
|
@ -226,9 +231,10 @@ describe('GatewayClient websocket attach mode', () => {
|
|||
const req = gw.request('session.create', {})
|
||||
await vi.waitFor(() => expect(gatewaySocket.sent.length).toBeGreaterThan(0))
|
||||
|
||||
gw.kill()
|
||||
gw.kill('test.shutdown')
|
||||
|
||||
await expect(req).rejects.toThrow(/gateway closed/)
|
||||
expect(gw.getLogTail(20)).toContain('[lifecycle] GatewayClient.kill reason=test.shutdown')
|
||||
})
|
||||
|
||||
it('reattaches when HERMES_TUI_GATEWAY_URL rotates between requests', async () => {
|
||||
|
|
@ -279,6 +285,7 @@ describe('GatewayClient websocket attach mode', () => {
|
|||
gw.drain()
|
||||
|
||||
expect(stderrLines.length).toBeGreaterThan(0)
|
||||
|
||||
for (const line of stderrLines) {
|
||||
expect(line).not.toContain('hunter2')
|
||||
expect(line).not.toContain('channel=secret')
|
||||
|
|
@ -370,6 +377,7 @@ describe('GatewayClient websocket attach mode', () => {
|
|||
gw.drain()
|
||||
|
||||
expect(stderrLines.length).toBeGreaterThan(0)
|
||||
|
||||
for (const line of stderrLines) {
|
||||
expect(line).not.toContain('alice')
|
||||
expect(line).not.toContain('hunter2')
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import { useApp, useHasSelection, useSelection, useStdout, useTerminalTitle, type ScrollBoxHandle } from '@hermes/ink'
|
||||
import { type ScrollBoxHandle, useApp, useHasSelection, useSelection, useStdout, useTerminalTitle } from '@hermes/ink'
|
||||
import { useStore } from '@nanostores/react'
|
||||
import { useCallback, useEffect, useMemo, useRef, useState } from 'react'
|
||||
|
||||
|
|
@ -365,7 +365,7 @@ export function useMainApp(gw: GatewayClient) {
|
|||
const gateway = useMemo(() => ({ gw, rpc }), [gw, rpc])
|
||||
|
||||
const die = useCallback(() => {
|
||||
gw.kill()
|
||||
gw.kill('app.die')
|
||||
exit()
|
||||
// Ink's exit() calls unmount() which resets terminal modes but does NOT
|
||||
// call process.exit(). Without an explicit exit the Node process stays
|
||||
|
|
@ -377,7 +377,7 @@ export function useMainApp(gw: GatewayClient) {
|
|||
}, [exit, gw])
|
||||
|
||||
const dieWithCode = useCallback((code: number) => {
|
||||
gw.kill()
|
||||
gw.kill(`app.dieWithCode:${code}`)
|
||||
exit()
|
||||
process.exit(code)
|
||||
}, [exit, gw])
|
||||
|
|
@ -736,10 +736,13 @@ export function useMainApp(gw: GatewayClient) {
|
|||
const anyPanelVisible = SECTION_NAMES.some(
|
||||
s => sectionMode(s, ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'
|
||||
)
|
||||
|
||||
const thinkingPanelVisible =
|
||||
sectionMode('thinking', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'
|
||||
|
||||
const toolsPanelVisible =
|
||||
sectionMode('tools', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'
|
||||
|
||||
const activityPanelVisible =
|
||||
sectionMode('activity', ui.detailsMode, ui.sections, ui.detailsModeCommandOverride) !== 'hidden'
|
||||
|
||||
|
|
|
|||
|
|
@ -43,23 +43,24 @@ setupGracefulExit({
|
|||
() => {
|
||||
resetTerminalModes()
|
||||
|
||||
return gw.kill()
|
||||
return gw.kill('graceful-exit-cleanup')
|
||||
}
|
||||
],
|
||||
onError: (scope, err) => {
|
||||
const message = err instanceof Error ? `${err.name}: ${err.message}` : String(err)
|
||||
const message = err instanceof Error ? `${err.name}: ${err.message}\n${err.stack ?? ''}` : String(err)
|
||||
|
||||
process.stderr.write(`hermes-tui ${scope}: ${message.slice(0, 2000)}\n`)
|
||||
process.stderr.write(`hermes-tui lifecycle ${scope}: ${message.slice(0, 2000)}\n`)
|
||||
},
|
||||
onSignal: signal => {
|
||||
resetTerminalModes()
|
||||
process.stderr.write(`hermes-tui: received ${signal}\n`)
|
||||
process.stderr.write(`hermes-tui lifecycle: received ${signal}\n`)
|
||||
}
|
||||
})
|
||||
|
||||
const stopMemoryMonitor = startMemoryMonitor({
|
||||
onCritical: (snap, dump) => {
|
||||
resetTerminalModes()
|
||||
process.stderr.write(`hermes-tui lifecycle: memory critical exit heap=${formatBytes(snap.heapUsed)} rss=${formatBytes(snap.rss)}\n`)
|
||||
process.stderr.write(dumpNotice(snap, dump))
|
||||
process.stderr.write('hermes-tui: exiting to avoid OOM; restart to recover\n')
|
||||
process.exit(137)
|
||||
|
|
|
|||
|
|
@ -21,6 +21,14 @@ const WS_CLOSED = 3
|
|||
const truncateLine = (line: string) =>
|
||||
line.length > MAX_LOG_LINE_BYTES ? `${line.slice(0, MAX_LOG_LINE_BYTES)}… [truncated ${line.length} bytes]` : line
|
||||
|
||||
const describeChild = (proc: ChildProcess | null) => {
|
||||
if (!proc) {
|
||||
return 'pid=none'
|
||||
}
|
||||
|
||||
return `pid=${proc.pid ?? 'unknown'} killed=${proc.killed} exitCode=${proc.exitCode ?? 'null'} signal=${proc.signalCode ?? 'null'}`
|
||||
}
|
||||
|
||||
const resolveGatewayAttachUrl = () => {
|
||||
const raw = process.env.HERMES_TUI_GATEWAY_URL?.trim()
|
||||
|
||||
|
|
@ -85,7 +93,7 @@ const asWireText = (raw: unknown): string | null => {
|
|||
// otherwise-malformed URLs that the WHATWG `URL` parser can't accept.
|
||||
// Used by the `redactUrl` fallback so embedded credentials are
|
||||
// scrubbed from log lines even when the URL is unparseable.
|
||||
const _USERINFO_FALLBACK_RE = /^([a-z][a-z0-9+.\-]*:\/\/)[^/?#@]*@/i
|
||||
const _USERINFO_FALLBACK_RE = /^([a-z][a-z0-9+.-]*:\/\/)[^/?#@]*@/i
|
||||
|
||||
// Connection URLs (gateway, sidecar) often carry bearer tokens in the query
|
||||
// string. We surface them in user-facing log lines and the
|
||||
|
|
@ -191,6 +199,7 @@ export class GatewayClient extends EventEmitter {
|
|||
const ws = this.ws
|
||||
this.ws = null
|
||||
this.wsConnectPromise = null
|
||||
|
||||
try {
|
||||
ws?.close()
|
||||
} catch {
|
||||
|
|
@ -239,6 +248,7 @@ export class GatewayClient extends EventEmitter {
|
|||
private handleTransportExit(code: null | number, reason?: string) {
|
||||
this.clearReadyTimer()
|
||||
this.closeSidecarSocket()
|
||||
this.pushLog(`[lifecycle] transport exit code=${code ?? 'null'} reason=${reason ?? 'none'}`)
|
||||
this.rejectPending(new Error(reason || `gateway exited${code === null ? '' : ` (${code})`}`))
|
||||
|
||||
if (this.subscribed) {
|
||||
|
|
@ -257,6 +267,7 @@ export class GatewayClient extends EventEmitter {
|
|||
|
||||
if (typeof WebSocket === 'undefined') {
|
||||
this.pushLog(`[sidecar] WebSocket unavailable; skipping mirror to ${redactUrl(this.sidecarUrl)}`)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -324,6 +335,7 @@ export class GatewayClient extends EventEmitter {
|
|||
env.PYTHONPATH = pyPath ? `${root}${delimiter}${pyPath}` : root
|
||||
this.startReadyTimer(python, cwd)
|
||||
this.proc = spawn(python, ['-m', 'tui_gateway.entry'], { cwd, env, stdio: ['pipe', 'pipe', 'pipe'] })
|
||||
this.pushLog(`[lifecycle] spawned gateway child ${describeChild(this.proc)} python=${python} cwd=${cwd}`)
|
||||
|
||||
this.stdoutRl = createInterface({ input: this.proc.stdout! })
|
||||
this.stdoutRl.on('line', raw => {
|
||||
|
|
@ -353,11 +365,14 @@ export class GatewayClient extends EventEmitter {
|
|||
this.proc.on('error', err => {
|
||||
// Skip stale errors on an already-replaced child.
|
||||
if (this.proc !== ownedProc) {
|
||||
this.pushLog(`[lifecycle] stale child error ignored ${describeChild(ownedProc)} message=${err.message}`)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
const line = `[spawn] ${err.message}`
|
||||
|
||||
this.pushLog(`[lifecycle] child error ${describeChild(ownedProc)} message=${err.message}`)
|
||||
this.pushLog(line)
|
||||
this.publish({ type: 'gateway.stderr', payload: { line } })
|
||||
// Detach the reference up front so the late `exit` event for
|
||||
|
|
@ -369,14 +384,19 @@ export class GatewayClient extends EventEmitter {
|
|||
this.proc = null
|
||||
this.handleTransportExit(1, `gateway error: ${err.message}`)
|
||||
})
|
||||
this.proc.on('exit', code => {
|
||||
this.proc.on('exit', (code, signal) => {
|
||||
// start() can replace `this.proc` while an old child is still
|
||||
// tearing down. Skip stale exits so we don't clear the new
|
||||
// startup timer or reject newly-issued pending requests.
|
||||
if (this.proc !== ownedProc) {
|
||||
this.pushLog(
|
||||
`[lifecycle] stale child exit ignored ${describeChild(ownedProc)} code=${code ?? 'null'} signal=${signal ?? 'null'}`
|
||||
)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
this.pushLog(`[lifecycle] child exit ${describeChild(ownedProc)} code=${code ?? 'null'} signal=${signal ?? 'null'}`)
|
||||
this.handleTransportExit(code)
|
||||
})
|
||||
}
|
||||
|
|
@ -400,6 +420,7 @@ export class GatewayClient extends EventEmitter {
|
|||
let settled = false
|
||||
|
||||
this.ws = ws
|
||||
|
||||
const connectPromise = new Promise<void>((resolve, reject) => {
|
||||
ws.addEventListener(
|
||||
'open',
|
||||
|
|
@ -454,9 +475,12 @@ export class GatewayClient extends EventEmitter {
|
|||
// new ready timer or reject the new pending requests on behalf
|
||||
// of a stale socket.
|
||||
if (this.ws !== ws) {
|
||||
this.pushLog(`[lifecycle] stale websocket close ignored code=${ev.code}`)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
this.pushLog(`[lifecycle] websocket close code=${ev.code}`)
|
||||
this.ws = null
|
||||
this.wsConnectPromise = null
|
||||
this.handleTransportExit(ev.code, `gateway websocket closed${ev.code ? ` (${ev.code})` : ''}`)
|
||||
|
|
@ -483,14 +507,17 @@ export class GatewayClient extends EventEmitter {
|
|||
this.resetStartupState()
|
||||
|
||||
if (this.proc && !this.proc.killed && this.proc.exitCode === null) {
|
||||
this.pushLog(`[lifecycle] replacing live gateway child ${describeChild(this.proc)}`)
|
||||
this.proc.kill()
|
||||
}
|
||||
|
||||
this.proc = null
|
||||
this.closeGatewaySocket()
|
||||
this.closeSidecarSocket()
|
||||
|
||||
if (attachUrl) {
|
||||
this.startAttachedGateway(attachUrl)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
|
|
@ -686,8 +713,11 @@ export class GatewayClient extends EventEmitter {
|
|||
})
|
||||
}
|
||||
|
||||
kill() {
|
||||
this.proc?.kill()
|
||||
kill(reason = 'requested') {
|
||||
const proc = this.proc
|
||||
const killed = proc?.kill()
|
||||
|
||||
this.pushLog(`[lifecycle] GatewayClient.kill reason=${reason} ${describeChild(proc)} killResult=${killed ?? 'none'}`)
|
||||
this.closeGatewaySocket()
|
||||
this.closeSidecarSocket()
|
||||
this.clearReadyTimer()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue