hermes-agent/ui-tui/src/gatewayClient.ts
brooklyn! a830f25f71
fix(tui): surface gateway stderr tail in start_timeout activity (#17112)
* fix(tui): append gateway stderr tail to start_timeout activity

`gateway.start_timeout` previously published only `cwd` + `python`,
which made TUI startup failures hard to disambiguate.  The user saw
`gateway startup timed out · /path/to/python /repo · /logs to inspect`
with no signal whether the actual cause was a wrong python interpreter,
a missing dependency, or a config parse failure.

Plumb a 20-line stderr tail through the event so the most useful lines
land directly in the TUI activity feed, capped to the last 8 non-empty
lines for readability:

* `gatewayClient.ts` — collect `getLogTail(20)` when the readyTimer
  fires and attach it as `payload.stderr_tail`.
* `gatewayTypes.ts`  — extend the `gateway.start_timeout` event union
  with the new optional field.
* `createGatewayEventHandler.ts` — emit the trimmed lines after the
  existing `gateway startup timed out` activity entry, classified
  `error`.

Tests: regression test in `createGatewayEventHandler.test.ts` checks
that `ModuleNotFoundError` / `FileNotFoundError` lines from the tail
land in `getTurnState().activity` so they show up in the UI immediately.

Validation: `npm run type-check` clean, `npm test --run` 390/390.

* review(copilot): filter blanks before slice and cap stderr tail at 120 chars
2026-04-28 15:56:02 -05:00

303 lines
8.4 KiB
TypeScript

import { type ChildProcess, spawn } from 'node:child_process'
import { EventEmitter } from 'node:events'
import { existsSync } from 'node:fs'
import { delimiter, resolve } from 'node:path'
import { createInterface } from 'node:readline'
import type { GatewayEvent } from './gatewayTypes.js'
import { CircularBuffer } from './lib/circularBuffer.js'
/**
 * Reads an integer from the environment variable `name`.
 * A missing, non-numeric, or zero value yields `fallback`; the result is then
 * raised to at least `floor`.
 */
const envIntAtLeast = (name: string, fallback: number, floor: number): number =>
  Math.max(floor, Number.parseInt(process.env[name] ?? String(fallback), 10) || fallback)

// Capacity of the in-memory gateway log ring buffer (lines).
const MAX_GATEWAY_LOG_LINES = 200
// Longest log line kept verbatim; compared against `string.length`.
const MAX_LOG_LINE_BYTES = 4096
// Capacity of the buffer holding events published before a subscriber drains.
const MAX_BUFFERED_EVENTS = 2000
// Longest preview of an unparseable stdout line that gets logged/published.
const MAX_LOG_PREVIEW = 240

// How long to wait for `gateway.ready` after spawning (ms, never below 5s).
const STARTUP_TIMEOUT_MS = envIntAtLeast('HERMES_TUI_STARTUP_TIMEOUT_MS', 15000, 5000)
// How long a single RPC may stay unanswered (ms, never below 30s).
const REQUEST_TIMEOUT_MS = envIntAtLeast('HERMES_TUI_RPC_TIMEOUT_MS', 120000, 30000)
/**
 * Caps a log line at `limit` UTF-16 code units, appending a marker that
 * records the original length.
 *
 * The cut never lands between the two halves of a surrogate pair: if it
 * would, the dangling high surrogate is dropped as well so the kept prefix
 * stays well-formed text.
 *
 * @param line  The line to cap.
 * @param limit Maximum number of code units kept from `line`
 *              (defaults to `MAX_LOG_LINE_BYTES`).
 * @returns `line` unchanged when it fits, otherwise the prefix plus marker.
 */
const truncateLine = (line: string, limit: number = MAX_LOG_LINE_BYTES): string => {
  if (line.length <= limit) {
    return line
  }

  let cut = Math.max(0, limit)

  if (cut > 0) {
    const before = line.charCodeAt(cut - 1)
    const after = line.charCodeAt(cut)
    const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff

    if (splitsPair) {
      cut -= 1
    }
  }

  return `${line.slice(0, cut)}… [truncated ${line.length} bytes]`
}
/**
 * Chooses the Python interpreter used to launch the gateway.
 *
 * Resolution order:
 *  1. `HERMES_PYTHON`, then `PYTHON` (trimmed; empty values are skipped).
 *  2. The first existing interpreter among the active `VIRTUAL_ENV` and the
 *     `.venv` / `venv` directories under `root`, in both the POSIX
 *     (`bin/python`) and Windows (`Scripts/python.exe`) layouts.
 *  3. The bare command `python` on win32, `python3` elsewhere.
 *
 * @param root Directory searched for a project-local virtual environment.
 * @returns A path or command name for the interpreter.
 */
const resolvePython = (root: string): string => {
  const configured = process.env.HERMES_PYTHON?.trim() || process.env.PYTHON?.trim()

  if (configured) {
    return configured
  }

  const venv = process.env.VIRTUAL_ENV?.trim()

  const candidates = [
    venv && resolve(venv, 'bin/python'),
    venv && resolve(venv, 'Scripts/python.exe'),
    resolve(root, '.venv/bin/python'),
    resolve(root, '.venv/bin/python3'),
    resolve(root, 'venv/bin/python'),
    resolve(root, 'venv/bin/python3'),
    // Windows venv layout for project-local environments; listed last so the
    // precedence of every previously supported location is unchanged.
    resolve(root, '.venv/Scripts/python.exe'),
    resolve(root, 'venv/Scripts/python.exe')
  ]

  const hit = candidates.find(p => p && existsSync(p))

  return hit || (process.platform === 'win32' ? 'python' : 'python3')
}
/**
 * Narrows an arbitrary decoded value to a `GatewayEvent`.
 *
 * Only the outer shape is checked: the value must be a non-null, non-array
 * object whose `type` property is a string. Anything else yields `null`.
 */
const asGatewayEvent = (value: unknown): GatewayEvent | null => {
  if (typeof value !== 'object' || value === null || Array.isArray(value)) {
    return null
  }

  const tag = (value as { type?: unknown }).type

  return typeof tag === 'string' ? (value as GatewayEvent) : null
}
/** Bookkeeping for one in-flight request that is still awaiting its response. */
interface Pending {
  /** Request id; the same value keys this entry in the pending map. */
  id: string
  /** RPC method name, reported in the timeout error message. */
  method: string
  /** Settles the caller's promise with the response result. */
  resolve: (value: unknown) => void
  /** Settles the caller's promise with a failure. */
  reject: (reason: Error) => void
  /** Per-request timer; cleared as soon as the request settles. */
  timeout: ReturnType<typeof setTimeout>
}
/**
 * JSON-RPC client for the Python TUI gateway child process.
 *
 * Spawns `python -m tui_gateway.entry`, writes newline-delimited JSON-RPC
 * requests to its stdin and reads responses and events from its stdout.
 * Gateway events are re-emitted as `'event'` and process termination as
 * `'exit'`. Until `drain()` is called both are held back, so output produced
 * before a subscriber attaches is not lost.
 */
export class GatewayClient extends EventEmitter {
  private proc: ChildProcess | null = null
  private reqId = 0
  private logs = new CircularBuffer<string>(MAX_GATEWAY_LOG_LINES)
  private pending = new Map<string, Pending>()
  private bufferedEvents = new CircularBuffer<GatewayEvent>(MAX_BUFFERED_EVENTS)
  // `undefined` = no exit waiting to be delivered; `null` is a real exit code value.
  private pendingExit: number | null | undefined
  private ready = false
  private readyTimer: ReturnType<typeof setTimeout> | null = null
  private subscribed = false
  private stdoutRl: ReturnType<typeof createInterface> | null = null
  private stderrRl: ReturnType<typeof createInterface> | null = null

  constructor() {
    super()
    // useInput / createGatewayEventHandler can legitimately attach many
    // listeners. Default 10-cap triggers spurious warnings.
    this.setMaxListeners(0)
  }

  /**
   * Delivers an event to subscribers, or buffers it until `drain()` runs.
   * A `gateway.ready` event additionally marks the client ready and cancels
   * the startup timeout.
   */
  private publish(ev: GatewayEvent) {
    if (ev.type === 'gateway.ready') {
      this.ready = true

      if (this.readyTimer) {
        clearTimeout(this.readyTimer)
        this.readyTimer = null
      }
    }

    if (this.subscribed) {
      this.emit('event', ev)

      return
    }

    this.bufferedEvents.push(ev)
  }

  /**
   * True while the child has neither exited nor been terminated by a signal.
   * `exitCode` alone is not enough: it stays `null` for a signal-killed child.
   */
  private isRunning(proc: ChildProcess): boolean {
    return proc.exitCode === null && proc.signalCode === null
  }

  /**
   * (Re)starts the gateway process.
   *
   * Resets readiness and buffered state, stops any previous child, arms the
   * startup timeout and wires stdout/stderr/lifecycle handlers for the new
   * child. Handlers belonging to a replaced child are ignored so a late exit
   * of the old process cannot disturb the new one.
   */
  start() {
    const root = process.env.HERMES_PYTHON_SRC_ROOT ?? resolve(import.meta.dirname, '../../')
    const python = resolvePython(root)
    const cwd = process.env.HERMES_CWD || root
    const env = { ...process.env }
    const pyPath = env.PYTHONPATH?.trim()

    // Put the source root first on PYTHONPATH, keeping any existing entries.
    env.PYTHONPATH = pyPath ? `${root}${delimiter}${pyPath}` : root

    this.ready = false
    this.bufferedEvents.clear()
    this.pendingExit = undefined
    this.stdoutRl?.close()
    this.stderrRl?.close()
    this.stdoutRl = null
    this.stderrRl = null

    const previous = this.proc

    if (previous && this.isRunning(previous)) {
      if (!previous.killed) {
        previous.kill()
      }

      // The replaced child's exit is ignored below, so requests addressed to
      // it are settled here instead of lingering until their RPC timeout.
      this.rejectPending(new Error('gateway restarted'))
    }

    if (this.readyTimer) {
      clearTimeout(this.readyTimer)
    }

    this.readyTimer = setTimeout(() => {
      this.readyTimer = null

      if (this.ready) {
        return
      }

      // Append the most recent gateway stderr/log lines to the timeout
      // event so users can tell apart "wrong python", "missing dep",
      // and "config parse failure" from one glance instead of having
      // to dig through `/logs`. Capped to keep the activity feed
      // readable on slow boots.
      const stderrTail = this.getLogTail(20)

      this.pushLog(`[startup] timed out waiting for gateway.ready (python=${python}, cwd=${cwd})`)
      this.publish({
        type: 'gateway.start_timeout',
        payload: { cwd, python, stderr_tail: stderrTail }
      })
    }, STARTUP_TIMEOUT_MS)

    const proc = spawn(python, ['-m', 'tui_gateway.entry'], { cwd, env, stdio: ['pipe', 'pipe', 'pipe'] })

    this.proc = proc

    // Without a listener, an asynchronous write failure (e.g. EPIPE after the
    // child died) would surface as an unhandled 'error' event on the stream.
    proc.stdin?.on('error', err => {
      this.pushLog(`[stdin] ${err.message}`)
    })

    this.stdoutRl = createInterface({ input: proc.stdout! })
    this.stdoutRl.on('line', raw => {
      try {
        this.dispatch(JSON.parse(raw))
      } catch {
        const preview = raw.trim().slice(0, MAX_LOG_PREVIEW) || '(empty line)'

        this.pushLog(`[protocol] malformed stdout: ${preview}`)
        this.publish({ type: 'gateway.protocol_error', payload: { preview } })
      }
    })

    this.stderrRl = createInterface({ input: proc.stderr! })
    this.stderrRl.on('line', raw => {
      const line = truncateLine(raw.trim())

      if (!line) {
        return
      }

      // `line` is already capped; storing it directly avoids a second
      // truncation pass that would append another marker with a wrong length.
      this.logs.push(line)
      this.publish({ type: 'gateway.stderr', payload: { line } })
    })

    proc.on('error', err => {
      // Ignore failures reported by a child that has since been replaced.
      if (this.proc !== proc) {
        return
      }

      this.pushLog(`[spawn] ${err.message}`)
      this.rejectPending(new Error(`gateway error: ${err.message}`))
      this.publish({ type: 'gateway.stderr', payload: { line: `[spawn] ${err.message}` } })
    })

    proc.on('exit', code => {
      // A replaced child exiting late must not cancel the new child's startup
      // timer, reject its requests, or be reported as the gateway exiting.
      if (this.proc !== proc) {
        return
      }

      if (this.readyTimer) {
        clearTimeout(this.readyTimer)
        this.readyTimer = null
      }

      this.rejectPending(new Error(`gateway exited${code === null ? '' : ` (${code})`}`))

      if (this.subscribed) {
        this.emit('exit', code)
      } else {
        this.pendingExit = code
      }
    })
  }

  /**
   * Routes one decoded stdout message: a message whose `id` matches an
   * in-flight request settles it; a message with `method === 'event'` is
   * published when its params look like a gateway event. Others are dropped.
   */
  private dispatch(msg: Record<string, unknown>) {
    const id = msg.id as string | undefined
    const p = id ? this.pending.get(id) : undefined

    if (p) {
      this.settle(p, msg.error ? this.toError(msg.error) : null, msg.result)

      return
    }

    if (msg.method === 'event') {
      const ev = asGatewayEvent(msg.params)

      if (ev) {
        this.publish(ev)
      }
    }
  }

  /** Converts a response `error` payload to an `Error`, using its `message` when it is a string. */
  private toError(raw: unknown): Error {
    const err = raw as { message?: unknown } | null | undefined

    return new Error(typeof err?.message === 'string' ? err.message : 'request failed')
  }

  /** Completes a pending request: stops its timer, forgets it, then resolves or rejects. */
  private settle(p: Pending, err: Error | null, result: unknown) {
    clearTimeout(p.timeout)
    this.pending.delete(p.id)

    if (err) {
      p.reject(err)
    } else {
      p.resolve(result)
    }
  }

  /** Appends a line to the log buffer, capped via `truncateLine`. */
  private pushLog(line: string) {
    this.logs.push(truncateLine(line))
  }

  /** Rejects every in-flight request with `err` and empties the pending map. */
  private rejectPending(err: Error) {
    for (const p of this.pending.values()) {
      clearTimeout(p.timeout)
      p.reject(err)
    }

    this.pending.clear()
  }

  // Arrow class-field — stable identity, so `setTimeout(this.onTimeout, …, id)`
  // doesn't allocate a bound function per request.
  private onTimeout = (id: string) => {
    const p = this.pending.get(id)

    if (p) {
      this.pending.delete(id)
      p.reject(new Error(`timeout: ${p.method}`))
    }
  }

  /**
   * Marks the client as subscribed and flushes what was held back: buffered
   * events first (in the order `bufferedEvents.drain()` yields them), then a
   * deferred `'exit'` if the process ended before anyone was listening.
   */
  drain() {
    this.subscribed = true

    for (const ev of this.bufferedEvents.drain()) {
      this.emit('event', ev)
    }

    if (this.pendingExit !== undefined) {
      const code = this.pendingExit

      this.pendingExit = undefined
      this.emit('exit', code)
    }
  }

  /**
   * Returns the most recent log lines joined with newlines.
   *
   * @param limit Number of lines requested; values below 1 are treated as 1.
   */
  getLogTail(limit = 20): string {
    return this.logs.tail(Math.max(1, limit)).join('\n')
  }

  /**
   * Sends a JSON-RPC request and resolves with its result.
   *
   * Starts the gateway first when there is no usable child (none spawned,
   * killed, exited, or terminated by a signal). The promise rejects when the
   * gateway is not running, the write throws, the gateway answers with an
   * error, the process exits or is restarted, or no answer arrives within
   * `REQUEST_TIMEOUT_MS`.
   *
   * @param method RPC method name.
   * @param params RPC parameters object (defaults to `{}`).
   */
  request<T = unknown>(method: string, params: Record<string, unknown> = {}): Promise<T> {
    if (!this.proc?.stdin || this.proc.killed || !this.isRunning(this.proc)) {
      this.start()
    }

    if (!this.proc?.stdin) {
      return Promise.reject(new Error('gateway not running'))
    }

    const stdin = this.proc.stdin
    const id = `r${++this.reqId}`

    return new Promise<T>((resolvePromise, rejectPromise) => {
      const timeout = setTimeout(this.onTimeout, REQUEST_TIMEOUT_MS, id)

      // Do not let an outstanding RPC timer keep the process alive.
      timeout.unref?.()

      this.pending.set(id, {
        id,
        method,
        reject: rejectPromise,
        resolve: v => resolvePromise(v as T),
        timeout
      })

      try {
        stdin.write(JSON.stringify({ id, jsonrpc: '2.0', method, params }) + '\n')
      } catch (e) {
        const entry = this.pending.get(id)

        if (entry) {
          clearTimeout(entry.timeout)
          this.pending.delete(id)
        }

        rejectPromise(e instanceof Error ? e : new Error(String(e)))
      }
    })
  }

  /** Sends the default termination signal to the gateway process, if one was spawned. */
  kill() {
    this.proc?.kill()
  }
}