fix(desktop): make cold-start port-announcement deadline tolerant

The port-announcement clock in waitForDashboardPort starts the instant the
backend process is spawned — before uvicorn binds its socket. On a cold
install the child first compiles and imports the whole hermes_cli.main ->
web_server -> FastAPI/uvicorn chain, and on Windows real-time AV scans every
freshly written .pyc. That pre-bind cost can exceed the old hardcoded 45s
deadline, so the desktop killed a healthy-but-still-starting backend and
respawned it, piling up orphaned processes (#50209).

Raise the default to 90s and make it overridable via
HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS, clamped to a 45s floor so a bad
override can't reintroduce the loop. Warm starts still announce in well under
a second; both call sites inherit the new default with no change. Adds
backend-ready.test.cjs (wired into test:desktop:platforms).
This commit is contained in:
teknium1 2026-06-21 12:08:29 -07:00 committed by Teknium
parent e580706d4d
commit 6bbacc2238
3 changed files with 160 additions and 3 deletions

View file

@ -1,5 +1,32 @@
const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m
// The announcement clock starts the instant the backend process is spawned —
// before uvicorn binds its socket. On a cold install the child must first
// compile and import the whole `hermes_cli.main` → `web_server` → FastAPI/
// uvicorn chain, and on Windows real-time AV (Defender) scans every freshly
// written `.pyc`. That pre-bind cost can run 30-60s on a slow disk, so a tight
// 45s deadline kills a *healthy but still-starting* backend and respawns it,
// piling up orphaned processes (issue #50209). A roomier default absorbs the
// cold-start cost; a warm start still announces in well under a second.
const DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS = 90_000
// Never trust a deadline tighter than the warm-start path needs; floor at 45s
// (the historical default) so a malformed override can't reintroduce the loop.
const MIN_PORT_ANNOUNCE_TIMEOUT_MS = 45_000
/**
* Resolve the port-announcement deadline. Honors the
* HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS env override (for users on slow
* disks / aggressive AV who need an even longer cold-start window), clamped
* to a sane floor so a bad value can't make boot flakier than the default.
*/
function resolvePortAnnounceTimeoutMs(env = process.env) {
const parsed = Number(env.HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS)
if (Number.isFinite(parsed) && parsed > 0) {
return Math.max(MIN_PORT_ANNOUNCE_TIMEOUT_MS, Math.round(parsed))
}
return DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS
}
/**
* Watch a child process's stdout for the `HERMES_DASHBOARD_READY port=<N>`
* line that web_server.py prints after uvicorn binds its socket.
@ -9,11 +36,15 @@ const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m
* - the child emits an `error` event
* - no line arrives within the timeout
*
* The default timeout is cold-start tolerant (see
* DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS) because the clock starts before the
* backend has even bound its port. Pass an explicit `timeoutMs` to override.
*
* A single `cleanup()` tears down every listener (data/exit/error/timeout)
* on every terminal path resolve, reject, or timeout so repeated
* backend spawns don't leak listener slots on the child.
*/
function waitForDashboardPort(child, timeoutMs = 45_000) {
function waitForDashboardPort(child, timeoutMs = resolvePortAnnounceTimeoutMs()) {
return new Promise((resolve, reject) => {
let buf = ''
let done = false
@ -63,4 +94,9 @@ function waitForDashboardPort(child, timeoutMs = 45_000) {
})
}
module.exports = { waitForDashboardPort }
module.exports = {
waitForDashboardPort,
resolvePortAnnounceTimeoutMs,
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
MIN_PORT_ANNOUNCE_TIMEOUT_MS,
}

View file

@ -0,0 +1,121 @@
/**
* Tests for electron/backend-ready.cjs.
*
* Run with: node --test electron/backend-ready.test.cjs
* (Wired into npm test:desktop:platforms in package.json.)
*
* Covers the cold-start port-announcement deadline (issue #50209): the clock
* starts before the backend binds its port, so a tight 45s deadline killed a
* healthy-but-still-compiling backend on cold Windows installs. The default is
* now cold-start tolerant and overridable via
* HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS, clamped to a 45s floor.
*/
const test = require('node:test')
const assert = require('node:assert/strict')
const { EventEmitter } = require('node:events')
const {
waitForDashboardPort,
resolvePortAnnounceTimeoutMs,
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
MIN_PORT_ANNOUNCE_TIMEOUT_MS,
} = require('./backend-ready.cjs')
// A minimal stand-in for a spawned child process: an EventEmitter with a
// stdout EventEmitter, matching the surface waitForDashboardPort consumes
// (child.stdout.on('data'), child.on('exit'|'error') + the .off() teardown).
function makeFakeChild() {
const child = new EventEmitter()
child.stdout = new EventEmitter()
return child
}
// ---------------------------------------------------------------------------
// resolvePortAnnounceTimeoutMs
// ---------------------------------------------------------------------------
test('default is cold-start tolerant (> the historical 45s floor)', () => {
assert.equal(resolvePortAnnounceTimeoutMs({}), DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS)
assert.ok(
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS > MIN_PORT_ANNOUNCE_TIMEOUT_MS,
'cold-start default must exceed the warm-start floor'
)
})
test('honors a valid HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS override', () => {
const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '120000' }
assert.equal(resolvePortAnnounceTimeoutMs(env), 120_000)
})
test('clamps an override below the floor up to the 45s minimum', () => {
const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '1000' }
assert.equal(resolvePortAnnounceTimeoutMs(env), MIN_PORT_ANNOUNCE_TIMEOUT_MS)
})
test('rounds a fractional override', () => {
const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '60000.7' }
assert.equal(resolvePortAnnounceTimeoutMs(env), 60_001)
})
test('falls back to the default for malformed / non-positive overrides', () => {
for (const bad of ['', 'abc', '0', '-5', 'NaN', undefined]) {
const env = bad === undefined ? {} : { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: bad }
assert.equal(
resolvePortAnnounceTimeoutMs(env),
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
`override ${JSON.stringify(bad)} should fall through to the default`
)
}
})
// ---------------------------------------------------------------------------
// waitForDashboardPort
// ---------------------------------------------------------------------------
test('resolves with the announced port', async () => {
const child = makeFakeChild()
const p = waitForDashboardPort(child, 1000)
child.stdout.emit('data', 'noise before\nHERMES_DASHBOARD_READY port=54321\n')
assert.equal(await p, 54321)
})
test('parses the port even when the line arrives split across chunks', async () => {
const child = makeFakeChild()
const p = waitForDashboardPort(child, 1000)
child.stdout.emit('data', 'HERMES_DASHBOARD_READY po')
child.stdout.emit('data', 'rt=8080\n')
assert.equal(await p, 8080)
})
test('rejects when the child exits before announcing', async () => {
const child = makeFakeChild()
const p = waitForDashboardPort(child, 1000)
child.emit('exit', 1, null)
await assert.rejects(p, /exited before port announcement/)
})
test('rejects on a child error event', async () => {
const child = makeFakeChild()
const p = waitForDashboardPort(child, 1000)
child.emit('error', new Error('spawn ENOENT'))
await assert.rejects(p, /spawn ENOENT/)
})
test('rejects with the timeout message after the deadline', async () => {
const child = makeFakeChild()
await assert.rejects(
waitForDashboardPort(child, 20),
/Timed out waiting for Hermes backend port announcement \(20ms\)/
)
})
test('a late announcement after timeout does not throw (listeners torn down)', async () => {
const child = makeFakeChild()
await assert.rejects(waitForDashboardPort(child, 20), /Timed out/)
// The orphaned backend may still print its READY line later; the watcher
// must have detached so this emit is a no-op rather than a double-settle.
assert.doesNotThrow(() => {
child.stdout.emit('data', 'HERMES_DASHBOARD_READY port=9999\n')
})
})

View file

@ -37,7 +37,7 @@
"test:desktop:nsis": "node scripts/test-desktop.mjs nsis",
"test:desktop:existing": "node scripts/test-desktop.mjs existing",
"test:desktop:fresh": "node scripts/test-desktop.mjs fresh",
"test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/windows-user-env.test.cjs",
"test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/windows-user-env.test.cjs",
"typecheck": "tsc -p . --noEmit",
"lint": "eslint src/ electron/",
"lint:fix": "eslint src/ electron/ --fix",