mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(desktop): make cold-start port-announcement deadline tolerant
The port-announcement clock in waitForDashboardPort starts the instant the backend process is spawned — before uvicorn binds its socket. On a cold install the child first compiles and imports the whole hermes_cli.main -> web_server -> FastAPI/uvicorn chain, and on Windows real-time AV scans every freshly written .pyc. That pre-bind cost can exceed the old hardcoded 45s deadline, so the desktop killed a healthy-but-still-starting backend and respawned it, piling up orphaned processes (#50209). Raise the default to 90s and make it overridable via HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS, clamped to a 45s floor so a bad override can't reintroduce the loop. Warm starts still announce in well under a second; both call sites inherit the new default with no change. Adds backend-ready.test.cjs (wired into test:desktop:platforms).
This commit is contained in:
parent
e580706d4d
commit
6bbacc2238
3 changed files with 160 additions and 3 deletions
|
|
@ -1,5 +1,32 @@
|
|||
const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m
|
||||
|
||||
// The announcement clock starts the instant the backend process is spawned —
|
||||
// before uvicorn binds its socket. On a cold install the child must first
|
||||
// compile and import the whole `hermes_cli.main` → `web_server` → FastAPI/
|
||||
// uvicorn chain, and on Windows real-time AV (Defender) scans every freshly
|
||||
// written `.pyc`. That pre-bind cost can run 30-60s on a slow disk, so a tight
|
||||
// 45s deadline kills a *healthy but still-starting* backend and respawns it,
|
||||
// piling up orphaned processes (issue #50209). A roomier default absorbs the
|
||||
// cold-start cost; a warm start still announces in well under a second.
|
||||
const DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS = 90_000
|
||||
// Never trust a deadline tighter than the warm-start path needs; floor at 45s
|
||||
// (the historical default) so a malformed override can't reintroduce the loop.
|
||||
const MIN_PORT_ANNOUNCE_TIMEOUT_MS = 45_000
|
||||
|
||||
/**
|
||||
* Resolve the port-announcement deadline. Honors the
|
||||
* HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS env override (for users on slow
|
||||
* disks / aggressive AV who need an even longer cold-start window), clamped
|
||||
* to a sane floor so a bad value can't make boot flakier than the default.
|
||||
*/
|
||||
function resolvePortAnnounceTimeoutMs(env = process.env) {
|
||||
const parsed = Number(env.HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS)
|
||||
if (Number.isFinite(parsed) && parsed > 0) {
|
||||
return Math.max(MIN_PORT_ANNOUNCE_TIMEOUT_MS, Math.round(parsed))
|
||||
}
|
||||
return DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS
|
||||
}
|
||||
|
||||
/**
|
||||
* Watch a child process's stdout for the `HERMES_DASHBOARD_READY port=<N>`
|
||||
* line that web_server.py prints after uvicorn binds its socket.
|
||||
|
|
@ -9,11 +36,15 @@ const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m
|
|||
* - the child emits an `error` event
|
||||
* - no line arrives within the timeout
|
||||
*
|
||||
* The default timeout is cold-start tolerant (see
|
||||
* DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS) because the clock starts before the
|
||||
* backend has even bound its port. Pass an explicit `timeoutMs` to override.
|
||||
*
|
||||
* A single `cleanup()` tears down every listener (data/exit/error/timeout)
|
||||
* on every terminal path — resolve, reject, or timeout — so repeated
|
||||
* backend spawns don't leak listener slots on the child.
|
||||
*/
|
||||
function waitForDashboardPort(child, timeoutMs = 45_000) {
|
||||
function waitForDashboardPort(child, timeoutMs = resolvePortAnnounceTimeoutMs()) {
|
||||
return new Promise((resolve, reject) => {
|
||||
let buf = ''
|
||||
let done = false
|
||||
|
|
@ -63,4 +94,9 @@ function waitForDashboardPort(child, timeoutMs = 45_000) {
|
|||
})
|
||||
}
|
||||
|
||||
module.exports = { waitForDashboardPort }
|
||||
module.exports = {
|
||||
waitForDashboardPort,
|
||||
resolvePortAnnounceTimeoutMs,
|
||||
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
|
||||
MIN_PORT_ANNOUNCE_TIMEOUT_MS,
|
||||
}
|
||||
|
|
|
|||
121
apps/desktop/electron/backend-ready.test.cjs
Normal file
121
apps/desktop/electron/backend-ready.test.cjs
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
/**
|
||||
* Tests for electron/backend-ready.cjs.
|
||||
*
|
||||
* Run with: node --test electron/backend-ready.test.cjs
|
||||
* (Wired into npm test:desktop:platforms in package.json.)
|
||||
*
|
||||
* Covers the cold-start port-announcement deadline (issue #50209): the clock
|
||||
* starts before the backend binds its port, so a tight 45s deadline killed a
|
||||
* healthy-but-still-compiling backend on cold Windows installs. The default is
|
||||
* now cold-start tolerant and overridable via
|
||||
* HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS, clamped to a 45s floor.
|
||||
*/
|
||||
|
||||
const test = require('node:test')
|
||||
const assert = require('node:assert/strict')
|
||||
const { EventEmitter } = require('node:events')
|
||||
|
||||
const {
|
||||
waitForDashboardPort,
|
||||
resolvePortAnnounceTimeoutMs,
|
||||
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
|
||||
MIN_PORT_ANNOUNCE_TIMEOUT_MS,
|
||||
} = require('./backend-ready.cjs')
|
||||
|
||||
// A minimal stand-in for a spawned child process: an EventEmitter with a
|
||||
// stdout EventEmitter, matching the surface waitForDashboardPort consumes
|
||||
// (child.stdout.on('data'), child.on('exit'|'error') + the .off() teardown).
|
||||
function makeFakeChild() {
|
||||
const child = new EventEmitter()
|
||||
child.stdout = new EventEmitter()
|
||||
return child
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// resolvePortAnnounceTimeoutMs
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
test('default is cold-start tolerant (> the historical 45s floor)', () => {
|
||||
assert.equal(resolvePortAnnounceTimeoutMs({}), DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS)
|
||||
assert.ok(
|
||||
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS > MIN_PORT_ANNOUNCE_TIMEOUT_MS,
|
||||
'cold-start default must exceed the warm-start floor'
|
||||
)
|
||||
})
|
||||
|
||||
test('honors a valid HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS override', () => {
|
||||
const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '120000' }
|
||||
assert.equal(resolvePortAnnounceTimeoutMs(env), 120_000)
|
||||
})
|
||||
|
||||
test('clamps an override below the floor up to the 45s minimum', () => {
|
||||
const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '1000' }
|
||||
assert.equal(resolvePortAnnounceTimeoutMs(env), MIN_PORT_ANNOUNCE_TIMEOUT_MS)
|
||||
})
|
||||
|
||||
test('rounds a fractional override', () => {
|
||||
const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '60000.7' }
|
||||
assert.equal(resolvePortAnnounceTimeoutMs(env), 60_001)
|
||||
})
|
||||
|
||||
test('falls back to the default for malformed / non-positive overrides', () => {
|
||||
for (const bad of ['', 'abc', '0', '-5', 'NaN', undefined]) {
|
||||
const env = bad === undefined ? {} : { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: bad }
|
||||
assert.equal(
|
||||
resolvePortAnnounceTimeoutMs(env),
|
||||
DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS,
|
||||
`override ${JSON.stringify(bad)} should fall through to the default`
|
||||
)
|
||||
}
|
||||
})
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// waitForDashboardPort
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
test('resolves with the announced port', async () => {
|
||||
const child = makeFakeChild()
|
||||
const p = waitForDashboardPort(child, 1000)
|
||||
child.stdout.emit('data', 'noise before\nHERMES_DASHBOARD_READY port=54321\n')
|
||||
assert.equal(await p, 54321)
|
||||
})
|
||||
|
||||
test('parses the port even when the line arrives split across chunks', async () => {
|
||||
const child = makeFakeChild()
|
||||
const p = waitForDashboardPort(child, 1000)
|
||||
child.stdout.emit('data', 'HERMES_DASHBOARD_READY po')
|
||||
child.stdout.emit('data', 'rt=8080\n')
|
||||
assert.equal(await p, 8080)
|
||||
})
|
||||
|
||||
test('rejects when the child exits before announcing', async () => {
|
||||
const child = makeFakeChild()
|
||||
const p = waitForDashboardPort(child, 1000)
|
||||
child.emit('exit', 1, null)
|
||||
await assert.rejects(p, /exited before port announcement/)
|
||||
})
|
||||
|
||||
test('rejects on a child error event', async () => {
|
||||
const child = makeFakeChild()
|
||||
const p = waitForDashboardPort(child, 1000)
|
||||
child.emit('error', new Error('spawn ENOENT'))
|
||||
await assert.rejects(p, /spawn ENOENT/)
|
||||
})
|
||||
|
||||
test('rejects with the timeout message after the deadline', async () => {
|
||||
const child = makeFakeChild()
|
||||
await assert.rejects(
|
||||
waitForDashboardPort(child, 20),
|
||||
/Timed out waiting for Hermes backend port announcement \(20ms\)/
|
||||
)
|
||||
})
|
||||
|
||||
test('a late announcement after timeout does not throw (listeners torn down)', async () => {
|
||||
const child = makeFakeChild()
|
||||
await assert.rejects(waitForDashboardPort(child, 20), /Timed out/)
|
||||
// The orphaned backend may still print its READY line later; the watcher
|
||||
// must have detached so this emit is a no-op rather than a double-settle.
|
||||
assert.doesNotThrow(() => {
|
||||
child.stdout.emit('data', 'HERMES_DASHBOARD_READY port=9999\n')
|
||||
})
|
||||
})
|
||||
|
|
@ -37,7 +37,7 @@
|
|||
"test:desktop:nsis": "node scripts/test-desktop.mjs nsis",
|
||||
"test:desktop:existing": "node scripts/test-desktop.mjs existing",
|
||||
"test:desktop:fresh": "node scripts/test-desktop.mjs fresh",
|
||||
"test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/windows-user-env.test.cjs",
|
||||
"test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/windows-user-env.test.cjs",
|
||||
"typecheck": "tsc -p . --noEmit",
|
||||
"lint": "eslint src/ electron/",
|
||||
"lint:fix": "eslint src/ electron/ --fix",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue