From 6bbacc2238997718026c7868f4b76092fe602ed8 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sun, 21 Jun 2026 12:08:29 -0700 Subject: [PATCH] fix(desktop): make cold-start port-announcement deadline tolerant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The port-announcement clock in waitForDashboardPort starts the instant the backend process is spawned — before uvicorn binds its socket. On a cold install the child first compiles and imports the whole hermes_cli.main -> web_server -> FastAPI/uvicorn chain, and on Windows real-time AV scans every freshly written .pyc. That pre-bind cost can exceed the old hardcoded 45s deadline, so the desktop killed a healthy-but-still-starting backend and respawned it, piling up orphaned processes (#50209). Raise the default to 90s and make it overridable via HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS, clamped to a 45s floor so a bad override can't reintroduce the loop. Warm starts still announce in well under a second; both call sites inherit the new default with no change. Adds backend-ready.test.cjs (wired into test:desktop:platforms). --- apps/desktop/electron/backend-ready.cjs | 40 +++++- apps/desktop/electron/backend-ready.test.cjs | 121 +++++++++++++++++++ apps/desktop/package.json | 2 +- 3 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 apps/desktop/electron/backend-ready.test.cjs diff --git a/apps/desktop/electron/backend-ready.cjs b/apps/desktop/electron/backend-ready.cjs index 9af41e549c4..a4899e8657a 100644 --- a/apps/desktop/electron/backend-ready.cjs +++ b/apps/desktop/electron/backend-ready.cjs @@ -1,5 +1,32 @@ const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m +// The announcement clock starts the instant the backend process is spawned — +// before uvicorn binds its socket. On a cold install the child must first +// compile and import the whole `hermes_cli.main` → `web_server` → FastAPI/ +// uvicorn chain, and on Windows real-time AV (Defender) scans every freshly +// written `.pyc`. That pre-bind cost can run 30-60s on a slow disk, so a tight +// 45s deadline kills a *healthy but still-starting* backend and respawns it, +// piling up orphaned processes (issue #50209). A roomier default absorbs the +// cold-start cost; a warm start still announces in well under a second. +const DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS = 90_000 +// Never trust a deadline tighter than the warm-start path needs; floor at 45s +// (the historical default) so a malformed override can't reintroduce the loop. +const MIN_PORT_ANNOUNCE_TIMEOUT_MS = 45_000 + +/** + * Resolve the port-announcement deadline. Honors the + * HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS env override (for users on slow + * disks / aggressive AV who need an even longer cold-start window), clamped + * to a sane floor so a bad value can't make boot flakier than the default. + */ +function resolvePortAnnounceTimeoutMs(env = process.env) { + const parsed = Number(env.HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS) + if (Number.isFinite(parsed) && parsed > 0) { + return Math.max(MIN_PORT_ANNOUNCE_TIMEOUT_MS, Math.round(parsed)) + } + return DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS +} + /** * Watch a child process's stdout for the `HERMES_DASHBOARD_READY port=` * line that web_server.py prints after uvicorn binds its socket. @@ -9,11 +36,15 @@ const _READY_RE = /^HERMES_DASHBOARD_READY port=(\d+)/m * - the child emits an `error` event * - no line arrives within the timeout * + * The default timeout is cold-start tolerant (see + * DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS) because the clock starts before the + * backend has even bound its port. Pass an explicit `timeoutMs` to override. + * * A single `cleanup()` tears down every listener (data/exit/error/timeout) * on every terminal path — resolve, reject, or timeout — so repeated * backend spawns don't leak listener slots on the child. */ -function waitForDashboardPort(child, timeoutMs = 45_000) { +function waitForDashboardPort(child, timeoutMs = resolvePortAnnounceTimeoutMs()) { return new Promise((resolve, reject) => { let buf = '' let done = false @@ -63,4 +94,9 @@ function waitForDashboardPort(child, timeoutMs = 45_000) { }) } -module.exports = { waitForDashboardPort } +module.exports = { + waitForDashboardPort, + resolvePortAnnounceTimeoutMs, + DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS, + MIN_PORT_ANNOUNCE_TIMEOUT_MS, +} diff --git a/apps/desktop/electron/backend-ready.test.cjs b/apps/desktop/electron/backend-ready.test.cjs new file mode 100644 index 00000000000..8f6267b7929 --- /dev/null +++ b/apps/desktop/electron/backend-ready.test.cjs @@ -0,0 +1,121 @@ +/** + * Tests for electron/backend-ready.cjs. + * + * Run with: node --test electron/backend-ready.test.cjs + * (Wired into npm test:desktop:platforms in package.json.) + * + * Covers the cold-start port-announcement deadline (issue #50209): the clock + * starts before the backend binds its port, so a tight 45s deadline killed a + * healthy-but-still-compiling backend on cold Windows installs. The default is + * now cold-start tolerant and overridable via + * HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS, clamped to a 45s floor. + */ + +const test = require('node:test') +const assert = require('node:assert/strict') +const { EventEmitter } = require('node:events') + +const { + waitForDashboardPort, + resolvePortAnnounceTimeoutMs, + DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS, + MIN_PORT_ANNOUNCE_TIMEOUT_MS, +} = require('./backend-ready.cjs') + +// A minimal stand-in for a spawned child process: an EventEmitter with a +// stdout EventEmitter, matching the surface waitForDashboardPort consumes +// (child.stdout.on('data'), child.on('exit'|'error') + the .off() teardown). +function makeFakeChild() { + const child = new EventEmitter() + child.stdout = new EventEmitter() + return child +} + +// --------------------------------------------------------------------------- +// resolvePortAnnounceTimeoutMs +// --------------------------------------------------------------------------- + +test('default is cold-start tolerant (> the historical 45s floor)', () => { + assert.equal(resolvePortAnnounceTimeoutMs({}), DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS) + assert.ok( + DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS > MIN_PORT_ANNOUNCE_TIMEOUT_MS, + 'cold-start default must exceed the warm-start floor' + ) +}) + +test('honors a valid HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS override', () => { + const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '120000' } + assert.equal(resolvePortAnnounceTimeoutMs(env), 120_000) +}) + +test('clamps an override below the floor up to the 45s minimum', () => { + const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '1000' } + assert.equal(resolvePortAnnounceTimeoutMs(env), MIN_PORT_ANNOUNCE_TIMEOUT_MS) +}) + +test('rounds a fractional override', () => { + const env = { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: '60000.7' } + assert.equal(resolvePortAnnounceTimeoutMs(env), 60_001) +}) + +test('falls back to the default for malformed / non-positive overrides', () => { + for (const bad of ['', 'abc', '0', '-5', 'NaN', undefined]) { + const env = bad === undefined ? {} : { HERMES_DESKTOP_PORT_ANNOUNCE_TIMEOUT_MS: bad } + assert.equal( + resolvePortAnnounceTimeoutMs(env), + DEFAULT_PORT_ANNOUNCE_TIMEOUT_MS, + `override ${JSON.stringify(bad)} should fall through to the default` + ) + } +}) + +// --------------------------------------------------------------------------- +// waitForDashboardPort +// --------------------------------------------------------------------------- + +test('resolves with the announced port', async () => { + const child = makeFakeChild() + const p = waitForDashboardPort(child, 1000) + child.stdout.emit('data', 'noise before\nHERMES_DASHBOARD_READY port=54321\n') + assert.equal(await p, 54321) +}) + +test('parses the port even when the line arrives split across chunks', async () => { + const child = makeFakeChild() + const p = waitForDashboardPort(child, 1000) + child.stdout.emit('data', 'HERMES_DASHBOARD_READY po') + child.stdout.emit('data', 'rt=8080\n') + assert.equal(await p, 8080) +}) + +test('rejects when the child exits before announcing', async () => { + const child = makeFakeChild() + const p = waitForDashboardPort(child, 1000) + child.emit('exit', 1, null) + await assert.rejects(p, /exited before port announcement/) +}) + +test('rejects on a child error event', async () => { + const child = makeFakeChild() + const p = waitForDashboardPort(child, 1000) + child.emit('error', new Error('spawn ENOENT')) + await assert.rejects(p, /spawn ENOENT/) +}) + +test('rejects with the timeout message after the deadline', async () => { + const child = makeFakeChild() + await assert.rejects( + waitForDashboardPort(child, 20), + /Timed out waiting for Hermes backend port announcement \(20ms\)/ + ) +}) + +test('a late announcement after timeout does not throw (listeners torn down)', async () => { + const child = makeFakeChild() + await assert.rejects(waitForDashboardPort(child, 20), /Timed out/) + // The orphaned backend may still print its READY line later; the watcher + // must have detached so this emit is a no-op rather than a double-settle. + assert.doesNotThrow(() => { + child.stdout.emit('data', 'HERMES_DASHBOARD_READY port=9999\n') + }) +}) diff --git a/apps/desktop/package.json b/apps/desktop/package.json index 8861762fa02..ab5d2d588f3 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -37,7 +37,7 @@ "test:desktop:nsis": "node scripts/test-desktop.mjs nsis", "test:desktop:existing": "node scripts/test-desktop.mjs existing", "test:desktop:fresh": "node scripts/test-desktop.mjs fresh", - "test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/windows-user-env.test.cjs", + "test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/windows-user-env.test.cjs", "typecheck": "tsc -p . --noEmit", "lint": "eslint src/ electron/", "lint:fix": "eslint src/ electron/ --fix",