hermes-agent/apps/desktop/electron/update-marker.test.cjs
Teknium f72690825e
fix(desktop/windows): stop in-app update from cascading into a backend restart loop (#50381)
When a Windows user relaunches Hermes while an in-app update is still
running (the desktop vanished with no progress and looks crashed), the
fresh instance spawns its own dashboard backend. That backend re-locks
the venv shim, the updater's straggler cleanup (force_kill_other_hermes
-> taskkill /F /T /IM hermes.exe) kills it, the launch dies with the 45s
"backend didn't come up" timeout, and the user relaunches into the same
trap -- an infinite respawn/kill loop (#50238).

Root cause: no mutual exclusion between an applying update and a fresh
desktop spawning its own local backend.

Fix: the updater publishes a HERMES_HOME/.hermes-update-in-progress
marker (pid + start time) for the whole run via an RAII drop-guard that
removes it on every exit path (success, early return, panic). A
freshly-launched desktop checks the marker before spawning its local
backend and PARKS until the update finishes -- then brings the backend
up itself (it is the surviving instance; the updater's own relaunch hits
the single-instance lock and quits). A stale marker (dead pid or past a
20-minute ceiling) is pruned so a crashed updater can never strand
future launches. No rogue backend spawns mid-update, so
force_kill_other_hermes has nothing legitimate to kill.

Marker parse/staleness logic is extracted to update-marker.cjs and
unit-tested; the Rust guard has unit tests; the Rust-write <-> JS-read
contract is E2E-verified.
2026-06-21 13:10:32 -07:00

92 lines
3.4 KiB
JavaScript

/**
* Tests for electron/update-marker.cjs — the in-app update mutual-exclusion
* marker that prevents a desktop relaunched mid-update from spawning a backend
* the updater then kills in a loop (#50238).
*
* Run with: node --test electron/update-marker.test.cjs
* (Wired into npm test:desktop:platforms in package.json.)
*
* Why this matters: the gate must (a) report a live update only when the
* updater pid is alive AND the marker is fresh, (b) treat absent/malformed/
* dead-pid/expired markers as "no live update" so a crashed updater can't
* strand future launches, and (c) self-heal by deleting a stale marker file.
*/
const test = require('node:test')
const assert = require('node:assert/strict')
const fs = require('fs')
const os = require('os')
const path = require('path')
const { markerPath, isPidAlive, readLiveUpdateMarker, UPDATE_MARKER_MAX_AGE_MS } = require('./update-marker.cjs')
// Every scratch home handed out by tmpHome(); swept once when the process exits
// so repeated test runs do not pile up directories in the OS temp folder.
const createdTmpHomes = []

/**
 * Create a unique, empty scratch directory under os.tmpdir() to act as the
 * home directory for one test case.
 *
 * The directory is removed (recursively) when the test process exits.
 *
 * @param {string} tag - Short label embedded in the directory name
 *   (`hermes-marker-<tag>-XXXXXX`) so leftovers are attributable to a test.
 * @returns {string} Path of the newly created directory.
 */
function tmpHome(tag) {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), `hermes-marker-${tag}-`))
  if (createdTmpHomes.length === 0) {
    // Register a single hook on first use instead of one listener per call.
    process.on('exit', () => {
      for (const home of createdTmpHomes) {
        try {
          fs.rmSync(home, { recursive: true, force: true })
        } catch {
          // Deliberately best-effort: failing to tidy a temp dir must not turn
          // a green test run into a non-zero exit.
        }
      }
    })
  }
  createdTmpHomes.push(dir)
  return dir
}
/**
 * Write a marker file for `home` in the two-line layout these tests feed to
 * readLiveUpdateMarker: the pid on the first line, the start time on the second.
 *
 * @param {string} home - Directory whose marker file (per markerPath) is written.
 * @param {number|string} pid - Value for the pid line.
 * @param {number|string} startedAtSec - Value for the start-time line; the
 *   callers in this file pass whole seconds.
 */
function writeMarker(home, pid, startedAtSec) {
  const target = markerPath(home)
  const contents = `${pid}\n${startedAtSec}`
  fs.writeFileSync(target, contents)
}
// Injectable stand-ins for the `kill` probe, so tests never signal a real process.
// ALIVE: the probe returns normally, which the tests use to mean "pid is alive".
const ALIVE = () => true
// DEAD: the probe throws an ESRCH-coded error, as for a pid that does not exist.
const DEAD = () => {
  throw Object.assign(new Error('no such process'), { code: 'ESRCH' })
}
test('absent marker => no live update', () => {
  // A freshly created home contains no marker file at all.
  const emptyHome = tmpHome('absent')
  const result = readLiveUpdateMarker(emptyHome, { kill: ALIVE })
  assert.equal(result, null)
})
test('live pid within age ceiling => live update reported', () => {
  const home = tmpHome('live')
  // Fixed clock so the marker's age does not depend on when the test runs.
  const nowMs = 1_000_000_000_000
  const startedAtSec = Math.floor(nowMs / 1000) - 5 // 5s old
  writeMarker(home, 4242, startedAtSec)

  const marker = readLiveUpdateMarker(home, { kill: ALIVE, now: () => nowMs })

  assert.ok(marker, 'a fresh, alive marker is a live update')
  assert.equal(marker.pid, 4242)
  assert.ok(marker.ageMs >= 0 && marker.ageMs < 10_000)
  // Reading a live marker must leave the file in place.
  const markerFile = markerPath(home)
  assert.ok(fs.existsSync(markerFile), 'a live marker is NOT deleted')
})
test('dead pid => no live update and marker is pruned', () => {
  const home = tmpHome('dead')
  // The timestamp is current, so this case exercises the DEAD probe rather than age.
  const startedAtSec = Math.floor(Date.now() / 1000)
  writeMarker(home, 999999, startedAtSec)

  const outcome = readLiveUpdateMarker(home, { kill: DEAD })
  assert.equal(outcome, null)

  const markerStillThere = fs.existsSync(markerPath(home))
  assert.ok(!markerStillThere, 'a dead-pid marker self-heals (deleted)')
})
test('expired marker (past age ceiling) => no live update and pruned', () => {
  const home = tmpHome('expired')
  const nowMs = 1_000_000_000_000
  // Start time sits a full minute beyond the maximum age relative to the fixed clock.
  const startedAtMs = nowMs - UPDATE_MARKER_MAX_AGE_MS - 60_000
  writeMarker(home, 4242, Math.floor(startedAtMs / 1000))

  // Even though the pid is "alive", the marker is too old to trust.
  const outcome = readLiveUpdateMarker(home, { kill: ALIVE, now: () => nowMs })
  assert.equal(outcome, null)

  const markerStillThere = fs.existsSync(markerPath(home))
  assert.ok(!markerStillThere, 'an expired marker self-heals (deleted)')
})
test('malformed marker => no live update and pruned', () => {
  const home = tmpHome('malformed')
  // Neither line is numeric, so there is no usable pid or start time to read.
  fs.writeFileSync(markerPath(home), 'not-a-pid\nnonsense')
  assert.equal(readLiveUpdateMarker(home, { kill: ALIVE }), null)
  // Failure message added so a regression here reads like the other prune checks.
  assert.ok(!fs.existsSync(markerPath(home)), 'a malformed marker self-heals (deleted)')
})
test('isPidAlive: own pid is alive, impossible pid is dead', () => {
  // The process running this test exists, so its own pid must probe as alive.
  assert.equal(isPidAlive(process.pid), true)
  // Values that cannot identify a single real process must all be reported dead.
  const impossiblePids = [-1, 0, NaN]
  for (const pid of impossiblePids) {
    assert.equal(isPidAlive(pid), false)
  }
})
test('isPidAlive: EPERM counts as alive (process owned by another user)', () => {
  // Probe stub that fails with a permission error instead of "no such process".
  const eperm = () => {
    throw Object.assign(new Error('operation not permitted'), { code: 'EPERM' })
  }
  const alive = isPidAlive(4242, eperm)
  assert.equal(alive, true)
})