mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 10:52:21 +00:00
When a Windows user relaunches Hermes while an in-app update is still running (the desktop vanished with no progress and looks crashed), the fresh instance spawns its own dashboard backend. That backend re-locks the venv shim, the updater's straggler cleanup (force_kill_other_hermes -> taskkill /F /T /IM hermes.exe) kills it, the launch dies with the 45s "backend didn't come up" timeout, and the user relaunches into the same trap -- an infinite respawn/kill loop (#50238). Root cause: no mutual exclusion between an applying update and a fresh desktop spawning its own local backend. Fix: the updater publishes a HERMES_HOME/.hermes-update-in-progress marker (pid + start time) for the whole run via an RAII drop-guard that removes it on every exit path (success, early return, panic). A freshly-launched desktop checks the marker before spawning its local backend and PARKS until the update finishes -- then brings the backend up itself (it is the surviving instance; the updater's own relaunch hits the single-instance lock and quits). A stale marker (dead pid or past a 20-minute ceiling) is pruned so a crashed updater can never strand future launches. No rogue backend spawns mid-update, so force_kill_other_hermes has nothing legitimate to kill. Marker parse/staleness logic is extracted to update-marker.cjs and unit-tested; the Rust guard has unit tests; the Rust-write <-> JS-read contract is E2E-verified.
92 lines
3.4 KiB
JavaScript
92 lines
3.4 KiB
JavaScript
/**
 * Tests for electron/update-marker.cjs — the in-app update mutual-exclusion
 * marker that prevents a desktop relaunched mid-update from spawning a backend
 * the updater then kills in a loop (#50238).
 *
 * Run with: node --test electron/update-marker.test.cjs
 * (Wired into npm test:desktop:platforms in package.json.)
 *
 * Why this matters: the gate must (a) report a live update only when the
 * updater pid is alive AND the marker is fresh, (b) treat absent/malformed/
 * dead-pid/expired markers as "no live update" so a crashed updater can't
 * strand future launches, and (c) self-heal by deleting a stale marker file.
 */
|
|
|
|
const test = require('node:test')
|
|
const assert = require('node:assert/strict')
|
|
const fs = require('fs')
|
|
const os = require('os')
|
|
const path = require('path')
|
|
|
|
const { markerPath, isPidAlive, readLiveUpdateMarker, UPDATE_MARKER_MAX_AGE_MS } = require('./update-marker.cjs')
|
|
|
|
function tmpHome(tag) {
|
|
const dir = fs.mkdtempSync(path.join(os.tmpdir(), `hermes-marker-${tag}-`))
|
|
return dir
|
|
}
|
|
|
|
/**
 * Write an update marker into `home` in the two-line form these tests use:
 * the pid on the first line and the start time (in seconds) on the second.
 *
 * @param {string} home - directory passed to markerPath() to locate the file.
 * @param {number|string} pid - value written on the first line.
 * @param {number|string} startedAtSec - value written on the second line.
 */
function writeMarker(home, pid, startedAtSec) {
  fs.writeFileSync(markerPath(home), `${pid}\n${startedAtSec}`)
}
|
|
|
|
/**
 * Injected `kill` stand-in that returns normally. A kill that "succeeds"
 * is how the tests simulate a pid that is alive.
 */
const ALIVE = () => {
  return true
}
|
|
/**
 * Injected `kill` stand-in that always throws an Error whose `code` is
 * 'ESRCH'; the tests use it to simulate a pid that no longer exists.
 *
 * @throws {Error} always, with message 'no such process' and code 'ESRCH'.
 */
const DEAD = () => {
  throw Object.assign(new Error('no such process'), { code: 'ESRCH' })
}
|
|
|
|
// With no marker file in the home directory the reader must return null,
// even when the injected kill would report every pid as alive.
test('absent marker => no live update', () => {
  const home = tmpHome('absent')
  assert.equal(readLiveUpdateMarker(home, { kill: ALIVE }), null)
})
|
|
|
|
// A marker whose pid is alive and whose start time is recent must be reported
// as a live update, and the marker file must be left in place.
test('live pid within age ceiling => live update reported', () => {
  const home = tmpHome('live')
  // Fixed clock (ms) injected via `now` so the computed age is deterministic.
  const now = 1_000_000_000_000
  writeMarker(home, 4242, Math.floor(now / 1000) - 5) // 5s old
  const res = readLiveUpdateMarker(home, { kill: ALIVE, now: () => now })
  assert.ok(res, 'a fresh, alive marker is a live update')
  assert.equal(res.pid, 4242)
  assert.ok(res.ageMs >= 0 && res.ageMs < 10_000)
  assert.ok(fs.existsSync(markerPath(home)), 'a live marker is NOT deleted')
})
|
|
|
|
// When the injected kill reports the pid as gone (ESRCH), the reader must
// return null and delete the marker file.
test('dead pid => no live update and marker is pruned', () => {
  const home = tmpHome('dead')
  // Start time is "now", so only the dead pid can make this marker stale.
  writeMarker(home, 999999, Math.floor(Date.now() / 1000))
  assert.equal(readLiveUpdateMarker(home, { kill: DEAD }), null)
  assert.ok(!fs.existsSync(markerPath(home)), 'a dead-pid marker self-heals (deleted)')
})
|
|
|
|
// A marker older than UPDATE_MARKER_MAX_AGE_MS must be rejected and deleted
// regardless of whether its pid is alive.
test('expired marker (past age ceiling) => no live update and pruned', () => {
  const home = tmpHome('expired')
  // Fixed clock (ms) injected via `now`; the marker starts 60s past the ceiling.
  const now = 1_000_000_000_000
  writeMarker(home, 4242, Math.floor((now - UPDATE_MARKER_MAX_AGE_MS - 60_000) / 1000))
  // Even though the pid is "alive", the marker is too old to trust.
  assert.equal(readLiveUpdateMarker(home, { kill: ALIVE, now: () => now }), null)
  assert.ok(!fs.existsSync(markerPath(home)), 'an expired marker self-heals (deleted)')
})
|
|
|
|
// A marker whose contents are not a pid / start-time pair must be treated as
// "no live update" and deleted.
test('malformed marker => no live update and pruned', () => {
  const home = tmpHome('malformed')
  // Written directly (not via writeMarker) to make the garbage explicit.
  fs.writeFileSync(markerPath(home), 'not-a-pid\nnonsense')
  assert.equal(readLiveUpdateMarker(home, { kill: ALIVE }), null)
  assert.ok(!fs.existsSync(markerPath(home)), 'a malformed marker self-heals (deleted)')
})
|
|
|
|
// isPidAlive called without an injected kill: the running test process is
// alive, while -1, 0 and NaN must all be reported as not alive.
test('isPidAlive: own pid is alive, impossible pid is dead', () => {
  assert.equal(isPidAlive(process.pid), true)
  assert.equal(isPidAlive(-1), false)
  assert.equal(isPidAlive(0), false)
  assert.equal(isPidAlive(NaN), false)
})
|
|
|
|
// A kill that fails with EPERM must still count as "alive": per the test
// title, that is the process-owned-by-another-user case.
test('isPidAlive: EPERM counts as alive (process owned by another user)', () => {
  // Stand-in passed as isPidAlive's second argument; always throws EPERM.
  const eperm = () => {
    const err = new Error('operation not permitted')
    err.code = 'EPERM'
    throw err
  }
  assert.equal(isPidAlive(4242, eperm), true)
})
|