hermes-agent/apps/desktop/electron/update-marker.cjs
Teknium f72690825e
fix(desktop/windows): stop in-app update from cascading into a backend restart loop (#50381)
When a Windows user relaunches Hermes while an in-app update is still
running (the desktop vanished with no progress and looks crashed), the
fresh instance spawns its own dashboard backend. That backend re-locks
the venv shim, the updater's straggler cleanup (force_kill_other_hermes
-> taskkill /F /T /IM hermes.exe) kills it, the launch dies with the 45s
"backend didn't come up" timeout, and the user relaunches into the same
trap -- an infinite respawn/kill loop (#50238).

Root cause: no mutual exclusion between an applying update and a fresh
desktop spawning its own local backend.

Fix: the updater publishes a HERMES_HOME/.hermes-update-in-progress
marker (pid + start time) for the whole run via an RAII drop-guard that
removes it on every exit path (success, early return, panic). A
freshly-launched desktop checks the marker before spawning its local
backend and PARKS until the update finishes -- then brings the backend
up itself (it is the surviving instance; the updater's own relaunch hits
the single-instance lock and quits). A stale marker (dead pid or past a
20-minute ceiling) is pruned so a crashed updater can never strand
future launches. No rogue backend spawns mid-update, so
force_kill_other_hermes has nothing legitimate to kill.

Marker parse/staleness logic is extracted to update-marker.cjs and
unit-tested; the Rust guard has unit tests; the Rust-write <-> JS-read
contract is E2E-verified.
2026-06-21 13:10:32 -07:00

93 lines
3.4 KiB
JavaScript

/**
* In-app update mutual-exclusion marker (#50238).
*
* The Tauri updater writes HERMES_HOME/.hermes-update-in-progress for the whole
* duration of an `--update` run (see apps/bootstrap-installer/src-tauri/src/
* update.rs `UpdateMarkerGuard`). The marker body is two lines: the updater's
* pid and the unix-seconds it started.
*
* Why: if the user relaunches the desktop mid-update — the window vanished with
* no progress and looks crashed — a fresh instance must NOT spawn its own local
* backend. That backend re-locks the venv shim, the updater's straggler cleanup
* (`force_kill_other_hermes`, taskkill /IM hermes.exe) kills it, the launch
* fails with the 45s "backend didn't come up" timeout, and the user relaunches
* into the same trap — an infinite respawn/kill loop. The desktop gates local
* backend startup on this marker and parks until the update finishes.
*
* This module holds the Electron-free, side-effect-light logic (path, pid
* liveness, parse + staleness) so it is unit-testable without booting
* Electron. Its only side effects are reading the marker file and deleting a
* stale one. The polling/boot-progress wrapper lives in main.cjs where the
* boot-progress and log sinks are.
*/
const fs = require('fs')
const path = require('path')
// Hard ceiling (20 minutes, in milliseconds) on how long a marker is honoured,
// no matter what the pid probe reports. A complete update (git pull + pip +
// desktop rebuild) takes minutes, not tens of minutes, so anything older is
// almost certainly left over -- for instance the OS handed the updater's pid to
// an unrelated process. Enforcing the ceiling lets the gate heal itself.
const UPDATE_MARKER_MAX_AGE_MS = 1000 * 60 * 20
/**
 * Build the location of the update-in-progress marker file.
 *
 * @param {string} hermesHome - Directory the marker lives in.
 * @returns {string} `hermesHome` joined with the marker's file name.
 */
function markerPath(hermesHome) {
  const markerFileName = '.hermes-update-in-progress'
  return path.join(hermesHome, markerFileName)
}
/**
 * Report whether a process with the given pid currently exists.
 *
 * The probe sends signal 0, which delivers nothing and only checks
 * existence/permission. A probe that throws with code `EPERM` means the
 * process exists but belongs to another user, which still counts as alive;
 * any other failure (e.g. `ESRCH`) counts as dead.
 *
 * @param {number} pid - Candidate process id; anything that is not a positive
 *   integer is reported as not alive without probing.
 * @param {Function} [kill] - Probe called as `kill(pid, 0)`; defaults to the
 *   host `process.kill`. Injectable so tests need no real processes.
 * @returns {boolean} `true` when the process is considered alive.
 */
function isPidAlive(pid, kill = process.kill.bind(process)) {
  const plausiblePid = Number.isInteger(pid) && pid > 0
  if (!plausiblePid) {
    return false
  }

  try {
    kill(pid, 0)
  } catch (probeError) {
    // Only "exists but not ours" keeps the pid alive; everything else is dead.
    return Boolean(probeError) && probeError.code === 'EPERM'
  }
  return true
}
/**
 * Read and interpret the update-in-progress marker under `hermesHome`.
 *
 * The marker body is read as two lines: the updater's pid, then the
 * unix-seconds timestamp at which it started.
 *
 * A marker is treated as a live update only when BOTH hold:
 *   - its pid parses to an integer that the pid probe reports alive, and
 *   - its start time lies within `maxAgeMs` of the current clock, in EITHER
 *     direction.
 *
 * The age bound is two-sided on purpose. A start time in the future (wall
 * clock rolled back or reset, or a corrupt timestamp) yields a negative age,
 * which a one-sided "older than the ceiling" test never rejects; combined with
 * a pid that is alive or has been recycled, such a marker would park every
 * launch until the clock caught up. Small skew (within `maxAgeMs`) is still
 * tolerated so a minor clock adjustment during a real update does not discard
 * a live marker.
 *
 * Every other case -- marker absent, unreadable, malformed, dead pid, or
 * outside the age window -- returns `null`. When the file could be read but
 * was judged not live, it is deleted (best effort) so it cannot strand future
 * launches. A file that cannot be read is left in place.
 *
 * @param {string} hermesHome - Directory that holds the marker file.
 * @param {object} [options]
 * @param {Function} [options.kill] - pid probe forwarded to `isPidAlive`;
 *   when omitted, `isPidAlive` uses its own default.
 * @param {function(): number} [options.now=Date.now] - Clock returning epoch
 *   milliseconds; injectable for tests.
 * @param {number} [options.maxAgeMs=UPDATE_MARKER_MAX_AGE_MS] - Maximum
 *   distance, in milliseconds, between the marker's start time and `now()`.
 * @returns {{ pid: number, ageMs: number } | null} The live updater's pid and
 *   the marker's age in milliseconds (slightly negative under tolerated clock
 *   skew), or `null` when no update is live.
 */
function readLiveUpdateMarker(hermesHome, { kill, now = Date.now, maxAgeMs = UPDATE_MARKER_MAX_AGE_MS } = {}) {
  const file = markerPath(hermesHome)

  let raw
  try {
    raw = fs.readFileSync(file, 'utf8')
  } catch {
    return null // absent or unreadable => no live update
  }

  const [pidLine = '', startedLine = ''] = String(raw).split('\n')
  const pid = Number.parseInt(pidLine.trim(), 10)
  const startedAt = Number.parseInt(startedLine.trim(), 10)

  // An unparseable start time is given an infinite age so it always fails the
  // window check below.
  const ageMs = Number.isFinite(startedAt) ? now() - startedAt * 1000 : Infinity
  const alive = Number.isInteger(pid) && isPidAlive(pid, kill)

  // Two-sided bound: too old OR too far in the future both mean "stale".
  const outsideWindow = Math.abs(ageMs) > maxAgeMs

  if (!alive || outsideWindow) {
    try {
      fs.unlinkSync(file)
    } catch {
      // Best-effort prune: the file may already be gone or may not be
      // removable. Either way there is no live update to report.
    }
    return null
  }

  return { pid, ageMs }
}
// Public surface of this module: the marker age ceiling plus the three helpers
// defined above (path builder, pid probe, marker reader).
module.exports = {
UPDATE_MARKER_MAX_AGE_MS,
markerPath,
isPidAlive,
readLiveUpdateMarker
}