fix(desktop/windows): stop in-app update from cascading into a backend restart loop (#50381)

When a Windows user relaunches Hermes while an in-app update is still
running (the desktop vanished with no progress and looks crashed), the
fresh instance spawns its own dashboard backend. That backend re-locks
the venv shim, the updater's straggler cleanup (force_kill_other_hermes
-> taskkill /F /T /IM hermes.exe) kills it, the launch dies with the 45s
"backend didn't come up" timeout, and the user relaunches into the same
trap -- an infinite respawn/kill loop (#50238).

Root cause: no mutual exclusion between an applying update and a fresh
desktop spawning its own local backend.

Fix: the updater publishes a HERMES_HOME/.hermes-update-in-progress
marker (pid + start time) for the whole run via an RAII drop-guard that
removes it on every exit path (success, early return, panic). A
freshly-launched desktop checks the marker before spawning its local
backend and PARKS until the update finishes -- then brings the backend
up itself (it is the surviving instance; the updater's own relaunch hits
the single-instance lock and quits). A stale marker (dead pid or past a
20-minute ceiling) is pruned so a crashed updater can never strand
future launches. No rogue backend spawns mid-update, so
force_kill_other_hermes has nothing legitimate to kill.

Marker parse/staleness logic is extracted to update-marker.cjs and
unit-tested; the Rust guard has unit tests; the Rust-write <-> JS-read
contract is E2E-verified.
This commit is contained in:
Teknium 2026-06-21 13:10:32 -07:00 committed by GitHub
parent 09a96ba0f6
commit f72690825e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 354 additions and 6 deletions

View file

@ -77,6 +77,19 @@ pub fn installer_dest() -> PathBuf {
hermes_home().join(name)
}
/// Marker the updater writes for the duration of an in-app update and removes
/// when it finishes (see update.rs `UpdateMarkerGuard`). A freshly-launched
/// desktop checks this before spawning its own local backend: spawning one
/// mid-update re-locks the venv shim and triggers `force_kill_other_hermes`,
/// which then kills that legitimate backend in a respawn loop (#50238).
///
/// Lives directly under HERMES_HOME (same rationale as `installer_dest`) so the
/// Electron desktop — which resolves HERMES_HOME identically and pins it into
/// the updater's env — agrees on the exact path.
pub fn update_in_progress_marker() -> PathBuf {
hermes_home().join(".hermes-update-in-progress")
}
/// Copy the currently-running installer binary to `installer_dest()` so it's
/// available for future `--update` runs and shortcut launches.
///

View file

@ -103,9 +103,61 @@ pub async fn start_update(app: AppHandle) -> Result<(), String> {
Ok(())
}
/// RAII guard that owns the "update in progress" marker (see
/// `paths::update_in_progress_marker`). Created at the top of `run_update`;
/// its `Drop` removes the marker on EVERY exit path — success, early
/// `return Err`, or a panic that unwinds through `run_update` — so a crashed
/// or aborted updater can never permanently strand the marker and block
/// future desktop launches. The marker payload is `{pid}\n{started_at_unix}`
/// so the desktop's launch gate can detect a stale marker (dead PID / past a
/// hard ceiling) and self-heal rather than wait forever.
struct UpdateMarkerGuard {
path: PathBuf,
}
impl UpdateMarkerGuard {
/// Write the marker. Best-effort: a write failure must NOT abort the
/// update (the gate degrades to "no marker => proceed", i.e. exactly the
/// pre-fix behavior), so we log and carry on with a guard that still
/// attempts cleanup of whatever may exist at the path.
fn acquire(path: PathBuf) -> Self {
let pid = std::process::id();
let started_at = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_secs())
.unwrap_or(0);
if let Some(parent) = path.parent() {
let _ = std::fs::create_dir_all(parent);
}
if let Err(err) = std::fs::write(&path, format!("{pid}\n{started_at}")) {
tracing::warn!(?path, %err, "could not write update-in-progress marker");
}
Self { path }
}
}
impl Drop for UpdateMarkerGuard {
fn drop(&mut self) {
if let Err(err) = std::fs::remove_file(&self.path) {
if err.kind() != std::io::ErrorKind::NotFound {
tracing::warn!(path = ?self.path, %err, "could not remove update-in-progress marker");
}
}
}
}
async fn run_update(app: AppHandle) -> Result<()> {
let hermes_home = crate::paths::hermes_home();
let install_root = hermes_home.join("hermes-agent");
// Mutual exclusion (#50238): publish an "update in progress" marker for the
// entire duration of this update. A desktop instance the user relaunches
// mid-update consults this before spawning its own local backend — without
// it, that backend re-locks the venv shim, our `force_kill_other_hermes`
// straggler-cleanup kills it, and the relaunch/kill cycle loops. The guard
// removes the marker on every exit path (incl. early returns / panics).
let _update_marker = UpdateMarkerGuard::acquire(crate::paths::update_in_progress_marker());
let update_branch = update_branch_from_args(std::env::args().skip(1))
.or_else(|| option_env_string("BUILD_PIN_BRANCH"))
.unwrap_or_else(|| "main".to_string());
@ -518,11 +570,13 @@ fn format_locked_paths(paths: &[PathBuf]) -> String {
/// taskkill, excluding our own PID.
///
/// Safe w.r.t. our own update child: this runs inside the install-lock wait,
/// which completes BEFORE we spawn `venv\Scripts\hermes.exe update`. At this
/// point no update-driven hermes.exe exists yet, so the only hermes.exe images
/// are stragglers from the old desktop — exactly what we want gone. (`/FI PID
/// ne <self>` also spares this Tauri process, though it isn't named
/// hermes.exe.)
/// which completes BEFORE we spawn `venv\Scripts\hermes.exe update`. And a
/// desktop the user relaunches mid-update will NOT have spawned a backend —
/// `startHermes()` in the desktop gates local-backend startup on our
/// update-in-progress marker and parks until we finish (#50238). So the only
/// hermes.exe images here are stragglers from the old desktop — exactly what
/// we want gone. (`/FI PID ne <self>` also spares this Tauri process, though it
/// isn't named hermes.exe.)
fn force_kill_other_hermes() {
if !cfg!(target_os = "windows") {
return;
@ -992,6 +1046,48 @@ mod tests {
assert!(locked_paths(&probes).is_empty());
}
#[test]
fn update_marker_guard_writes_then_removes_on_drop() {
let dir = unique_tmp_dir("marker-guard");
std::fs::create_dir_all(&dir).unwrap();
let marker = dir.join(".hermes-update-in-progress");
{
let _g = UpdateMarkerGuard::acquire(marker.clone());
assert!(marker.exists(), "marker must exist while the guard is held");
let body = std::fs::read_to_string(&marker).unwrap();
let pid_line = body.lines().next().unwrap();
assert_eq!(
pid_line.trim().parse::<u32>().unwrap(),
std::process::id(),
"marker records our pid so the desktop can probe liveness"
);
assert_eq!(body.lines().count(), 2, "marker is pid + started_at lines");
}
assert!(
!marker.exists(),
"Drop must remove the marker on every exit path (incl. early return / panic unwind)"
);
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn update_marker_guard_drop_is_quiet_when_already_gone() {
let dir = unique_tmp_dir("marker-guard-gone");
std::fs::create_dir_all(&dir).unwrap();
let marker = dir.join(".hermes-update-in-progress");
let guard = UpdateMarkerGuard::acquire(marker.clone());
// Simulate an external cleanup (e.g. the desktop pruned a marker it
// judged stale) before our guard drops — Drop must not panic.
std::fs::remove_file(&marker).unwrap();
drop(guard);
assert!(!marker.exists());
let _ = std::fs::remove_dir_all(&dir);
}
#[test]
fn parses_update_branch_from_space_or_equals_args() {
assert_eq!(

View file

@ -43,6 +43,7 @@ const { fetchMarketplaceThemes, searchMarketplaceThemes } = require('./vscode-ma
const { buildDesktopBackendEnv, normalizeHermesHomeRoot } = require('./backend-env.cjs')
const { readWindowsUserEnvVar } = require('./windows-user-env.cjs')
const { readDirForIpc } = require('./fs-read-dir.cjs')
const { readLiveUpdateMarker } = require('./update-marker.cjs')
const { gitRootForIpc } = require('./git-root.cjs')
const { worktreesForIpc } = require('./git-worktrees.cjs')
const { OFFICIAL_REPO_HTTPS_URL, isOfficialSshRemote } = require('./update-remote.cjs')
@ -1110,6 +1111,51 @@ function directoryExists(filePath) {
}
}
// --- in-app update mutual exclusion (#50238) -------------------------------
// The Tauri updater writes HERMES_HOME/.hermes-update-in-progress for the whole
// duration of an `--update` run (see update.rs UpdateMarkerGuard). If the user
// relaunches the desktop mid-update — because the window vanished with no
// progress and looks crashed — a fresh instance must NOT spawn its own local
// backend: that backend re-locks the venv shim, the updater's straggler cleanup
// (`force_kill_other_hermes`, taskkill /IM hermes.exe) kills it, the launch
// fails with the 45s "backend didn't come up" error, and the relaunch/kill
// cycle loops. Instead the fresh instance parks until the update finishes, then
// brings the backend up itself (it is the surviving instance — the updater's
// own relaunch hits our single-instance lock and quits). Marker parsing +
// staleness self-heal live in update-marker.cjs (unit-tested).
// How long we'll park the launch waiting for a live update to finish before
// giving up and starting the backend anyway (belt-and-suspenders alongside the
// marker's own age ceiling; covers a stuck-but-alive updater).
const UPDATE_WAIT_TIMEOUT_MS = 20 * 60 * 1000
const UPDATE_WAIT_POLL_MS = 1000
// Block until no live update is in progress (or we hit the wait timeout).
// Emits a boot-progress phase so the renderer shows "Update in progress…"
// rather than a frozen splash. Returns true if it parked at all.
async function waitForUpdateToFinish() {
let marker = readLiveUpdateMarker(HERMES_HOME)
if (!marker) return false
rememberLog(`[updates] update in progress (pid=${marker.pid}); deferring backend start until it finishes`)
const deadline = Date.now() + UPDATE_WAIT_TIMEOUT_MS
while (marker && Date.now() < deadline) {
await advanceBootProgress(
'backend.update-wait',
'An update is finishing — Hermes will start automatically when it completes…',
12
)
await new Promise(r => setTimeout(r, UPDATE_WAIT_POLL_MS))
marker = readLiveUpdateMarker(HERMES_HOME)
}
if (marker) {
rememberLog('[updates] update still in progress after wait timeout; starting backend anyway')
} else {
rememberLog('[updates] update finished; proceeding with backend start')
}
return true
}
function unpackedPathFor(filePath) {
return filePath.replace(/app\.asar(?=$|[\\/])/, 'app.asar.unpacked')
}
@ -4910,6 +4956,14 @@ async function startHermes() {
}
}
// Mutual exclusion with an in-app update (#50238). If this instance was
// relaunched while the Tauri updater is still applying an update, spawning
// a local backend now re-locks the venv shim and gets killed by the
// updater's straggler cleanup — looping. Park until the update finishes (or
// is detected stale), THEN start the backend. Local backends only; remote
// connections returned above and never touch the install tree.
await waitForUpdateToFinish()
const token = crypto.randomBytes(32).toString('base64url')
// --port 0: the OS assigns an ephemeral port; the child announces it on stdout.
const dashboardArgs = ['dashboard', '--no-open', '--host', '127.0.0.1', '--port', '0']

View file

@ -0,0 +1,93 @@
/**
* In-app update mutual-exclusion marker (#50238).
*
* The Tauri updater writes HERMES_HOME/.hermes-update-in-progress for the whole
* duration of an `--update` run (see apps/bootstrap-installer/src-tauri/src/
* update.rs `UpdateMarkerGuard`). The marker body is two lines: the updater's
* pid and the unix-seconds it started.
*
* Why: if the user relaunches the desktop mid-update the window vanished with
* no progress and looks crashed a fresh instance must NOT spawn its own local
* backend. That backend re-locks the venv shim, the updater's straggler cleanup
* (`force_kill_other_hermes`, taskkill /IM hermes.exe) kills it, the launch
* fails with the 45s "backend didn't come up" timeout, and the user relaunches
* into the same trap an infinite respawn/kill loop. The desktop gates local
* backend startup on this marker and parks until the update finishes.
*
* This module holds the PURE, side-effect-light logic (path, pid liveness,
* parse + staleness) so it is unit-testable without booting Electron. The
* polling/boot-progress wrapper lives in main.cjs where the boot-progress and
* log sinks are.
*/
const fs = require('fs')
const path = require('path')
// Even with a live-looking PID, never treat a marker older than this as a live
// update. A full update (git pull + pip + desktop rebuild) is minutes, not tens
// of minutes; past this the marker is almost certainly stale (e.g. the OS
// recycled the pid onto an unrelated process), so the gate self-heals.
const UPDATE_MARKER_MAX_AGE_MS = 20 * 60 * 1000
function markerPath(hermesHome) {
return path.join(hermesHome, '.hermes-update-in-progress')
}
// True only if a host process with this pid is currently alive. Signal 0 does
// not deliver a signal — it just probes existence/permission. ESRCH => dead;
// EPERM => alive but owned by another user (still "alive" for our purposes).
// Injectable `kill` keeps it unit-testable.
function isPidAlive(pid, kill = process.kill.bind(process)) {
if (!Number.isInteger(pid) || pid <= 0) return false
try {
kill(pid, 0)
return true
} catch (err) {
return Boolean(err && err.code === 'EPERM')
}
}
/**
* Read + interpret the marker.
*
* Returns `{ pid, ageMs }` only when an update is GENUINELY still running
* (parseable pid that is alive, within the age ceiling). Returns `null` for
* every "no live update" case absent, unreadable, malformed, dead pid, or
* past the ceiling and, when a stale marker file exists, deletes it so it
* cannot strand future launches.
*
* Pure-ish: file I/O against the given path, plus an injectable pid probe and
* clock for tests.
*/
function readLiveUpdateMarker(hermesHome, { kill, now = Date.now, maxAgeMs = UPDATE_MARKER_MAX_AGE_MS } = {}) {
const file = markerPath(hermesHome)
let raw
try {
raw = fs.readFileSync(file, 'utf8')
} catch {
return null // absent or unreadable => no live update
}
const [pidLine, startedLine] = String(raw).split('\n')
const pid = Number.parseInt((pidLine || '').trim(), 10)
const startedAt = Number.parseInt((startedLine || '').trim(), 10)
const ageMs = Number.isFinite(startedAt) ? now() - startedAt * 1000 : Infinity
const alive = Number.isInteger(pid) && isPidAlive(pid, kill)
if (!alive || ageMs > maxAgeMs) {
try {
fs.unlinkSync(file)
} catch {
void 0
}
return null
}
return { pid, ageMs }
}
module.exports = {
UPDATE_MARKER_MAX_AGE_MS,
markerPath,
isPidAlive,
readLiveUpdateMarker
}

View file

@ -0,0 +1,92 @@
/**
* Tests for electron/update-marker.cjs the in-app update mutual-exclusion
* marker that prevents a desktop relaunched mid-update from spawning a backend
* the updater then kills in a loop (#50238).
*
* Run with: node --test electron/update-marker.test.cjs
* (Wired into npm test:desktop:platforms in package.json.)
*
* Why this matters: the gate must (a) report a live update only when the
* updater pid is alive AND the marker is fresh, (b) treat absent/malformed/
* dead-pid/expired markers as "no live update" so a crashed updater can't
* strand future launches, and (c) self-heal by deleting a stale marker file.
*/
const test = require('node:test')
const assert = require('node:assert/strict')
const fs = require('fs')
const os = require('os')
const path = require('path')
const { markerPath, isPidAlive, readLiveUpdateMarker, UPDATE_MARKER_MAX_AGE_MS } = require('./update-marker.cjs')
function tmpHome(tag) {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), `hermes-marker-${tag}-`))
return dir
}
function writeMarker(home, pid, startedAtSec) {
fs.writeFileSync(markerPath(home), `${pid}\n${startedAtSec}`)
}
const ALIVE = () => true // injected kill that "succeeds" => pid alive
const DEAD = () => {
const err = new Error('no such process')
err.code = 'ESRCH'
throw err
}
test('absent marker => no live update', () => {
const home = tmpHome('absent')
assert.equal(readLiveUpdateMarker(home, { kill: ALIVE }), null)
})
test('live pid within age ceiling => live update reported', () => {
const home = tmpHome('live')
const now = 1_000_000_000_000
writeMarker(home, 4242, Math.floor(now / 1000) - 5) // 5s old
const res = readLiveUpdateMarker(home, { kill: ALIVE, now: () => now })
assert.ok(res, 'a fresh, alive marker is a live update')
assert.equal(res.pid, 4242)
assert.ok(res.ageMs >= 0 && res.ageMs < 10_000)
assert.ok(fs.existsSync(markerPath(home)), 'a live marker is NOT deleted')
})
test('dead pid => no live update and marker is pruned', () => {
const home = tmpHome('dead')
writeMarker(home, 999999, Math.floor(Date.now() / 1000))
assert.equal(readLiveUpdateMarker(home, { kill: DEAD }), null)
assert.ok(!fs.existsSync(markerPath(home)), 'a dead-pid marker self-heals (deleted)')
})
test('expired marker (past age ceiling) => no live update and pruned', () => {
const home = tmpHome('expired')
const now = 1_000_000_000_000
writeMarker(home, 4242, Math.floor((now - UPDATE_MARKER_MAX_AGE_MS - 60_000) / 1000))
// Even though the pid is "alive", the marker is too old to trust.
assert.equal(readLiveUpdateMarker(home, { kill: ALIVE, now: () => now }), null)
assert.ok(!fs.existsSync(markerPath(home)), 'an expired marker self-heals (deleted)')
})
test('malformed marker => no live update and pruned', () => {
const home = tmpHome('malformed')
fs.writeFileSync(markerPath(home), 'not-a-pid\nnonsense')
assert.equal(readLiveUpdateMarker(home, { kill: ALIVE }), null)
assert.ok(!fs.existsSync(markerPath(home)))
})
test('isPidAlive: own pid is alive, impossible pid is dead', () => {
assert.equal(isPidAlive(process.pid), true)
assert.equal(isPidAlive(-1), false)
assert.equal(isPidAlive(0), false)
assert.equal(isPidAlive(NaN), false)
})
test('isPidAlive: EPERM counts as alive (process owned by another user)', () => {
const eperm = () => {
const err = new Error('operation not permitted')
err.code = 'EPERM'
throw err
}
assert.equal(isPidAlive(4242, eperm), true)
})

View file

@ -37,7 +37,7 @@
"test:desktop:nsis": "node scripts/test-desktop.mjs nsis",
"test:desktop:existing": "node scripts/test-desktop.mjs existing",
"test:desktop:fresh": "node scripts/test-desktop.mjs fresh",
"test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/windows-user-env.test.cjs",
"test:desktop:platforms": "node --test electron/bootstrap-platform.test.cjs electron/hardening.test.cjs electron/backend-env.test.cjs electron/backend-probes.test.cjs electron/backend-ready.test.cjs electron/bootstrap-runner.test.cjs electron/connection-config.test.cjs electron/dashboard-token.test.cjs electron/gateway-ws-probe.test.cjs electron/oauth-net-request.test.cjs electron/desktop-uninstall.test.cjs electron/session-windows.test.cjs electron/link-title-window.test.cjs electron/workspace-cwd.test.cjs electron/fs-read-dir.test.cjs electron/git-root.test.cjs electron/windows-child-process.test.cjs electron/update-remote.test.cjs electron/update-rebuild.test.cjs electron/update-marker.test.cjs electron/windows-user-env.test.cjs",
"typecheck": "tsc -p . --noEmit",
"lint": "eslint src/ electron/",
"lint:fix": "eslint src/ electron/ --fix",