From 98528c78c1434a093be44760b31d4a81b433cc3e Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Fri, 5 Jun 2026 08:33:53 -0500 Subject: [PATCH] fix(desktop/windows): stop racing our own backend during in-app update (#39828) * fix(desktop/windows): stop racing our own backend during in-app update The Windows in-app update (Update button -> hermes-setup.exe --update handoff) bricked because it raced a still-locked hermes.exe: the desktop quit fire-and-forget without reaping its backend child + grandchildren, so when the updater ran `hermes update`, the venv shim was still open. The quarantine rename then failed, uv's `pip install -e .` hit "Access is denied", the git path bailed to a full ZIP re-download, and the deps still couldn't write the locked shim -- leaving a half-applied install. macOS is fine because it never blocks REPLACE on a running executable. Three coordinated fixes restore Mac-style parity (click Update -> progress -> relaunch, no terminal): A. Desktop (main.cjs): before spawning the updater, releaseBackendLockForUpdate() tree-kills the primary + pool backends (taskkill /T /F on Windows, to catch REPL/pty/gateway grandchildren that SIGTERM misses) and polls the venv shim until it is actually writable (bounded 15s) -- so the lock is gone before we hand off. Also fixes resolveHermesCliBinary to use venv\Scripts\hermes.exe on Windows. B. Updater (update.rs): wait_for_venv_free no longer "proceeds anyway" on timeout -- it force-kills any lingering hermes.exe (excluding itself) and re-checks, so a straggler can't doom the install. C. Updater (update.rs): pass --force to `hermes update`. By contract the desktop has exited + waited, and the wait force-kills stragglers, so the running-exe guard would only produce a false "Hermes is still running" dead-end. Verified: node --check on main.cjs, cargo check on the updater (clean), and the Windows-gated taskkill body type-checks standalone. Field repro: ryanc's update.log (manual + handoff both hit the same lock cascade). * review: scope backend kill+wait to Windows; drop meaningless POSIX pgid kill --- .../src-tauri/src/update.rs | 68 ++++++++++- apps/desktop/electron/main.cjs | 111 ++++++++++++++++++ 2 files changed, 178 insertions(+), 1 deletion(-) diff --git a/apps/bootstrap-installer/src-tauri/src/update.rs b/apps/bootstrap-installer/src-tauri/src/update.rs index c84f18965bb..bd0595af2f8 100644 --- a/apps/bootstrap-installer/src-tauri/src/update.rs +++ b/apps/bootstrap-installer/src-tauri/src/update.rs @@ -171,6 +171,13 @@ async fn run_update(app: AppHandle) -> Result<()> { let child_env = update_child_env(&install_root); let mut update_args: Vec = vec!["update".into(), "--yes".into(), "--gateway".into()]; + // --force skips `hermes update`'s Windows running-exe guard (which would + // `sys.exit(2)` and dead-end the handoff). By contract the desktop has + // already exited and waited for the venv shim to unlock before launching + // us, and wait_for_venv_free below force-kills any straggler — so by the + // time `hermes update` runs there is no legitimate hermes.exe to protect, + // and the guard would only produce a false "Hermes is still running" stop. + update_args.push("--force".into()); update_args.push("--branch".into()); update_args.push(update_branch); @@ -366,18 +373,77 @@ async fn wait_for_venv_free(install_root: &Path, app: &AppHandle) { return; } if Instant::now() >= deadline { + // Last resort: a backend hermes.exe (or a grandchild it spawned) + // is still holding the shim. The desktop should have reaped its + // tree before handing off, but SIGTERM races / detached + // grandchildren / AV handles can leave a straggler. Rather than + // "proceed anyway" straight into uv's "Access is denied", force-kill + // every hermes.exe except ourselves, then give the OS a beat to + // unload the image. emit_log( app, Some("update"), LogStream::Stdout, - "[update] timed out waiting for Hermes to exit; proceeding anyway", + "[update] Hermes still holding the venv shim; force-killing stragglers…", ); + force_kill_other_hermes(); + tokio::time::sleep(Duration::from_millis(800)).await; + if !is_locked(&shim) { + emit_log( + app, + Some("update"), + LogStream::Stdout, + "[update] venv shim freed after force-kill", + ); + } else { + emit_log( + app, + Some("update"), + LogStream::Stdout, + "[update] venv shim still locked; proceeding (--force + quarantine will handle it)", + ); + } return; } tokio::time::sleep(DESKTOP_EXIT_POLL).await; } } +/// Force-kill any `hermes.exe` other than this process. Windows-only; a no-op +/// elsewhere (POSIX has no mandatory-lock contention). We can't selectively +/// target "the backend" by PID here — the desktop already exited and we never +/// knew its children — so we kill the whole `hermes.exe` image tree via +/// taskkill, excluding our own PID. +/// +/// Safe w.r.t. our own update child: this runs inside `wait_for_venv_free`, +/// which completes BEFORE we spawn `venv\Scripts\hermes.exe update`. At this +/// point no update-driven hermes.exe exists yet, so the only hermes.exe images +/// are stragglers from the old desktop — exactly what we want gone. (`/FI PID +/// ne ` also spares this Tauri process, though it isn't named +/// hermes.exe.) +fn force_kill_other_hermes() { + if !cfg!(target_os = "windows") { + return; + } + #[cfg(target_os = "windows")] + { + let my_pid = std::process::id(); + // /FI excludes our own PID; /T kills the tree; /F forces. + let _ = std::process::Command::new("taskkill") + .args([ + "/F", + "/T", + "/IM", + "hermes.exe", + "/FI", + &format!("PID ne {my_pid}"), + ]) + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status(); + } +} + /// Best-effort lock probe: try to open the file for read+write. On Windows an /// exclusively-held running .exe refuses the open with a sharing violation. /// On Unix this almost always succeeds (no mandatory locking), which is fine — diff --git a/apps/desktop/electron/main.cjs b/apps/desktop/electron/main.cjs index 583a316dcbe..0234dea340b 100644 --- a/apps/desktop/electron/main.cjs +++ b/apps/desktop/electron/main.cjs @@ -1313,6 +1313,111 @@ function resolveUpdaterBinary() { return fileExists(candidate) ? candidate : null } +// Path to the venv shim whose lock decides whether `hermes update` can write +// fresh entry points. On Windows this is the file the running backend +// `hermes.exe` holds open; on POSIX it's never mandatory-locked. +function venvHermesShimPath(updateRoot) { + return IS_WINDOWS + ? path.join(updateRoot, 'venv', 'Scripts', 'hermes.exe') + : path.join(updateRoot, 'venv', 'bin', 'hermes') +} + +// Best-effort lock probe mirroring the Rust updater's is_locked(): a running +// .exe on Windows refuses an O_RDWR open with a sharing violation. On POSIX +// this practically always succeeds (no mandatory locking), so it returns false +// — correct, since the shim-contention brick is Windows-only. +function isShimLocked(shimPath) { + if (!IS_WINDOWS) return false + let fd + try { + fd = fs.openSync(shimPath, 'r+') + return false + } catch (err) { + // ENOENT ⇒ not there ⇒ nothing locking it. Anything else (EBUSY/EPERM/ + // EACCES) on Windows means a live handle holds it. + return err && err.code !== 'ENOENT' + } finally { + if (fd !== undefined) { + try { + fs.closeSync(fd) + } catch { + void 0 + } + } + } +} + +// Force-kill the entire process TREE rooted at each PID. Node's child.kill() +// only signals the direct child, so on Windows a backend `hermes.exe` that +// spawned its own grandchildren (a `hermes` REPL, a pty terminal session, the +// gateway) would survive and keep the venv shim locked. taskkill /T /F reaps +// the whole tree synchronously. Windows-only: this is called solely from the +// Windows shim-unlock path, and the backend is NOT spawned detached (so it's +// not a process-group leader — a POSIX negative-pgid kill would be meaningless +// here anyway). POSIX teardown stays with the existing before-quit SIGTERM. +function forceKillProcessTree(pid) { + if (!IS_WINDOWS) return + if (!Number.isInteger(pid) || pid <= 0) return + try { + execFileSync('taskkill', ['/PID', String(pid), '/T', '/F'], { stdio: 'ignore' }) + } catch { + // Already gone, or no permission — best effort; the unlock wait below is + // the real gate. + } +} + +// Before handing off the update on Windows, the desktop MUST stop every backend +// it spawned and WAIT for the venv shim to actually unlock. The old code did +// `hermesProcess.kill('SIGTERM')` + `app.quit()` fire-and-forget: SIGTERM on +// Windows doesn't reap the backend's grandchildren, and quit didn't wait for +// teardown, so the updater raced a still-locked `hermes.exe`, the quarantine +// rename failed, uv's `pip install` hit "Access is denied", and the git path +// bailed into a full ZIP re-download that ALSO couldn't write the locked shim — +// a half-applied install (ryanc's update.log). Here we tree-kill the primary + +// pool backends and poll the shim until it's writable (or a bounded timeout), +// so by the time we spawn the updater the lock is genuinely gone. +// +// Windows-only: the venv-shim mandatory lock is a Windows phenomenon. On +// macOS/Linux there's no REPLACE-on-running-exe block, the existing before-quit +// SIGTERM + app.quit() teardown already works (the macOS path is flawless), and +// aggressively SIGKILL-ing the backend here would be an untested behavior change +// for no benefit. So we no-op off Windows and leave that path exactly as it was. +async function releaseBackendLockForUpdate(updateRoot) { + if (!IS_WINDOWS) return { unlocked: true } + + // Collect every backend PID the desktop owns: primary window backend + pool. + const pids = [] + if (hermesProcess && Number.isInteger(hermesProcess.pid)) pids.push(hermesProcess.pid) + for (const entry of backendPool.values()) { + if (entry.process && Number.isInteger(entry.process.pid)) pids.push(entry.process.pid) + } + + // Graceful first (lets Python flush), then tree-kill to catch grandchildren. + if (hermesProcess && !hermesProcess.killed) { + try { + hermesProcess.kill('SIGTERM') + } catch { + void 0 + } + } + stopAllPoolBackends() + for (const pid of pids) forceKillProcessTree(pid) + + const shim = venvHermesShimPath(updateRoot) + const deadlineMs = Date.now() + 15000 + while (Date.now() < deadlineMs) { + if (!isShimLocked(shim)) { + rememberLog('[updates] venv shim unlocked; safe to hand off the update') + return { unlocked: true } + } + await new Promise(r => setTimeout(r, 300)) + } + // Timed out: the updater's own wait_for_venv_free + force-kill is the second + // line of defense, and we pass --force so the guard won't dead-end. Log it. + rememberLog('[updates] venv shim still locked after 15s; handing off anyway (updater will force)') + return { unlocked: false } +} + // applyUpdates — hand off to the installer's --update flow, then exit. // // The desktop is a pure consumer: it does NOT git pull / pip install / rebuild @@ -1379,6 +1484,12 @@ async function applyUpdates(opts = {}) { } const venvBin = path.join(updateRoot, 'venv', IS_WINDOWS ? 'Scripts' : 'bin') + // Stop our own backend(s) and wait for the venv shim to unlock BEFORE we + // spawn the updater. Without this the updater races a still-locked + // hermes.exe (held by the backend child / its grandchildren) and the update + // bricks. See releaseBackendLockForUpdate for the full failure analysis. + await releaseBackendLockForUpdate(updateRoot) + // Detached so the updater outlives this process — it needs us GONE before // `hermes update` will run (the venv shim is locked while we live). const child = spawn(updater, updaterArgs, {