From aa53a78d6703ebdb0a9e05bc4d8878c1720930dd Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 13 Jun 2026 05:54:32 -0700 Subject: [PATCH] fix(desktop): hand off Windows bootstrap recovery (#45594) --- .../src-tauri/src/update.rs | 126 +++++++++++++----- apps/desktop/electron/main.cjs | 46 +++++++ .../electron/windows-child-process.test.cjs | 3 + 3 files changed, 145 insertions(+), 30 deletions(-) diff --git a/apps/bootstrap-installer/src-tauri/src/update.rs b/apps/bootstrap-installer/src-tauri/src/update.rs index 658bff6c540..40d136f960d 100644 --- a/apps/bootstrap-installer/src-tauri/src/update.rs +++ b/apps/bootstrap-installer/src-tauri/src/update.rs @@ -3,8 +3,9 @@ //! Driven when the installer is launched as `Hermes-Setup.exe --update` (see //! `AppMode` in lib.rs). The desktop app hands off to us — it exits, then we: //! -//! 1. wait for the old Hermes desktop process to fully exit (so the venv -//! shim is free; otherwise `hermes update` aborts with exit code 2), +//! 1. wait for the old Hermes desktop process to fully exit (so both the +//! venv shim and packaged app.asar are free; otherwise `hermes update` +//! or repair bootstrap can race locked files), //! 2. run `hermes update --yes --gateway` (Python/repo update; this does NOT //! rebuild apps/desktop by design — see cmd_update in hermes_cli/main.py), //! 3. run `hermes desktop --build-only` (the rebuild step update skips), @@ -38,8 +39,8 @@ use crate::events::{BootstrapEvent, LogStream, StageInfo, StageState}; /// hermes_cli/main.py (sys.exit(2)). We surface a targeted message for this. const UPDATE_EXIT_CONCURRENT: i32 = 2; -/// How long to wait for the old desktop process to release the venv shim -/// before giving up and letting `hermes update`'s own guard decide. +/// How long to wait for the old desktop process to release files under the +/// install tree before giving up and letting `hermes update`'s own guard decide. const DESKTOP_EXIT_WAIT: Duration = Duration::from_secs(20); const DESKTOP_EXIT_POLL: Duration = Duration::from_millis(500); @@ -150,8 +151,10 @@ async fn run_update(app: AppHandle) -> Result<()> { // ---- pre-step: wait for the old desktop to die ----------------------- // The desktop exec'd us then called app.exit(), but process teardown is // async on Windows. If it still holds the venv shim, `hermes update` - // aborts with exit 2. Give it a bounded window to clear. - wait_for_venv_free(&install_root, &app).await; + // aborts with exit 2. If it still holds the packaged app.asar, + // install.ps1's repair/re-clone path cannot move/remove the install tree. + // Give both handles a bounded window to clear. + wait_for_install_locks_free(&install_root, &app, "update").await; // ---- stage 1: hermes update ----------------------------------------- // Pass --branch so `hermes update` targets the branch this installer was @@ -173,8 +176,8 @@ async fn run_update(app: AppHandle) -> Result<()> { vec!["update".into(), "--yes".into(), "--gateway".into()]; // --force skips `hermes update`'s Windows running-exe guard (which would // `sys.exit(2)` and dead-end the handoff). By contract the desktop has - // already exited and waited for the venv shim to unlock before launching - // us, and wait_for_venv_free below force-kills any straggler — so by the + // already exited and waited for the install locks to clear before launching + // us, and wait_for_install_locks_free below force-kills any straggler — so by the // time `hermes update` runs there is no legitimate hermes.exe to protect, // and the guard would only produce a false "Hermes is still running" stop. update_args.push("--force".into()); @@ -391,48 +394,57 @@ async fn run_update(app: AppHandle) -> Result<()> { Ok(()) } -/// Poll until the venv shim is no longer locked (Windows) or a bounded timeout -/// elapses. On non-Windows this is a short fixed grace since file locking -/// isn't the failure mode there. -async fn wait_for_venv_free(install_root: &Path, app: &AppHandle) { - let shim = venv_hermes(install_root); +/// Poll until the venv shim AND packaged desktop app bundle are no longer locked +/// (Windows) or a bounded timeout elapses. On non-Windows this is a short fixed +/// grace since file locking isn't the failure mode there. +pub(crate) async fn wait_for_install_locks_free(install_root: &Path, app: &AppHandle, stage: &str) { + let lock_targets = install_lock_probe_paths(install_root); let deadline = Instant::now() + DESKTOP_EXIT_WAIT; - emit_log(app, Some("update"), LogStream::Stdout, "[update] waiting for Hermes to exit…"); + emit_log(app, Some(stage), LogStream::Stdout, "[handoff] waiting for Hermes to exit…"); loop { - if !is_locked(&shim) { + let locked = locked_paths(&lock_targets); + if locked.is_empty() { return; } if Instant::now() >= deadline { - // Last resort: a backend hermes.exe (or a grandchild it spawned) - // is still holding the shim. The desktop should have reaped its - // tree before handing off, but SIGTERM races / detached - // grandchildren / AV handles can leave a straggler. Rather than - // "proceed anyway" straight into uv's "Access is denied", force-kill - // every hermes.exe except ourselves, then give the OS a beat to - // unload the image. + // Last resort: a backend hermes.exe (or the desktop Hermes.exe + // itself) is still holding one of the update-sensitive files. The + // desktop should have reaped its tree before handing off, but + // SIGTERM races / detached grandchildren / AV handles can leave a + // straggler. Rather than "proceed anyway" straight into uv's + // "Access is denied" or install.ps1's locked app.asar failure, + // force-kill every Hermes.exe except ourselves, then give the OS a + // beat to unload the image. emit_log( app, - Some("update"), + Some(stage), LogStream::Stdout, - "[update] Hermes still holding the venv shim; force-killing stragglers…", + &format!( + "[handoff] Hermes still holding install files ({}); force-killing stragglers…", + format_locked_paths(&locked) + ), ); force_kill_other_hermes(); tokio::time::sleep(Duration::from_millis(800)).await; - if !is_locked(&shim) { + let locked_after_kill = locked_paths(&lock_targets); + if locked_after_kill.is_empty() { emit_log( app, - Some("update"), + Some(stage), LogStream::Stdout, - "[update] venv shim freed after force-kill", + "[handoff] install files freed after force-kill", ); } else { emit_log( app, - Some("update"), + Some(stage), LogStream::Stdout, - "[update] venv shim still locked; proceeding (--force + quarantine will handle it)", + &format!( + "[handoff] install files still locked ({}); proceeding (--force + quarantine will handle it)", + format_locked_paths(&locked_after_kill) + ), ); } return; @@ -441,13 +453,44 @@ async fn wait_for_venv_free(install_root: &Path, app: &AppHandle) { } } +fn install_lock_probe_paths(install_root: &Path) -> Vec { + let mut paths = vec![venv_hermes(install_root)]; + paths.extend(desktop_app_payload_paths(install_root)); + paths +} + +fn desktop_app_payload_paths(install_root: &Path) -> Vec { + let release = install_root.join("apps").join("desktop").join("release"); + if cfg!(target_os = "windows") { + vec![ + release.join("win-unpacked").join("resources").join("app.asar"), + release.join("win-arm64-unpacked").join("resources").join("app.asar"), + ] + } else if cfg!(target_os = "macos") { + vec![ + release.join("mac").join("Hermes.app").join("Contents").join("Resources").join("app.asar"), + release.join("mac-arm64").join("Hermes.app").join("Contents").join("Resources").join("app.asar"), + ] + } else { + vec![release.join("linux-unpacked").join("resources").join("app.asar")] + } +} + +fn locked_paths(paths: &[PathBuf]) -> Vec { + paths.iter().filter(|p| is_locked(p)).cloned().collect() +} + +fn format_locked_paths(paths: &[PathBuf]) -> String { + paths.iter().map(|p| p.display().to_string()).collect::>().join(", ") +} + /// Force-kill any `hermes.exe` other than this process. Windows-only; a no-op /// elsewhere (POSIX has no mandatory-lock contention). We can't selectively /// target "the backend" by PID here — the desktop already exited and we never /// knew its children — so we kill the whole `hermes.exe` image tree via /// taskkill, excluding our own PID. /// -/// Safe w.r.t. our own update child: this runs inside `wait_for_venv_free`, +/// Safe w.r.t. our own update child: this runs inside the install-lock wait, /// which completes BEFORE we spawn `venv\Scripts\hermes.exe update`. At this /// point no update-driven hermes.exe exists yet, so the only hermes.exe images /// are stragglers from the old desktop — exactly what we want gone. (`/FI PID @@ -891,6 +934,29 @@ mod tests { assert!(!is_locked(Path::new("/nonexistent/does/not/exist/xyz"))); } + #[test] + fn lock_probe_paths_include_desktop_app_payload() { + let root = Path::new("/x/hermes-agent"); + let probes = install_lock_probe_paths(root); + + assert!( + probes.iter().any(|p| p == &venv_hermes(root)), + "venv shim remains part of the update lock probe" + ); + assert!( + probes.iter().any(|p| p.ends_with(Path::new("resources/app.asar"))), + "packaged app.asar must be probed so repair/re-clone waits for the old desktop to exit" + ); + } + + #[test] + fn locked_paths_ignores_missing_payloads() { + let root = Path::new("/nonexistent/hermes-agent"); + let probes = install_lock_probe_paths(root); + + assert!(locked_paths(&probes).is_empty()); + } + #[test] fn parses_update_branch_from_space_or_equals_args() { assert_eq!( diff --git a/apps/desktop/electron/main.cjs b/apps/desktop/electron/main.cjs index 8286630b954..1d04aca9555 100644 --- a/apps/desktop/electron/main.cjs +++ b/apps/desktop/electron/main.cjs @@ -1835,6 +1835,44 @@ async function applyUpdates(opts = {}) { } } +async function handOffWindowsBootstrapRecovery(reason) { + if (!IS_WINDOWS || !IS_PACKAGED) return false + + const updater = resolveUpdaterBinary() + if (!updater) return false + + const updateRoot = resolveUpdateRoot() + const { branch: configuredBranch } = readDesktopUpdateConfig() + const branch = directoryExists(path.join(updateRoot, '.git')) + ? await resolveHealedBranch(updateRoot, configuredBranch || DEFAULT_UPDATE_BRANCH) + : configuredBranch || DEFAULT_UPDATE_BRANCH + const venvBin = path.join(updateRoot, 'venv', IS_WINDOWS ? 'Scripts' : 'bin') + const venvHermes = path.join(venvBin, IS_WINDOWS ? 'hermes.exe' : 'hermes') + const updaterArgs = fileExists(venvHermes) ? ['--update', '--branch', branch] : ['--repair', '--branch', branch] + + await releaseBackendLockForUpdate(updateRoot) + + const child = spawn(updater, updaterArgs, { + cwd: HERMES_HOME, + env: { + ...process.env, + HERMES_HOME, + PATH: [path.join(HERMES_HOME, 'node', 'bin'), venvBin, process.env.PATH].filter(Boolean).join(path.delimiter) + }, + detached: true, + stdio: 'ignore', + windowsHide: false + }) + child.unref() + + rememberLog(`[bootstrap] handed off ${reason} recovery to updater: ${updater} ${updaterArgs.join(' ')}; exiting desktop to release app.asar`) + setTimeout(() => { + app.quit() + }, 600) + + return true +} + // Resolve the hermes CLI to drive an in-app update: prefer the venv shim in // the install we're updating, fall back to `hermes` on PATH. function resolveHermesCliBinary(updateRoot) { @@ -2432,6 +2470,14 @@ async function ensureRuntime(backend) { if (backend.kind === 'bootstrap-needed') { rememberLog('[bootstrap] no Hermes install found; starting first-launch bootstrap') + if (await handOffWindowsBootstrapRecovery('bootstrap-needed')) { + const handoffError = new Error('Hermes recovery was handed off to Hermes Setup. The desktop will restart when recovery completes.') + handoffError.isBootstrapFailure = true + handoffError.bootstrapHandedOff = true + bootstrapFailure = handoffError + throw handoffError + } + // Eagerly flip the bootstrap UI state to 'active' so the renderer // shows the install overlay BEFORE the runner finishes fetching the // manifest (which on slow networks can take tens of seconds and would diff --git a/apps/desktop/electron/windows-child-process.test.cjs b/apps/desktop/electron/windows-child-process.test.cjs index 92989c978bb..4239da56e23 100644 --- a/apps/desktop/electron/windows-child-process.test.cjs +++ b/apps/desktop/electron/windows-child-process.test.cjs @@ -42,6 +42,9 @@ test('intentional or interactive desktop child processes stay documented', () => const source = readElectronFile('main.cjs') assert.match(source, /windowsHide: false/) + assert.match(source, /handOffWindowsBootstrapRecovery/) + assert.match(source, /'--repair', '--branch'/) + assert.match(source, /'--update', '--branch'/) assert.match(source, /nodePty\.spawn\(command, args/) assert.match(source, /spawn\('cmd\.exe', \['\/c', 'start'/) })