mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(desktop/windows): stop racing our own backend during in-app update (#39828)
* fix(desktop/windows): stop racing our own backend during in-app update The Windows in-app update (Update button -> hermes-setup.exe --update handoff) bricked because it raced a still-locked hermes.exe: the desktop quit fire-and-forget without reaping its backend child + grandchildren, so when the updater ran `hermes update`, the venv shim was still open. The quarantine rename then failed, uv's `pip install -e .` hit "Access is denied", the git path bailed to a full ZIP re-download, and the deps still couldn't write the locked shim -- leaving a half-applied install. macOS is fine because it never blocks REPLACE on a running executable. Three coordinated fixes restore Mac-style parity (click Update -> progress -> relaunch, no terminal): A. Desktop (main.cjs): before spawning the updater, releaseBackendLockForUpdate() tree-kills the primary + pool backends (taskkill /T /F on Windows, to catch REPL/pty/gateway grandchildren that SIGTERM misses) and polls the venv shim until it is actually writable (bounded 15s) -- so the lock is gone before we hand off. Also fixes resolveHermesCliBinary to use venv\Scripts\hermes.exe on Windows. B. Updater (update.rs): wait_for_venv_free no longer "proceeds anyway" on timeout -- it force-kills any lingering hermes.exe (excluding itself) and re-checks, so a straggler can't doom the install. C. Updater (update.rs): pass --force to `hermes update`. By contract the desktop has exited + waited, and the wait force-kills stragglers, so the running-exe guard would only produce a false "Hermes is still running" dead-end. Verified: node --check on main.cjs, cargo check on the updater (clean), and the Windows-gated taskkill body type-checks standalone. Field repro: ryanc's update.log (manual + handoff both hit the same lock cascade). * review: scope backend kill+wait to Windows; drop meaningless POSIX pgid kill
This commit is contained in:
parent
d880b5be09
commit
98528c78c1
2 changed files with 178 additions and 1 deletions
|
|
@ -171,6 +171,13 @@ async fn run_update(app: AppHandle) -> Result<()> {
|
|||
let child_env = update_child_env(&install_root);
|
||||
let mut update_args: Vec<String> =
|
||||
vec!["update".into(), "--yes".into(), "--gateway".into()];
|
||||
// --force skips `hermes update`'s Windows running-exe guard (which would
|
||||
// `sys.exit(2)` and dead-end the handoff). By contract the desktop has
|
||||
// already exited and waited for the venv shim to unlock before launching
|
||||
// us, and wait_for_venv_free below force-kills any straggler — so by the
|
||||
// time `hermes update` runs there is no legitimate hermes.exe to protect,
|
||||
// and the guard would only produce a false "Hermes is still running" stop.
|
||||
update_args.push("--force".into());
|
||||
update_args.push("--branch".into());
|
||||
update_args.push(update_branch);
|
||||
|
||||
|
|
@ -366,18 +373,77 @@ async fn wait_for_venv_free(install_root: &Path, app: &AppHandle) {
|
|||
return;
|
||||
}
|
||||
if Instant::now() >= deadline {
|
||||
// Last resort: a backend hermes.exe (or a grandchild it spawned)
|
||||
// is still holding the shim. The desktop should have reaped its
|
||||
// tree before handing off, but SIGTERM races / detached
|
||||
// grandchildren / AV handles can leave a straggler. Rather than
|
||||
// "proceed anyway" straight into uv's "Access is denied", force-kill
|
||||
// every hermes.exe except ourselves, then give the OS a beat to
|
||||
// unload the image.
|
||||
emit_log(
|
||||
app,
|
||||
Some("update"),
|
||||
LogStream::Stdout,
|
||||
"[update] timed out waiting for Hermes to exit; proceeding anyway",
|
||||
"[update] Hermes still holding the venv shim; force-killing stragglers…",
|
||||
);
|
||||
force_kill_other_hermes();
|
||||
tokio::time::sleep(Duration::from_millis(800)).await;
|
||||
if !is_locked(&shim) {
|
||||
emit_log(
|
||||
app,
|
||||
Some("update"),
|
||||
LogStream::Stdout,
|
||||
"[update] venv shim freed after force-kill",
|
||||
);
|
||||
} else {
|
||||
emit_log(
|
||||
app,
|
||||
Some("update"),
|
||||
LogStream::Stdout,
|
||||
"[update] venv shim still locked; proceeding (--force + quarantine will handle it)",
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
tokio::time::sleep(DESKTOP_EXIT_POLL).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Force-kill any `hermes.exe` other than this process. Windows-only; a no-op
|
||||
/// elsewhere (POSIX has no mandatory-lock contention). We can't selectively
|
||||
/// target "the backend" by PID here — the desktop already exited and we never
|
||||
/// knew its children — so we kill the whole `hermes.exe` image tree via
|
||||
/// taskkill, excluding our own PID.
|
||||
///
|
||||
/// Safe w.r.t. our own update child: this runs inside `wait_for_venv_free`,
|
||||
/// which completes BEFORE we spawn `venv\Scripts\hermes.exe update`. At this
|
||||
/// point no update-driven hermes.exe exists yet, so the only hermes.exe images
|
||||
/// are stragglers from the old desktop — exactly what we want gone. (`/FI PID
|
||||
/// ne <self>` also spares this Tauri process, though it isn't named
|
||||
/// hermes.exe.)
|
||||
fn force_kill_other_hermes() {
|
||||
if !cfg!(target_os = "windows") {
|
||||
return;
|
||||
}
|
||||
#[cfg(target_os = "windows")]
|
||||
{
|
||||
let my_pid = std::process::id();
|
||||
// /FI excludes our own PID; /T kills the tree; /F forces.
|
||||
let _ = std::process::Command::new("taskkill")
|
||||
.args([
|
||||
"/F",
|
||||
"/T",
|
||||
"/IM",
|
||||
"hermes.exe",
|
||||
"/FI",
|
||||
&format!("PID ne {my_pid}"),
|
||||
])
|
||||
.stdout(std::process::Stdio::null())
|
||||
.stderr(std::process::Stdio::null())
|
||||
.status();
|
||||
}
|
||||
}
|
||||
|
||||
/// Best-effort lock probe: try to open the file for read+write. On Windows an
|
||||
/// exclusively-held running .exe refuses the open with a sharing violation.
|
||||
/// On Unix this almost always succeeds (no mandatory locking), which is fine —
|
||||
|
|
|
|||
|
|
@ -1313,6 +1313,111 @@ function resolveUpdaterBinary() {
|
|||
return fileExists(candidate) ? candidate : null
|
||||
}
|
||||
|
||||
// Path to the venv shim whose lock decides whether `hermes update` can write
|
||||
// fresh entry points. On Windows this is the file the running backend
|
||||
// `hermes.exe` holds open; on POSIX it's never mandatory-locked.
|
||||
function venvHermesShimPath(updateRoot) {
|
||||
return IS_WINDOWS
|
||||
? path.join(updateRoot, 'venv', 'Scripts', 'hermes.exe')
|
||||
: path.join(updateRoot, 'venv', 'bin', 'hermes')
|
||||
}
|
||||
|
||||
// Best-effort lock probe mirroring the Rust updater's is_locked(): a running
|
||||
// .exe on Windows refuses an O_RDWR open with a sharing violation. On POSIX
|
||||
// this practically always succeeds (no mandatory locking), so it returns false
|
||||
// — correct, since the shim-contention brick is Windows-only.
|
||||
function isShimLocked(shimPath) {
|
||||
if (!IS_WINDOWS) return false
|
||||
let fd
|
||||
try {
|
||||
fd = fs.openSync(shimPath, 'r+')
|
||||
return false
|
||||
} catch (err) {
|
||||
// ENOENT ⇒ not there ⇒ nothing locking it. Anything else (EBUSY/EPERM/
|
||||
// EACCES) on Windows means a live handle holds it.
|
||||
return err && err.code !== 'ENOENT'
|
||||
} finally {
|
||||
if (fd !== undefined) {
|
||||
try {
|
||||
fs.closeSync(fd)
|
||||
} catch {
|
||||
void 0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Force-kill the entire process TREE rooted at each PID. Node's child.kill()
|
||||
// only signals the direct child, so on Windows a backend `hermes.exe` that
|
||||
// spawned its own grandchildren (a `hermes` REPL, a pty terminal session, the
|
||||
// gateway) would survive and keep the venv shim locked. taskkill /T /F reaps
|
||||
// the whole tree synchronously. Windows-only: this is called solely from the
|
||||
// Windows shim-unlock path, and the backend is NOT spawned detached (so it's
|
||||
// not a process-group leader — a POSIX negative-pgid kill would be meaningless
|
||||
// here anyway). POSIX teardown stays with the existing before-quit SIGTERM.
|
||||
function forceKillProcessTree(pid) {
|
||||
if (!IS_WINDOWS) return
|
||||
if (!Number.isInteger(pid) || pid <= 0) return
|
||||
try {
|
||||
execFileSync('taskkill', ['/PID', String(pid), '/T', '/F'], { stdio: 'ignore' })
|
||||
} catch {
|
||||
// Already gone, or no permission — best effort; the unlock wait below is
|
||||
// the real gate.
|
||||
}
|
||||
}
|
||||
|
||||
// Before handing off the update on Windows, the desktop MUST stop every backend
|
||||
// it spawned and WAIT for the venv shim to actually unlock. The old code did
|
||||
// `hermesProcess.kill('SIGTERM')` + `app.quit()` fire-and-forget: SIGTERM on
|
||||
// Windows doesn't reap the backend's grandchildren, and quit didn't wait for
|
||||
// teardown, so the updater raced a still-locked `hermes.exe`, the quarantine
|
||||
// rename failed, uv's `pip install` hit "Access is denied", and the git path
|
||||
// bailed into a full ZIP re-download that ALSO couldn't write the locked shim —
|
||||
// a half-applied install (ryanc's update.log). Here we tree-kill the primary +
|
||||
// pool backends and poll the shim until it's writable (or a bounded timeout),
|
||||
// so by the time we spawn the updater the lock is genuinely gone.
|
||||
//
|
||||
// Windows-only: the venv-shim mandatory lock is a Windows phenomenon. On
|
||||
// macOS/Linux there's no REPLACE-on-running-exe block, the existing before-quit
|
||||
// SIGTERM + app.quit() teardown already works (the macOS path is flawless), and
|
||||
// aggressively SIGKILL-ing the backend here would be an untested behavior change
|
||||
// for no benefit. So we no-op off Windows and leave that path exactly as it was.
|
||||
async function releaseBackendLockForUpdate(updateRoot) {
|
||||
if (!IS_WINDOWS) return { unlocked: true }
|
||||
|
||||
// Collect every backend PID the desktop owns: primary window backend + pool.
|
||||
const pids = []
|
||||
if (hermesProcess && Number.isInteger(hermesProcess.pid)) pids.push(hermesProcess.pid)
|
||||
for (const entry of backendPool.values()) {
|
||||
if (entry.process && Number.isInteger(entry.process.pid)) pids.push(entry.process.pid)
|
||||
}
|
||||
|
||||
// Graceful first (lets Python flush), then tree-kill to catch grandchildren.
|
||||
if (hermesProcess && !hermesProcess.killed) {
|
||||
try {
|
||||
hermesProcess.kill('SIGTERM')
|
||||
} catch {
|
||||
void 0
|
||||
}
|
||||
}
|
||||
stopAllPoolBackends()
|
||||
for (const pid of pids) forceKillProcessTree(pid)
|
||||
|
||||
const shim = venvHermesShimPath(updateRoot)
|
||||
const deadlineMs = Date.now() + 15000
|
||||
while (Date.now() < deadlineMs) {
|
||||
if (!isShimLocked(shim)) {
|
||||
rememberLog('[updates] venv shim unlocked; safe to hand off the update')
|
||||
return { unlocked: true }
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 300))
|
||||
}
|
||||
// Timed out: the updater's own wait_for_venv_free + force-kill is the second
|
||||
// line of defense, and we pass --force so the guard won't dead-end. Log it.
|
||||
rememberLog('[updates] venv shim still locked after 15s; handing off anyway (updater will force)')
|
||||
return { unlocked: false }
|
||||
}
|
||||
|
||||
// applyUpdates — hand off to the installer's --update flow, then exit.
|
||||
//
|
||||
// The desktop is a pure consumer: it does NOT git pull / pip install / rebuild
|
||||
|
|
@ -1379,6 +1484,12 @@ async function applyUpdates(opts = {}) {
|
|||
}
|
||||
const venvBin = path.join(updateRoot, 'venv', IS_WINDOWS ? 'Scripts' : 'bin')
|
||||
|
||||
// Stop our own backend(s) and wait for the venv shim to unlock BEFORE we
|
||||
// spawn the updater. Without this the updater races a still-locked
|
||||
// hermes.exe (held by the backend child / its grandchildren) and the update
|
||||
// bricks. See releaseBackendLockForUpdate for the full failure analysis.
|
||||
await releaseBackendLockForUpdate(updateRoot)
|
||||
|
||||
// Detached so the updater outlives this process — it needs us GONE before
|
||||
// `hermes update` will run (the venv shim is locked while we live).
|
||||
const child = spawn(updater, updaterArgs, {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue