fix(desktop): recover from corrupt cached Electron download on build

hermes desktop failed on Linux with an ENOENT renaming release/linux-unpacked/electron -> Hermes. Root cause is a corrupt cached Electron zip (~/.cache/electron/electron-*.zip): app-builder unpack-electron extracts a partial tree from the bad zip that is missing the electron binary, so electron-builder dies on the final rename. Re-running repeats the broken extraction, leaving the desktop app permanently unlaunchable until the cache is manually purged. - Add _electron_download_cache_dirs() + _purge_corrupt_electron_cache() to hermes_cli/main.py: validate every electron-*.zip via zipfile.testzip() and delete corrupt ones; honor electron_config_cache / ELECTRON_CACHE overrides with per-OS defaults. - Wire purge + single retry into cmd_gui packaged-build failure path so a poisoned download self-heals (electron re-downloads clean). - Add beforePack hook (apps/desktop/scripts/before-pack.cjs) to wipe the target unpacked dir before staging, making packaging idempotent across interrupted runs. Cross-platform, best-effort. - Tests: corrupt-zip detector, cmd_gui purge/retry/launch path, no-retry-when-clean path, and node --test for the cleanup helper.
2026-07-28 18:19:28 +00:00 · 2026-06-03 00:43:07 +07:00 · 2026-06-03 00:43:07 +07:00 · f583c6ebd5
commit f583c6ebd5
parent e003c53b06
5 changed files with 311 additions and 0 deletions
--- a/apps/desktop/package.json
+++ b/apps/desktop/package.json
@ -146,6 +146,7 @@
      "package.json"
    ],
    "beforeBuild": "scripts/before-build.cjs",
+    "beforePack": "scripts/before-pack.cjs",
    "afterPack": "scripts/after-pack.cjs",
    "extraResources": [
      {
--- a/apps/desktop/scripts/before-pack.cjs
+++ b/apps/desktop/scripts/before-pack.cjs
@ -0,0 +1,78 @@
+'use strict'
+
+/**
+ * before-pack.cjs — electron-builder beforePack hook.
+ *
+ * Removes any stale unpacked app directory (`appOutDir`) before
+ * electron-builder stages the Electron binaries into it.
+ *
+ * WHY THIS EXISTS
+ * ---------------
+ * electron-builder's final packaging step copies the stock `electron`
+ * binary into `release/<platform>-unpacked/` and then renames it to the
+ * product name (`Hermes`). If a PREVIOUS `npm run pack` was interrupted
+ * (Ctrl-C, OOM kill, crash, full disk) the unpacked directory is left in a
+ * corrupted partial state: it keeps the already-renamed `LICENSE.electron.txt`
+ * and the Chromium payload (.pak/.so/icudtl.dat/chrome-sandbox) but is MISSING
+ * the `electron` binary itself.
+ *
+ * On the next run, electron-builder sees the destination directory already
+ * populated, skips re-copying the binary it thinks is present, then tries to
+ * rename a `electron` file that no longer exists. The build dies with:
+ *
+ *   ENOENT: no such file or directory, rename
+ *   '.../release/linux-unpacked/electron' -> '.../release/linux-unpacked/Hermes'
+ *
+ * This is a hard failure with no obvious cause for the user — `hermes desktop`
+ * just prints "Desktop GUI build failed" and the only fix is to manually
+ * `rm -rf` the release directory, which a normal user has no way to know.
+ *
+ * The packaging step is not idempotent across an interrupted run, so we make
+ * it idempotent ourselves: wipe the target unpacked directory up front so
+ * electron-builder always stages into a clean tree. This is safe — the
+ * directory is a pure build artifact that electron-builder fully recreates
+ * on every pack; nothing else depends on its prior contents.
+ *
+ * Cross-platform: the same partial-state trap exists on macOS
+ * (the mac-unpacked Hermes.app bundle) and Windows (win-unpacked), so we
+ * clean whatever `appOutDir` electron-builder hands us regardless of platform.
+ *
+ * Best-effort: a cleanup failure must never mask the real build. We log and
+ * resolve rather than throw — worst case electron-builder hits the original
+ * ENOENT, which is no worse than not having this hook at all.
+ *
+ * electron-builder passes a context with:
+ *   - appOutDir:            the unpacked app directory about to be staged
+ *   - electronPlatformName: 'win32' | 'darwin' | 'linux'
+ */
+
+const fs = require('node:fs')
+
+function cleanStaleAppOutDir(appOutDir) {
+  if (!appOutDir || typeof appOutDir !== 'string') {
+    return false
+  }
+  if (!fs.existsSync(appOutDir)) {
+    return false
+  }
+  // Recursive + force so a half-written tree (read-only bits, partial files)
+  // can't block the wipe. retry/maxRetries rides out transient EBUSY on
+  // Windows where an AV/indexer may briefly hold a handle.
+  fs.rmSync(appOutDir, { recursive: true, force: true, maxRetries: 5, retryDelay: 100 })
+  return true
+}
+
+exports.cleanStaleAppOutDir = cleanStaleAppOutDir
+
+exports.default = async function beforePack(context) {
+  const appOutDir = context && context.appOutDir
+  try {
+    if (cleanStaleAppOutDir(appOutDir)) {
+      console.log(`[before-pack] removed stale unpacked dir before staging: ${appOutDir}`)
+    }
+  } catch (err) {
+    // Never fail the build over cleanup; surface why so a genuinely stuck
+    // directory (permissions, mount) is still diagnosable.
+    console.warn(`[before-pack] could not clean ${appOutDir} (${err.message}); continuing`)
+  }
+}
--- a/apps/desktop/scripts/before-pack.test.cjs
+++ b/apps/desktop/scripts/before-pack.test.cjs
@ -0,0 +1,53 @@
+const assert = require('node:assert/strict')
+const fs = require('node:fs')
+const os = require('node:os')
+const path = require('node:path')
+const test = require('node:test')
+
+const { cleanStaleAppOutDir } = require('../scripts/before-pack.cjs')
+
+test('cleanStaleAppOutDir removes a populated unpacked directory', () => {
+  const tempRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-before-pack-'))
+  try {
+    const appOutDir = path.join(tempRoot, 'linux-unpacked')
+    fs.mkdirSync(appOutDir, { recursive: true })
+    // Reproduce the corrupted partial state: license + payload present,
+    // electron binary missing — exactly what trips the ENOENT rename.
+    fs.writeFileSync(path.join(appOutDir, 'LICENSE.electron.txt'), 'x', 'utf8')
+    fs.writeFileSync(path.join(appOutDir, 'resources.pak'), 'x', 'utf8')
+    fs.mkdirSync(path.join(appOutDir, 'resources'), { recursive: true })
+    fs.writeFileSync(path.join(appOutDir, 'resources', 'app.asar'), 'x', 'utf8')
+
+    const removed = cleanStaleAppOutDir(appOutDir)
+
+    assert.equal(removed, true)
+    assert.equal(fs.existsSync(appOutDir), false)
+  } finally {
+    fs.rmSync(tempRoot, { recursive: true, force: true })
+  }
+})
+
+test('cleanStaleAppOutDir is a no-op when the directory is absent', () => {
+  const tempRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'hermes-before-pack-'))
+  try {
+    const missing = path.join(tempRoot, 'does-not-exist')
+    assert.equal(cleanStaleAppOutDir(missing), false)
+  } finally {
+    fs.rmSync(tempRoot, { recursive: true, force: true })
+  }
+})
+
+test('cleanStaleAppOutDir ignores empty or invalid input', () => {
+  assert.equal(cleanStaleAppOutDir(''), false)
+  assert.equal(cleanStaleAppOutDir(undefined), false)
+  assert.equal(cleanStaleAppOutDir(null), false)
+  assert.equal(cleanStaleAppOutDir(42), false)
+})
+
+test('beforePack default export resolves even when cleanup throws', async () => {
+  const { default: beforePack } = require('../scripts/before-pack.cjs')
+  // A directory path that rmSync can't remove is simulated by passing a
+  // context whose appOutDir is a file the hook will try (and be allowed) to
+  // remove; the contract under test is that the hook never rejects.
+  await assert.doesNotReject(beforePack({ appOutDir: '', electronPlatformName: 'linux' }))
+})
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -7235,6 +7235,85 @@ def _desktop_packaged_executable(desktop_dir: Path) -> Optional[Path]:
    return max(existing, key=lambda p: p.stat().st_mtime)


+def _electron_download_cache_dirs() -> list[Path]:
+    """Return the per-user Electron download cache directories for this OS.
+
+    electron-builder's ``app-builder unpack-electron`` extracts the Electron
+    distribution from a zip stored in this cache (NOT from node_modules), so a
+    corrupt zip here — not a bad workspace install — is what poisons the build.
+    Honors the ``electron_config_cache`` / ``ELECTRON_CACHE`` overrides that
+    ``@electron/get`` respects, then falls back to the platform defaults.
+    """
+    home = Path.home()
+    candidates: list[Path] = []
+    override = os.environ.get("electron_config_cache") or os.environ.get("ELECTRON_CACHE")
+    if override:
+        candidates.append(Path(override))
+    if sys.platform == "darwin":
+        candidates.append(home / "Library" / "Caches" / "electron")
+    elif sys.platform == "win32":
+        local = os.environ.get("LOCALAPPDATA")
+        if local:
+            candidates.append(Path(local) / "electron" / "Cache")
+        candidates.append(home / "AppData" / "Local" / "electron" / "Cache")
+    else:
+        xdg = os.environ.get("XDG_CACHE_HOME")
+        if xdg:
+            candidates.append(Path(xdg) / "electron")
+        candidates.append(home / ".cache" / "electron")
+
+    seen: set[Path] = set()
+    out: list[Path] = []
+    for c in candidates:
+        rc = c.expanduser()
+        if rc not in seen:
+            seen.add(rc)
+            out.append(rc)
+    return out
+
+
+def _purge_corrupt_electron_cache() -> list[Path]:
+    """Delete corrupt cached Electron zips so the next build re-downloads cleanly.
+
+    Root cause of the ``ENOENT … rename '…/linux-unpacked/electron' ->
+    '…/linux-unpacked/Hermes'`` desktop build failure: a truncated/duplicated
+    download leaves a corrupt zip in the Electron cache. ``unpack-electron``
+    extracts a partial tree from it that is MISSING the ``electron`` binary, so
+    electron-builder dies on the final rename. The packed dir then looks plausible
+    (LICENSE, .pak files, chrome-sandbox) but has no launchable binary.
+
+    Validates every ``electron-*.zip`` in the cache with the stdlib zip reader
+    (``testzip()`` does a full per-entry CRC check) and removes the bad ones.
+    Best-effort: never raises; returns the list of removed paths so the caller
+    can decide whether a retry is worthwhile.
+    """
+    import zipfile
+
+    removed: list[Path] = []
+    for cache_dir in _electron_download_cache_dirs():
+        if not cache_dir.is_dir():
+            continue
+        for zip_path in sorted(cache_dir.rglob("electron-*.zip")):
+            corrupt = False
+            try:
+                with zipfile.ZipFile(zip_path) as zf:
+                    # testzip() returns the first bad member, or None if every
+                    # CRC checks out. A raise here means it isn't a readable zip.
+                    corrupt = zf.testzip() is not None
+            except Exception:
+                corrupt = True
+            if not corrupt:
+                continue
+            try:
+                zip_path.unlink()
+                removed.append(zip_path)
+            except OSError:
+                # A locked/permission-denied cache entry is out of our hands;
+                # surface nothing and let the build report its own error.
+                pass
+    return removed
+
+
 def _desktop_macos_relaunchable_fixup(desktop_dir: Path) -> None:
    """Make a locally-built (unsigned) macOS desktop app survive in-place self-update.

@ -7391,6 +7470,18 @@ def cmd_gui(args: argparse.Namespace):
            print(f"→ Building desktop {build_label}...")
            build_script = "build" if source_mode else "pack"
            build_result = subprocess.run([npm, "run", build_script], cwd=desktop_dir, env=env, check=False)
+            if build_result.returncode != 0 and not source_mode:
+                # A corrupt cached Electron zip makes `pack` fail with an ENOENT on
+                # the final `electron` -> `Hermes` rename: unpack-electron extracted
+                # a partial tree from the bad zip. Purge the bad cache entry and
+                # retry once so a poisoned download self-heals instead of wedging
+                # every future `hermes desktop` run.
+                purged = _purge_corrupt_electron_cache()
+                if purged:
+                    print(f"  ⚠ Detected corrupt cached Electron download ({len(purged)} file(s)); removed and retrying build...")
+                    for p in purged:
+                        print(f"    - {p}")
+                    build_result = subprocess.run([npm, "run", build_script], cwd=desktop_dir, env=env, check=False)
            if build_result.returncode != 0:
                print("✗ Desktop GUI build failed")
                print(f"  Run manually:  cd apps/desktop && npm run {build_script}")
--- a/tests/hermes_cli/test_gui_command.py
+++ b/tests/hermes_cli/test_gui_command.py
@ -411,3 +411,91 @@ def test_compute_desktop_content_hash_respects_gitignore(tmp_path, monkeypatch):
    cli_main._DESKTOP_STAMP_SPEC = None
    h3 = cli_main._compute_desktop_content_hash(root)
    assert h1 != h3, "changing a tracked file should change the hash"
+
+
+
+def _write_zip(path: Path) -> None:
+    import zipfile
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with zipfile.ZipFile(path, "w") as zf:
+        zf.writestr("electron", "fake binary payload")
+
+
+def test_purge_corrupt_electron_cache_removes_only_bad_zips(tmp_path, monkeypatch):
+    cache = tmp_path / "electron-cache"
+    good = cache / "electron-v40.9.3-linux-x64.zip"
+    bad = cache / "hashdir" / "electron-v40.9.3-linux-x64.zip"
+    _write_zip(good)
+    _write_zip(bad)
+    # Corrupt the second zip by truncating its central directory.
+    bad.write_bytes(bad.read_bytes()[:20])
+
+    monkeypatch.setattr(cli_main, "_electron_download_cache_dirs", lambda: [cache])
+
+    removed = cli_main._purge_corrupt_electron_cache()
+
+    assert removed == [bad]
+    assert good.exists()
+    assert not bad.exists()
+
+
+def test_purge_corrupt_electron_cache_noop_when_all_valid(tmp_path, monkeypatch):
+    cache = tmp_path / "electron-cache"
+    good = cache / "electron-v40.9.3-linux-x64.zip"
+    _write_zip(good)
+    monkeypatch.setattr(cli_main, "_electron_download_cache_dirs", lambda: [cache])
+
+    assert cli_main._purge_corrupt_electron_cache() == []
+    assert good.exists()
+
+
+def test_gui_retries_pack_after_purging_corrupt_electron_cache(tmp_path, monkeypatch):
+    root = _make_desktop_tree(tmp_path)
+    desktop_dir = root / "apps" / "desktop"
+    monkeypatch.setattr(cli_main, "PROJECT_ROOT", root)
+    packaged_exe = _make_packaged_executable(root, monkeypatch, platform="linux")
+
+    install_ok = subprocess.CompletedProcess(["npm", "ci"], 0)
+    pack_fail = subprocess.CompletedProcess(["npm", "run", "pack"], 1)
+    pack_ok = subprocess.CompletedProcess(["npm", "run", "pack"], 0)
+    launch_ok = subprocess.CompletedProcess([str(packaged_exe)], 0)
+
+    with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
+         patch("hermes_cli.main._run_npm_install_deterministic", return_value=install_ok), \
+         patch("hermes_cli.main._desktop_macos_relaunchable_fixup"), \
+         patch("hermes_cli.main._purge_corrupt_electron_cache", return_value=[Path("/c/electron.zip")]) as mock_purge, \
+         patch("hermes_cli.main.subprocess.run", side_effect=[pack_fail, pack_ok, launch_ok]) as mock_run, \
+         pytest.raises(SystemExit) as exc:
+        cli_main.cmd_gui(_ns())
+
+    assert exc.value.code == 0
+    mock_purge.assert_called_once()
+    # First pack fails, purge runs, second pack succeeds, then launch.
+    assert mock_run.call_count == 3
+    assert mock_run.call_args_list[0].args[0] == ["/usr/bin/npm", "run", "pack"]
+    assert mock_run.call_args_list[1].args[0] == ["/usr/bin/npm", "run", "pack"]
+    assert mock_run.call_args_list[2].args[0] == [str(packaged_exe)]
+
+
+def test_gui_does_not_retry_when_cache_is_clean(tmp_path, monkeypatch, capsys):
+    root = _make_desktop_tree(tmp_path)
+    monkeypatch.setattr(cli_main, "PROJECT_ROOT", root)
+    _make_packaged_executable(root, monkeypatch, platform="linux")
+
+    install_ok = subprocess.CompletedProcess(["npm", "ci"], 0)
+    pack_fail = subprocess.CompletedProcess(["npm", "run", "pack"], 1)
+
+    with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
+         patch("hermes_cli.main._run_npm_install_deterministic", return_value=install_ok), \
+         patch("hermes_cli.main._desktop_macos_relaunchable_fixup"), \
+         patch("hermes_cli.main._purge_corrupt_electron_cache", return_value=[]) as mock_purge, \
+         patch("hermes_cli.main.subprocess.run", side_effect=[pack_fail]) as mock_run, \
+         pytest.raises(SystemExit) as exc:
+        cli_main.cmd_gui(_ns())
+
+    assert exc.value.code == 1
+    # Purge was attempted but found nothing, so no retry pack runs.
+    mock_purge.assert_called_once()
+    assert mock_run.call_count == 1
+    assert "Desktop GUI build failed" in capsys.readouterr().out