fix(desktop): purge electron cache unconditionally, not via stdlib zipfile gate

The salvaged detector validated each cached electron-*.zip with zipfile.testzip() and only purged ones it judged corrupt. But stdlib zipfile reads from the end-of-central-directory backward, so it silently tolerates prepended/concatenated junk — which is exactly the corruption the bug report names ('86257938 extra bytes at beginning or within zipfile', a partial download resumed into the same file). testzip() returns clean on those zips, so the self-heal never fired for the reported failure mode. Drop the self-rolled validator: on any packaged-build failure, purge the version's cached zips AND the half-written unpacked dir, then retry once. @electron/get re-downloads with its own SHASUM verification — the real source of truth, which catches prepend/concat/truncate alike. An unrelated failure just costs one clean re-download and fails the same way. Verified empirically: zipfile.testzip() returns None (clean) on a prepended-junk zip; the unconditional purge removes it correctly.
2026-07-27 17:58:07 +00:00 · 2026-06-04 05:40:03 -07:00 · 2026-06-04 05:40:03 -07:00 · fef04a197e
commit fef04a197e
parent f583c6ebd5
2 changed files with 100 additions and 55 deletions
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -7272,45 +7272,62 @@ def _electron_download_cache_dirs() -> list[Path]:
    return out


-def _purge_corrupt_electron_cache() -> list[Path]:
-    """Delete corrupt cached Electron zips so the next build re-downloads cleanly.
+def _purge_electron_build_cache(desktop_dir: Path) -> list[Path]:
+    """Clear the cached Electron download + half-written unpacked dir so the
+    next ``pack`` re-downloads and re-stages from scratch.

    Root cause of the ``ENOENT … rename '…/linux-unpacked/electron' ->
-    '…/linux-unpacked/Hermes'`` desktop build failure: a truncated/duplicated
-    download leaves a corrupt zip in the Electron cache. ``unpack-electron``
-    extracts a partial tree from it that is MISSING the ``electron`` binary, so
-    electron-builder dies on the final rename. The packed dir then looks plausible
-    (LICENSE, .pak files, chrome-sandbox) but has no launchable binary.
+    '…/linux-unpacked/Hermes'`` desktop build failure: a corrupt zip in the
+    per-user Electron download cache (a partial download resumed into the same
+    file leaves prepended/concatenated junk, or an interrupted write truncates
+    it). electron-builder's ``app-builder unpack-electron`` extracts the
+    distribution from that cached zip (NOT from node_modules); a bad zip yields
+    a partial tree MISSING the 193 MB ``electron`` binary, so the final rename
+    dies. Re-running repeats the same broken extraction forever.

-    Validates every ``electron-*.zip`` in the cache with the stdlib zip reader
-    (``testzip()`` does a full per-entry CRC check) and removes the bad ones.
-    Best-effort: never raises; returns the list of removed paths so the caller
-    can decide whether a retry is worthwhile.
+    We deliberately do NOT try to detect corruption ourselves. stdlib
+    ``zipfile`` silently tolerates the prepended/concatenated junk that is the
+    most common corruption here — it reads from the end-of-central-directory
+    backward, so ``testzip()`` returns clean on exactly the zips ``unzip -t``
+    and ``@electron/get`` reject. Gating the purge on a self-rolled validator
+    would therefore skip the real-world case and never self-heal. Instead, on a
+    packaged-build failure we unconditionally remove the version's cached zips
+    and the stale unpacked dir, then let the caller retry once: ``@electron/get``
+    re-downloads with its own SHASUM verification (the real source of truth),
+    and ``before-pack.cjs`` re-wipes the unpacked dir. If the failure was
+    unrelated, a clean re-download is harmless and the retry fails the same way.
+
+    Best-effort: never raises. Returns the paths removed so the caller can log
+    them and decide whether a retry is worthwhile (empty list ⇒ nothing to
+    clear, so no point retrying).
    """
-    import zipfile
-
    removed: list[Path] = []
+
    for cache_dir in _electron_download_cache_dirs():
        if not cache_dir.is_dir():
            continue
        for zip_path in sorted(cache_dir.rglob("electron-*.zip")):
-            corrupt = False
-            try:
-                with zipfile.ZipFile(zip_path) as zf:
-                    # testzip() returns the first bad member, or None if every
-                    # CRC checks out. A raise here means it isn't a readable zip.
-                    corrupt = zf.testzip() is not None
-            except Exception:
-                corrupt = True
-            if not corrupt:
-                continue
            try:
                zip_path.unlink()
                removed.append(zip_path)
            except OSError:
-                # A locked/permission-denied cache entry is out of our hands;
-                # surface nothing and let the build report its own error.
+                # Locked/permission-denied entry is out of our hands; let the
+                # build report its own error rather than masking it.
                pass
+
+    # Drop the half-written unpacked dir too: an interrupted prior pack leaves
+    # a partial tree that poisons the rename even after the zip is fixed.
+    # (before-pack.cjs also handles this, but clearing it here makes the retry
+    # robust even if the hook is somehow skipped.)
+    release_dir = desktop_dir / "release"
+    if release_dir.is_dir():
+        for unpacked in release_dir.glob("*-unpacked"):
+            try:
+                shutil.rmtree(unpacked, ignore_errors=True)
+                removed.append(unpacked)
+            except OSError:
+                pass
+
    return removed


@ -7471,14 +7488,22 @@ def cmd_gui(args: argparse.Namespace):
            build_script = "build" if source_mode else "pack"
            build_result = subprocess.run([npm, "run", build_script], cwd=desktop_dir, env=env, check=False)
            if build_result.returncode != 0 and not source_mode:
-                # A corrupt cached Electron zip makes `pack` fail with an ENOENT on
-                # the final `electron` -> `Hermes` rename: unpack-electron extracted
-                # a partial tree from the bad zip. Purge the bad cache entry and
-                # retry once so a poisoned download self-heals instead of wedging
-                # every future `hermes desktop` run.
-                purged = _purge_corrupt_electron_cache()
+                # A corrupt cached Electron zip makes `pack` fail with an ENOENT
+                # on the final `electron` -> `Hermes` rename: unpack-electron
+                # extracted a partial tree (missing the 193 MB binary) from the
+                # bad zip. We do NOT try to prove the zip is corrupt ourselves —
+                # stdlib zipfile silently tolerates the prepended/concatenated
+                # junk that is the most common corruption (a partial download
+                # resumed into the same file), so a `testzip()` gate would pass
+                # and never self-heal. Instead, on any packaged-build failure we
+                # purge the version's cached zip + the half-written unpacked dir
+                # and retry once: @electron/get re-downloads with its own SHASUM
+                # verification, which is the real source of truth. If the
+                # failure was something else, the clean re-download is harmless
+                # and the retry fails the same way.
+                purged = _purge_electron_build_cache(desktop_dir)
                if purged:
-                    print(f"  ⚠ Detected corrupt cached Electron download ({len(purged)} file(s)); removed and retrying build...")
+                    print("  ⚠ Desktop build failed; cleared cached Electron download and retrying once...")
                    for p in purged:
                        print(f"    - {p}")
                    build_result = subprocess.run([npm, "run", build_script], cwd=desktop_dir, env=env, check=False)
--- a/tests/hermes_cli/test_gui_command.py
+++ b/tests/hermes_cli/test_gui_command.py
@ -413,6 +413,8 @@ def test_compute_desktop_content_hash_respects_gitignore(tmp_path, monkeypatch):
    assert h1 != h3, "changing a tracked file should change the hash"


+# ── Electron build-cache recovery tests ───────────────────────────────
+

 def _write_zip(path: Path) -> None:
    import zipfile
@ -422,37 +424,53 @@ def _write_zip(path: Path) -> None:
        zf.writestr("electron", "fake binary payload")


-def test_purge_corrupt_electron_cache_removes_only_bad_zips(tmp_path, monkeypatch):
+def test_purge_electron_build_cache_clears_all_zips_and_unpacked_dir(tmp_path, monkeypatch):
+    """Purge is unconditional: it removes every electron-*.zip (regardless of
+    whether stdlib zipfile thinks it's corrupt) plus the half-written unpacked
+    dir, because @electron/get's own SHASUM check on re-download is the real
+    validator — not a self-rolled one."""
    cache = tmp_path / "electron-cache"
-    good = cache / "electron-v40.9.3-linux-x64.zip"
-    bad = cache / "hashdir" / "electron-v40.9.3-linux-x64.zip"
-    _write_zip(good)
-    _write_zip(bad)
-    # Corrupt the second zip by truncating its central directory.
-    bad.write_bytes(bad.read_bytes()[:20])
+    # A "clean" zip and a prepended-junk zip — the latter is the real-world
+    # corruption that zipfile.testzip() silently passes (it reads from the
+    # end-of-central-directory backward), which is why we don't gate on it.
+    clean = cache / "electron-v40.9.3-linux-x64.zip"
+    prepended = cache / "hashdir" / "electron-v40.9.3-linux-x64.zip"
+    _write_zip(clean)
+    _write_zip(prepended)
+    prepended.write_bytes(b"\x00" * 4096 + prepended.read_bytes())
+
+    desktop_dir = tmp_path / "apps" / "desktop"
+    unpacked = desktop_dir / "release" / "linux-unpacked"
+    unpacked.mkdir(parents=True)
+    (unpacked / "LICENSE.electron.txt").write_text("x", encoding="utf-8")
+    (unpacked / "resources.pak").write_text("x", encoding="utf-8")

    monkeypatch.setattr(cli_main, "_electron_download_cache_dirs", lambda: [cache])

-    removed = cli_main._purge_corrupt_electron_cache()
+    removed = cli_main._purge_electron_build_cache(desktop_dir)

-    assert removed == [bad]
-    assert good.exists()
-    assert not bad.exists()
+    assert clean in removed
+    assert prepended in removed
+    assert unpacked in removed
+    assert not clean.exists()
+    assert not prepended.exists()
+    assert not unpacked.exists()


-def test_purge_corrupt_electron_cache_noop_when_all_valid(tmp_path, monkeypatch):
+def test_purge_electron_build_cache_empty_when_nothing_present(tmp_path, monkeypatch):
+    """No cached zips and no unpacked dir → nothing removed, so the caller
+    knows a retry is pointless."""
    cache = tmp_path / "electron-cache"
-    good = cache / "electron-v40.9.3-linux-x64.zip"
-    _write_zip(good)
+    cache.mkdir()
+    desktop_dir = tmp_path / "apps" / "desktop"
    monkeypatch.setattr(cli_main, "_electron_download_cache_dirs", lambda: [cache])

-    assert cli_main._purge_corrupt_electron_cache() == []
-    assert good.exists()
+    assert cli_main._purge_electron_build_cache(desktop_dir) == []


-def test_gui_retries_pack_after_purging_corrupt_electron_cache(tmp_path, monkeypatch):
+def test_gui_retries_pack_once_after_purging_build_cache(tmp_path, monkeypatch):
+    """First pack fails, purge clears the cache, second pack succeeds, launch."""
    root = _make_desktop_tree(tmp_path)
-    desktop_dir = root / "apps" / "desktop"
    monkeypatch.setattr(cli_main, "PROJECT_ROOT", root)
    packaged_exe = _make_packaged_executable(root, monkeypatch, platform="linux")

@ -464,21 +482,24 @@ def test_gui_retries_pack_after_purging_corrupt_electron_cache(tmp_path, monkeyp
    with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
         patch("hermes_cli.main._run_npm_install_deterministic", return_value=install_ok), \
         patch("hermes_cli.main._desktop_macos_relaunchable_fixup"), \
-         patch("hermes_cli.main._purge_corrupt_electron_cache", return_value=[Path("/c/electron.zip")]) as mock_purge, \
+         patch("hermes_cli.main._desktop_linux_sandbox_fixup", return_value=True), \
+         patch("hermes_cli.main._write_desktop_build_stamp"), \
+         patch("hermes_cli.main._purge_electron_build_cache", return_value=[Path("/c/electron.zip")]) as mock_purge, \
         patch("hermes_cli.main.subprocess.run", side_effect=[pack_fail, pack_ok, launch_ok]) as mock_run, \
         pytest.raises(SystemExit) as exc:
        cli_main.cmd_gui(_ns())

    assert exc.value.code == 0
    mock_purge.assert_called_once()
-    # First pack fails, purge runs, second pack succeeds, then launch.
+    # pack(fail) → purge → pack(ok) → launch = 3 subprocess.run calls
    assert mock_run.call_count == 3
    assert mock_run.call_args_list[0].args[0] == ["/usr/bin/npm", "run", "pack"]
    assert mock_run.call_args_list[1].args[0] == ["/usr/bin/npm", "run", "pack"]
    assert mock_run.call_args_list[2].args[0] == [str(packaged_exe)]


-def test_gui_does_not_retry_when_cache_is_clean(tmp_path, monkeypatch, capsys):
+def test_gui_does_not_retry_when_purge_finds_nothing(tmp_path, monkeypatch, capsys):
+    """If the purge clears nothing, there's no point retrying — fail fast."""
    root = _make_desktop_tree(tmp_path)
    monkeypatch.setattr(cli_main, "PROJECT_ROOT", root)
    _make_packaged_executable(root, monkeypatch, platform="linux")
@ -489,13 +510,12 @@ def test_gui_does_not_retry_when_cache_is_clean(tmp_path, monkeypatch, capsys):
    with patch("hermes_cli.main.shutil.which", return_value="/usr/bin/npm"), \
         patch("hermes_cli.main._run_npm_install_deterministic", return_value=install_ok), \
         patch("hermes_cli.main._desktop_macos_relaunchable_fixup"), \
-         patch("hermes_cli.main._purge_corrupt_electron_cache", return_value=[]) as mock_purge, \
+         patch("hermes_cli.main._purge_electron_build_cache", return_value=[]) as mock_purge, \
         patch("hermes_cli.main.subprocess.run", side_effect=[pack_fail]) as mock_run, \
         pytest.raises(SystemExit) as exc:
        cli_main.cmd_gui(_ns())

    assert exc.value.code == 1
-    # Purge was attempted but found nothing, so no retry pack runs.
    mock_purge.assert_called_once()
    assert mock_run.call_count == 1
    assert "Desktop GUI build failed" in capsys.readouterr().out