fix(cli): make ZIP-update directory replace atomic so it can't delete ui-tui

Root cause of #49145: the Windows ZIP-update path did rmtree(dst) then
copytree(src, dst). If the copy failed partway — common on that path,
which only runs because file I/O is already flaky on the machine — the
directory was left deleted with nothing copied back. ui-tui/ vanishing
is what broke 'hermes --tui' (WinError 267), but the bug hit every
top-level directory.

_atomic_replace_dir stages the new copy into a sibling temp dir and only
swaps it in on full success, restoring the original on failure. A failed
update now leaves the live tree untouched instead of half-deleted.
This commit is contained in:
teknium1 2026-06-21 12:49:13 -07:00 committed by Teknium
parent db097fb088
commit 6902eb3913
2 changed files with 124 additions and 3 deletions

View file

@ -5999,6 +5999,43 @@ def _kill_stale_dashboard_processes(
_warn_stale_dashboard_processes = _kill_stale_dashboard_processes
def _atomic_replace_dir(src: str, dst: str) -> None:
"""Replace directory *dst* with *src* without leaving *dst* half-deleted.
The naive ``rmtree(dst); copytree(src, dst)`` has a destructive window: if
the copy fails partway (common on the Windows ZIP-update path, which only
runs because file I/O is already flaky on that machine), the old directory
is already gone and nothing replaced it the install is left with a
deleted tree (issue #49145, where ``ui-tui/`` vanished and broke the TUI).
Instead, stage the new copy into a sibling temp dir first; only once that
fully succeeds do we swap it in. A failure during staging raises with the
original *dst* still intact.
"""
staging = f"{dst}.hermes-update-staging"
backup = f"{dst}.hermes-update-old"
# Clear any leftovers from a previously-interrupted update.
for leftover in (staging, backup):
if os.path.exists(leftover):
shutil.rmtree(leftover, ignore_errors=True)
# 1. Stage the new copy. If this fails, dst is untouched.
shutil.copytree(src, staging)
# 2. Swap: move the live dir aside, move staging into place. Both moves are
# same-filesystem renames; if the second fails we restore the backup.
if os.path.exists(dst):
os.rename(dst, backup)
try:
os.rename(staging, dst)
except OSError:
if os.path.exists(backup) and not os.path.exists(dst):
os.rename(backup, dst) # roll back to the original
raise
# 3. New dir is in place; drop the old one (best-effort — never fatal).
if os.path.exists(backup):
shutil.rmtree(backup, ignore_errors=True)
def _update_via_zip(args):
"""Update Hermes Agent by downloading a ZIP archive.
@ -6084,9 +6121,9 @@ def _update_via_zip(args):
src = os.path.join(extracted, item)
dst = os.path.join(str(PROJECT_ROOT), item)
if os.path.isdir(src):
if os.path.exists(dst):
shutil.rmtree(dst)
shutil.copytree(src, dst)
# Atomic-ish replace: never leave dst half-deleted if the copy
# fails partway (the failure mode behind #49145 on Windows).
_atomic_replace_dir(src, dst)
else:
shutil.copy2(src, dst)
update_count += 1

View file

@ -0,0 +1,84 @@
"""Regression: the ZIP-update directory replace must never leave a half-deleted tree.
Issue #49145: on Windows the ZIP-update path did ``rmtree(dst); copytree(...)``.
A copy that failed partway (file locks / flaky I/O the very conditions the ZIP
path exists to work around) left the directory deleted with nothing copied back,
which broke ``hermes --tui`` because ``ui-tui/`` had vanished.
``_atomic_replace_dir`` stages the new copy first and only swaps it in on full
success, so a mid-copy failure leaves the original directory intact.
"""
from __future__ import annotations
import shutil
from pathlib import Path
import pytest
from hermes_cli.main import _atomic_replace_dir
def test_atomic_replace_swaps_content_on_success(tmp_path: Path) -> None:
src = tmp_path / "src" / "ui-tui"
src.mkdir(parents=True)
(src / "new.txt").write_text("NEW")
dst = tmp_path / "install" / "ui-tui"
dst.mkdir(parents=True)
(dst / "old.txt").write_text("OLD")
_atomic_replace_dir(str(src), str(dst))
assert (dst / "new.txt").read_text() == "NEW"
assert not (dst / "old.txt").exists()
# No staging/backup siblings left behind.
assert not (dst.parent / "ui-tui.hermes-update-staging").exists()
assert not (dst.parent / "ui-tui.hermes-update-old").exists()
def test_atomic_replace_leaves_original_intact_when_copy_fails(
tmp_path: Path, monkeypatch
) -> None:
src = tmp_path / "src" / "ui-tui"
src.mkdir(parents=True)
(src / "a.txt").write_text("A")
dst = tmp_path / "install" / "ui-tui"
dst.mkdir(parents=True)
(dst / "keep.txt").write_text("PRECIOUS")
def boom(*_a, **_k):
raise OSError("[WinError 5] Access is denied")
monkeypatch.setattr(shutil, "copytree", boom)
with pytest.raises(OSError):
_atomic_replace_dir(str(src), str(dst))
# The whole point: the live directory survives a failed update untouched.
assert dst.is_dir()
assert (dst / "keep.txt").read_text() == "PRECIOUS"
assert not (dst.parent / "ui-tui.hermes-update-staging").exists()
def test_atomic_replace_clears_stale_staging_leftovers(tmp_path: Path) -> None:
"""A previously-interrupted update can leave staging/backup dirs behind."""
src = tmp_path / "src" / "ui-tui"
src.mkdir(parents=True)
(src / "new.txt").write_text("NEW")
dst = tmp_path / "install" / "ui-tui"
dst.mkdir(parents=True)
stale_staging = dst.parent / "ui-tui.hermes-update-staging"
stale_backup = dst.parent / "ui-tui.hermes-update-old"
stale_staging.mkdir()
stale_backup.mkdir()
(stale_staging / "junk").write_text("junk")
_atomic_replace_dir(str(src), str(dst))
assert (dst / "new.txt").read_text() == "NEW"
assert not stale_staging.exists()
assert not stale_backup.exists()