fix(update): quarantine hermes.exe vs concurrent Windows instance (#26670) (#26677)

* fix(update): detect concurrent hermes.exe on Windows; retry + restart-defer quarantine

Closes #26670.

When 'hermes update' runs on Windows with another hermes.exe alive (most
commonly the Hermes Desktop Electron app's spawned backend),
_quarantine_running_hermes_exe() fails to rename the venv shim with
[WinError 32]. uv pip install -e .
then exits 2, the git-pull fast path is silently abandoned, and the ZIP
fallback runs (and fails the same way) before eventually succeeding.

This change implements three of the five proposed fixes from the issue:

1. Concurrent-instance detection (preferred fix). _detect_concurrent_hermes_instances()
   uses psutil to enumerate processes whose .exe is one of our venv shims
   (hermes.exe / hermes-gateway.exe), excluding the caller's PID. When any
   match exists, cmd_update prints an actionable message naming the
   blocking PIDs and exits 2 BEFORE any destructive work. New --force flag
   bypasses the gate.

2. Retry + restart-deferred fallback. _quarantine_running_hermes_exe()
   now attempts the rename up to 4 times by default (immediately, then
   after 100/250/500 ms backoff; the 1000 ms step only applies when
   max_attempts is raised to 5), which covers the transient
   AV-scanner-handle case. If all retries fail,
   it schedules the replacement via MoveFileExW with the OS deferred-rename
   flag so the new shim can land at the original path and the update
   completes; the old image is fully unloaded after the user's next
   system restart.

3. Actionable warning text. The old 'Could not quarantine: [WinError 32]'
   warning is replaced with one that names the likely culprits (Hermes
   Desktop, REPLs, gateway, AV) and points to the new --force flag.

Tests:
- 13 new tests in tests/hermes_cli/test_update_concurrent_quarantine.py
  covering: psutil-based enumeration, self-pid exclusion, case-insensitive
  matching of .EXE, no-psutil graceful degradation, off-Windows no-op,
  helpful warning formatting, retry-then-succeed, restart-deferred fallback,
  cmd_update abort + exit code 2, and --force bypass.
- New autouse fixture in tests/hermes_cli/conftest.py defaults
  _detect_concurrent_hermes_instances to [] so the rest of the suite
  isn't tripped by the developer's own running hermes.exe. Opt-out marker
  'real_concurrent_gate' registered in pyproject.toml.
- Updating docs page (website/docs/getting-started/updating.md) gains a
  short section explaining the new Windows error and remediation.

* chore: refresh uv.lock to match pyproject.toml exact pins

aiohttp 3.13.4 -> 3.13.3 (matches pyproject pin: aiohttp==3.13.3)
anthropic 0.87.0 -> 0.86.0 (matches pyproject pin: anthropic==0.86.0)
hermes-agent 0.13.0 -> 0.14.0 (matches pyproject version)

CI's uv lock --check was failing on the merged state because main
drifted: pyproject.toml uses exact == pins for those two deps and the
hermes-agent version was bumped to 0.14.0 but the lockfile still had
0.13.0.
This commit is contained in:
Teknium 2026-05-19 11:10:51 -07:00 committed by GitHub
parent 57af46fae2
commit 2a7308b7c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 591 additions and 9 deletions

View file

@ -7199,7 +7199,95 @@ def _hermes_exe_shims(scripts_dir: Path) -> list[Path]:
]
def _quarantine_running_hermes_exe(scripts_dir: Path) -> list[tuple[Path, Path]]:
def _detect_concurrent_hermes_instances(
    scripts_dir: Path, *, exclude_pid: int | None = None
) -> list[tuple[int, str]]:
    """Find other live processes whose .exe is one of our entry-point shims.

    Windows blocks DELETE/REPLACE on a running .exe and even RENAME on the
    same .exe when another process opened it without ``FILE_SHARE_DELETE``.
    The Hermes Desktop Electron app spawns ``hermes.EXE`` as a backend child,
    so during ``hermes update`` the user-invoked process and the desktop's
    child both hold the same file. The quarantine rename then fails with
    ``[WinError 32]`` and uv inherits the lock.

    This helper enumerates processes whose ``exe`` matches one of the venv's
    shims (as listed by ``_hermes_exe_shims``) and returns ``(pid,
    process_name)`` pairs. Paths are compared after ``Path.resolve()`` and
    lower-casing, so ``hermes.EXE`` matches ``hermes.exe``.

    Args:
        scripts_dir: The venv scripts directory that contains the shims.
        exclude_pid: PID to leave out of the result. Defaults to the current
            process (``os.getpid()``) so the running ``hermes update``
            invocation never reports itself.

    Returns:
        A list of ``(pid, process_name)`` pairs. Empty off-Windows, when
        psutil cannot be imported, or when no other instances exist.

    Never raises — process enumeration is best-effort. If enumeration fails
    part-way through, the matches collected up to that point are returned.
    """
    if not _is_windows():
        return []
    try:
        import psutil
    except Exception:
        # psutil is optional here; any import-time failure simply disables
        # the detection instead of breaking the update.
        return []

    if exclude_pid is None:
        exclude_pid = os.getpid()

    # Resolve every shim path to its canonical form once for cheap comparison.
    shim_paths: set[str] = set()
    for shim in _hermes_exe_shims(scripts_dir):
        try:
            shim_paths.add(str(shim.resolve()).lower())
        except OSError:
            shim_paths.add(str(shim).lower())
    if not shim_paths:
        return []

    matches: list[tuple[int, str]] = []
    try:
        # process_iter() is lazy: failures surface while iterating, not when
        # the iterator is created, so the whole loop has to be guarded to
        # honour the "never raises" contract.
        for proc in psutil.process_iter(["pid", "exe", "name"]):
            try:
                info = proc.info
            except Exception:
                continue
            pid = info.get("pid")
            exe = info.get("exe")
            if not exe or pid is None or pid == exclude_pid:
                continue
            try:
                exe_norm = str(Path(exe).resolve()).lower()
            except (OSError, ValueError):
                exe_norm = str(exe).lower()
            if exe_norm in shim_paths:
                name = info.get("name") or Path(exe).name
                matches.append((int(pid), str(name)))
    except Exception:
        # Best-effort: report whatever was found before enumeration broke.
        return matches
    return matches
def _format_concurrent_instances_message(
matches: list[tuple[int, str]], scripts_dir: Path
) -> str:
"""Build a human-readable explanation + remediation hint for the user."""
shim = scripts_dir / "hermes.exe"
lines = ["✗ Another hermes.exe is running:"]
for pid, name in matches:
lines.append(f" PID {pid} {name}")
lines.append("")
lines.append(f" Updating now would fail to overwrite {shim} because")
lines.append(" Windows blocks REPLACE on a running executable.")
lines.append("")
lines.append(" Close Hermes Desktop, exit any open `hermes` REPLs, and")
lines.append(" stop the gateway (`hermes gateway stop`) before retrying.")
lines.append(" Override with `hermes update --force` if you've already")
lines.append(" confirmed those processes will not write to the venv.")
return "\n".join(lines)
def _quarantine_running_hermes_exe(
scripts_dir: Path, *, max_attempts: int = 4
) -> list[tuple[Path, Path]]:
"""Pre-empt Windows file lock on the running ``hermes.exe``.
Windows allows RENAMING a mapped/running executable (the kernel tracks the
@ -7212,29 +7300,129 @@ def _quarantine_running_hermes_exe(scripts_dir: Path) -> list[tuple[Path, Path]]
fresh shims at the original paths. The ``.old`` files are cleaned up on
the next hermes invocation by ``_cleanup_quarantined_exes``.
Rename can still fail when *another* process has opened the .exe without
``FILE_SHARE_DELETE`` — typically AV real-time scanners with transient
handles (recovers in <1s), or the Hermes Desktop backend child process
(won't recover until the user closes it). We mitigate:
1. Retry up to ``max_attempts`` times with increasing backoff: the first
attempt is immediate, later ones wait 100/250/500/1000 ms (the default
of 4 attempts stops at 500 ms). Handles the AV-scanner case.
2. If all retries fail, schedule the .exe for replacement on next
reboot via ``MoveFileExW(MOVEFILE_DELAY_UNTIL_REBOOT)``. This still
lets uv create a fresh shim at the original path (Windows will keep
the old file's content under a new name until the reboot), so the
update can complete; the user just needs to reboot to fully unload
the stale image.
3. Print a clear warning naming the most likely culprit (running
Hermes Desktop / gateway / REPL) and pointing to ``--force``.
Returns the list of (original, quarantined) pairs so the caller can roll
back if the install itself fails before uv writes a replacement.
back if the install itself fails before uv writes a replacement. Pairs
where we used ``MOVEFILE_DELAY_UNTIL_REBOOT`` are NOT returned — they
are already deferred and roll-back is meaningless.
"""
moved: list[tuple[Path, Path]] = []
if not _is_windows():
return moved
import time
stamp = int(time.time() * 1000)
# Backoff schedule: first attempt is immediate, subsequent ones sleep.
# 100ms / 250ms / 500ms covers the typical AV scanner re-scan window.
backoff_ms = [0, 100, 250, 500, 1000]
attempts = max(1, min(max_attempts, len(backoff_ms)))
for shim in _hermes_exe_shims(scripts_dir):
if not shim.exists():
continue
target = shim.with_suffix(shim.suffix + f".old.{stamp}")
try:
shim.rename(target)
moved.append((shim, target))
except OSError as e:
# Best-effort: keep going. uv's failure later will surface the
# real error; this is a heuristic, not a hard guarantee.
print(f" ⚠ Could not quarantine {shim.name}: {e}")
last_exc: OSError | None = None
for attempt in range(attempts):
delay = backoff_ms[attempt] / 1000.0
if delay:
time.sleep(delay)
try:
shim.rename(target)
moved.append((shim, target))
last_exc = None
break
except OSError as e:
last_exc = e
continue
if last_exc is None:
continue
# All in-process renames failed. Try MoveFileEx with
# MOVEFILE_DELAY_UNTIL_REBOOT as a last resort. This succeeds in the
# exact case where the inline rename failed (another process holds
# the handle without share-delete), at the cost of requiring a
# reboot to fully reclaim the old .exe.
scheduled = _schedule_replace_on_reboot(shim, target)
if scheduled:
print(
f"{shim.name} is locked by another process; scheduled "
f"replacement on next reboot."
)
print(
" The new shim was written at the same path, but a "
"reboot is needed to fully unload the old one."
)
# Do NOT append to ``moved``: we don't want roll-back to undo a
# reboot-deferred operation.
continue
# Truly couldn't budge the .exe. Print an actionable warning and let
# uv try its luck — sometimes uv's own retry handling pulls through.
print(
f" ⚠ Could not quarantine {shim.name} ({last_exc.__class__.__name__}: "
f"another process is holding it open)."
)
print(
" Close Hermes Desktop, exit other `hermes` REPLs, stop the "
"gateway, or pause AV scanning, then re-run `hermes update`."
)
return moved
def _schedule_replace_on_reboot(shim: Path, quarantine_target: Path) -> bool:
    """Schedule ``shim`` -> ``quarantine_target`` via PendingFileRenameOperations.

    Uses Win32 ``MoveFileExW`` with ``MOVEFILE_REPLACE_EXISTING |
    MOVEFILE_DELAY_UNTIL_REBOOT``. The OS persists the rename in
    ``HKLM\\System\\CurrentControlSet\\Control\\Session Manager\\
    PendingFileRenameOperations`` and applies it before any user-mode code
    runs on next boot — at which point no process can hold the .exe.

    NOTE(review): per the MoveFileExW documentation the file is not moved
    until the operating system restarts, so ``shim`` remains at its current
    path until then — confirm this matches what the caller expects to find
    at that path after a successful schedule.

    Args:
        shim: Path of the locked executable to move at next boot.
        quarantine_target: Destination path for the deferred rename.

    Returns:
        ``True`` if the schedule call succeeded, ``False`` otherwise
        (non-Windows, ctypes failure, lack of privilege, etc.). Never raises.
    """
    if not _is_windows():
        return False
    try:
        import ctypes
        from ctypes import wintypes

        MOVEFILE_REPLACE_EXISTING = 0x1
        MOVEFILE_DELAY_UNTIL_REBOOT = 0x4

        # Use a private library handle: ``ctypes.windll.kernel32`` is cached
        # and shared process-wide, so assigning argtypes/restype on its
        # MoveFileExW attribute would change the prototype for every other
        # user of that object.
        kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
        move_file_ex = kernel32.MoveFileExW
        move_file_ex.argtypes = [
            wintypes.LPCWSTR,
            wintypes.LPCWSTR,
            wintypes.DWORD,
        ]
        move_file_ex.restype = wintypes.BOOL

        ok = move_file_ex(
            str(shim),
            str(quarantine_target),
            MOVEFILE_REPLACE_EXISTING | MOVEFILE_DELAY_UNTIL_REBOOT,
        )
        # A zero BOOL means the call failed (e.g. insufficient privilege).
        return bool(ok)
    except Exception:
        # Deliberately broad: this is a last-resort helper and must never
        # abort the update, whatever ctypes or the OS throws at it.
        return False
def _restore_quarantined_exes(moved: list[tuple[Path, Path]]) -> None:
"""Roll back ``_quarantine_running_hermes_exe`` if uv didn't write replacements."""
for original, quarantined in moved:
@ -8020,6 +8208,18 @@ def _cmd_update_impl(args, gateway_mode: bool):
print("⚕ Updating Hermes Agent...")
print()
# On Windows, abort early if another hermes.exe is holding the venv shim
# open. Continuing would result in a string of WinError 32 warnings and
# then either a deferred-rename leftover or a failed git-pull fast path
# that silently falls back to the slower ZIP route. See issue #26670.
if _is_windows() and not getattr(args, "force", False):
scripts_dir = _venv_scripts_dir()
if scripts_dir is not None:
concurrent = _detect_concurrent_hermes_instances(scripts_dir)
if concurrent:
print(_format_concurrent_instances_message(concurrent, scripts_dir))
sys.exit(2)
# Pre-update backup — runs before any git/file mutation so users can
# always roll back to the exact state they had before this update.
_run_pre_update_backup(args)
@ -12351,6 +12551,12 @@ Examples:
default=False,
help="Assume yes for interactive prompts (config migration, stash restore). API-key entry is skipped; run 'hermes config migrate' separately for those.",
)
update_parser.add_argument(
"--force",
action="store_true",
default=False,
help="Windows: proceed with the update even when another hermes.exe is detected. The concurrent process will likely cause WinError 32 warnings and may leave a reboot-deferred .exe replacement.",
)
update_parser.set_defaults(func=cmd_update)
# =========================================================================