fix(update): self-heal a venv left half-built by an interrupted install (#42172)

* fix(update): self-heal a venv left half-built by an interrupted install

An update killed mid dependency-install (Ctrl-C, terminal close, WSL OOM)
could leave the venv with pip wiped and core deps (e.g. Pillow) missing,
with no automatic recovery — the user had to manually run ensurepip +
reinstall.

Drop an install-scoped .update-incomplete breadcrumb right before the dep
install and clear it only after core-dependency verification passes. On the
next launch (any command except 'update' itself), if the marker is present,
unconditionally bootstrap pip via ensurepip then re-run the .[all] install +
verification, then clear the marker. Failure leaves the marker for retry and
prints the manual recovery command. Never raises — recovery cannot block
launch.

* fix(update): address review — stderr-only recovery output, single-flight lock, gitignore marker

- Route all recovery output (status lines + streamed pip/uv install via
  fd-level dup2) to stderr so protocol-on-stdout launches (hermes acp)
  never get install noise on the JSON-RPC stream.
- Single-flight O_EXCL lockfile (.update-incomplete.lock) so a gateway
  start + CLI launch (or two profiles) can't run concurrent installs
  into the shared venv; stale locks (>1h) are broken for the next launch.
- gitignore .update-incomplete + lock so source-tree installs keep a
  clean git status and update's autostash skips them.
- Document why the loose 'update' argv substring match is intentional
  (over-match defers one launch; under-match would race the real update).
- 4 new tests: lock held → skip, stale lock broken, lock released,
  output lands on stderr only.
This commit is contained in:
Teknium 2026-06-10 02:57:05 -07:00 committed by GitHub
parent 15813336cc
commit a5c32cdf30
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 414 additions and 0 deletions

6
.gitignore vendored
View file

@ -114,6 +114,12 @@ docs/superpowers/*
# treat it as a local edit and autostash it on every run (#38529).
.hermes-bootstrap-complete
# Interrupted-update breadcrumb + recovery lock written next to the shared venv
# by `hermes update` / launch-time self-heal. Runtime state, never a code change
# — ignore so `git status` stays clean and update's autostash skips them.
.update-incomplete
.update-incomplete.lock
# Tool Search live-test harness output — non-deterministic model transcripts,
# regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo.
scripts/out/

View file

@ -6392,6 +6392,167 @@ def _load_installable_optional_extras(group: str = "all") -> list[str]:
return referenced
# Install-scoped breadcrumb dropped right before ``hermes update`` mutates the
# venv and cleared only after the dependency install verifies clean. If a user
# kills the update mid-install (Ctrl-C, terminal close, WSL OOM), the marker
# survives and the next ``hermes`` launch finishes the install instead of
# limping along on a half-built venv (e.g. pip wiped, a core dep like Pillow
# never landed). Lives next to the venv (not under $HERMES_HOME) because the
# venv is shared across all profiles, so a single marker covers every profile.
def _update_marker_path() -> Path:
return PROJECT_ROOT / ".update-incomplete"
def _write_update_incomplete_marker() -> None:
"""Drop the interrupted-install breadcrumb. Never raises."""
try:
_update_marker_path().write_text(
f"started={_time.time()}\npid={os.getpid()}\n", encoding="utf-8"
)
except OSError as exc:
logger.debug("Could not write update-incomplete marker: %s", exc)
def _clear_update_incomplete_marker() -> None:
"""Remove the interrupted-install breadcrumb. Never raises."""
try:
_update_marker_path().unlink()
except FileNotFoundError:
pass
except OSError as exc:
logger.debug("Could not clear update-incomplete marker: %s", exc)
def _recover_from_interrupted_install() -> None:
"""Finish a dependency install that a prior ``hermes update`` left half-done.
Triggered on launch when ``.update-incomplete`` is present meaning the
code was pulled but the dep install was killed before it verified clean.
Unconditionally bootstraps pip via ``ensurepip`` (a killed ``pip install``
can wipe pip from the venv entirely, which blocks the venv from recovering
on its own), then re-runs the editable ``.[all]`` install + core-dependency
verification, then clears the marker.
Never raises: a recovery failure must not block launch. If it can't
self-heal it prints the one-line manual command and leaves the marker so
the next launch tries again.
Concurrency: the marker lives next to the shared venv, so a gateway start
plus a CLI launch (or two profiles starting at once) can both see it. An
``O_EXCL`` lockfile ensures only one process runs the reinstall; the
others skip and let the winner clear the marker.
Output: everything our status lines AND the streamed pip/uv install
(which inherits fd 1) is routed to stderr. Launches whose stdout is a
protocol stream (``hermes acp`` speaks JSON-RPC on stdout) must never get
install noise on stdout.
"""
if not _update_marker_path().exists():
return
# Skip in managed/Docker installs and on PyPI installs with no git checkout:
# those don't run the source-tree update path, so a stray marker is not ours
# to act on. Just clear it.
if not (PROJECT_ROOT / "pyproject.toml").is_file():
_clear_update_incomplete_marker()
return
# Single-flight guard: atomically claim the recovery lock. If another
# process holds it, skip — it is running the same reinstall into the same
# shared venv right now. A crashed holder leaves a stale lock; break it
# after an hour (well past any realistic install) so recovery can't be
# wedged forever.
lock_path = PROJECT_ROOT / ".update-incomplete.lock"
try:
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
os.write(fd, f"{os.getpid()}\n".encode())
os.close(fd)
except FileExistsError:
try:
if _time.time() - lock_path.stat().st_mtime > 3600:
lock_path.unlink()
except OSError:
pass
return
except OSError as exc:
# Couldn't create the lock (read-only fs, perms). Proceed unlocked —
# the install itself will surface the real problem.
logger.debug("Could not create install-recovery lock: %s", exc)
saved_stdout_fd = None
saved_sys_stdout = sys.stdout
try:
# Route Python-level prints AND subprocess-inherited fd 1 to stderr
# for the duration of recovery (see docstring: ACP stdout safety).
try:
saved_stdout_fd = os.dup(1)
os.dup2(2, 1)
except OSError:
saved_stdout_fd = None
sys.stdout = sys.stderr
print(
"⚠ A previous `hermes update` was interrupted mid-install — "
"finishing dependency installation now..."
)
try:
from hermes_cli.managed_uv import ensure_uv
# Always bootstrap pip first: a killed install can leave the venv with
# no pip module at all, and uv may also be gone. ensurepip restores a
# known-good pip so at least the plain-pip path below can proceed.
try:
subprocess.run(
[sys.executable, "-m", "ensurepip", "--upgrade", "--default-pip"],
cwd=PROJECT_ROOT,
capture_output=True,
)
except Exception as exc:
logger.debug("ensurepip during install recovery failed: %s", exc)
uv_bin = ensure_uv()
if uv_bin:
uv_env = {**os.environ, "VIRTUAL_ENV": str(PROJECT_ROOT / "venv")}
if _is_termux_env(uv_env):
uv_env.pop("PYTHONPATH", None)
uv_env.pop("PYTHONHOME", None)
_install_python_dependencies_with_optional_fallback(
[uv_bin, "pip"],
env=uv_env,
group="termux-all" if _is_termux_env(uv_env) else "all",
)
else:
_install_python_dependencies_with_optional_fallback(
[sys.executable, "-m", "pip"],
group="termux-all" if _is_termux_env() else "all",
)
_clear_update_incomplete_marker()
print("✓ Dependency installation recovered — your install is healthy again.")
except Exception as exc:
# Leave the marker in place so the next launch retries. Give the user
# the exact manual recovery command in the meantime.
logger.debug("Interrupted-install recovery failed: %s", exc)
print("✗ Could not auto-recover the interrupted install.")
print(" Recover manually with:")
print(f" cd {PROJECT_ROOT}")
print(f" {sys.executable} -m ensurepip --upgrade")
print(f" {sys.executable} -m pip install -e '.[all]'")
finally:
sys.stdout = saved_sys_stdout
if saved_stdout_fd is not None:
try:
os.dup2(saved_stdout_fd, 1)
os.close(saved_stdout_fd)
except OSError:
pass
try:
lock_path.unlink()
except OSError:
pass
def _run_install_with_heartbeat(
cmd: list[str],
*,
@ -8323,6 +8484,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
# Reinstall Python dependencies. Prefer .[all], but if one optional extra
# breaks on this machine, keep base deps and reinstall the remaining extras
# individually so update does not silently strip working capabilities.
#
# Drop the interrupted-install breadcrumb BEFORE touching the venv. If
# the install is killed mid-flight (Ctrl-C, terminal close, WSL OOM),
# the marker survives and the next ``hermes`` launch finishes the
# install via ``_recover_from_interrupted_install``. Cleared only after
# the install + core-dependency verification completes below.
_write_update_incomplete_marker()
print("→ Updating Python dependencies...")
from hermes_cli.managed_uv import ensure_uv, update_managed_uv
@ -8376,6 +8544,12 @@ def _cmd_update_impl(args, gateway_mode: bool):
_install_psutil_android_compat(pip_cmd)
_install_python_dependencies_with_optional_fallback(pip_cmd, group=install_group)
# Core Python deps installed AND verified (the fallback helper runs
# _verify_core_dependencies_installed). Clear the interrupted-install
# breadcrumb now — the remaining steps (lazy refresh, node deps, web
# UI, desktop rebuild) are non-core and can't brick the venv.
_clear_update_incomplete_marker()
_refresh_active_lazy_features()
_update_node_dependencies()
@ -10690,6 +10864,22 @@ def main():
except Exception:
pass
# Self-heal a venv left half-built by an interrupted ``hermes update``
# (Ctrl-C, terminal close, WSL OOM mid-install). Skip when the user is
# *running* update — that flow writes and clears its own marker, and we
# don't want a recovery install racing the real one. Never raises.
#
# The substring match is deliberately loose: argv isn't parsed yet at this
# point, and the failure modes are asymmetric. Over-matching (e.g.
# ``hermes skills install update``) merely defers recovery one launch;
# under-matching (missing ``hermes -p work update``) would race a recovery
# install against the real one. Loose wins.
try:
if "update" not in sys.argv[1:]:
_recover_from_interrupted_install()
except Exception:
pass
if _try_termux_fast_tui_launch():
return
if _try_termux_fast_cli_launch():

View file

@ -0,0 +1,218 @@
"""Tests for interrupted-install self-heal (the ``.update-incomplete`` marker).
Covers the breadcrumb lifecycle and the launch-time recovery guard added so a
``hermes update`` killed mid-install (Ctrl-C, terminal close, WSL OOM) gets
finished automatically on the next launch instead of leaving a half-built venv.
"""
from __future__ import annotations
from pathlib import Path
import hermes_cli.main as m
def test_marker_round_trip(tmp_path, monkeypatch):
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
marker = m._update_marker_path()
assert marker == tmp_path / ".update-incomplete"
assert not marker.exists()
m._write_update_incomplete_marker()
assert marker.exists()
body = marker.read_text()
assert "started=" in body
assert "pid=" in body
m._clear_update_incomplete_marker()
assert not marker.exists()
def test_clear_when_absent_is_noop(tmp_path, monkeypatch):
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
# Must not raise when the marker was never written.
m._clear_update_incomplete_marker()
assert not m._update_marker_path().exists()
def test_recovery_noop_without_marker(tmp_path, monkeypatch):
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
called = {"install": False}
monkeypatch.setattr(
m,
"_install_python_dependencies_with_optional_fallback",
lambda *a, **k: called.__setitem__("install", True),
)
m._recover_from_interrupted_install()
assert called["install"] is False, "recovery must not install when no marker"
def test_recovery_clears_stray_marker_without_pyproject(tmp_path, monkeypatch):
# No pyproject.toml (PyPI/Docker install) — a stray marker is not ours to
# act on; recovery should just clear it without trying to install.
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
m._write_update_incomplete_marker()
called = {"install": False}
monkeypatch.setattr(
m,
"_install_python_dependencies_with_optional_fallback",
lambda *a, **k: called.__setitem__("install", True),
)
m._recover_from_interrupted_install()
assert called["install"] is False
assert not m._update_marker_path().exists()
def test_recovery_runs_install_and_clears_marker(tmp_path, monkeypatch):
# Source-tree install (pyproject present) with marker set → recovery should
# run the dep install and clear the marker on success.
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
m._write_update_incomplete_marker()
seen = {"ensurepip": False, "install": False}
def fake_run(cmd, *a, **k):
if "ensurepip" in cmd:
seen["ensurepip"] = True
class R:
returncode = 0
return R()
monkeypatch.setattr(m.subprocess, "run", fake_run)
monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
monkeypatch.setattr(
m,
"_install_python_dependencies_with_optional_fallback",
lambda *a, **k: seen.__setitem__("install", True),
)
m._recover_from_interrupted_install()
assert seen["ensurepip"] is True, "ensurepip must run unconditionally first"
assert seen["install"] is True, "dep install must run"
assert not m._update_marker_path().exists(), "marker cleared on success"
def test_recovery_keeps_marker_on_failure(tmp_path, monkeypatch):
# If the install itself blows up, the marker must survive so the next
# launch retries — and recovery must not raise.
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
m._write_update_incomplete_marker()
class R:
returncode = 0
monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
def boom(*a, **k):
raise RuntimeError("install died")
monkeypatch.setattr(
m, "_install_python_dependencies_with_optional_fallback", boom
)
# Must not raise.
m._recover_from_interrupted_install()
assert m._update_marker_path().exists(), "marker preserved for retry on failure"
def _stub_install_env(monkeypatch, m, seen):
"""Common stubs so recovery's install path is inert and observable."""
class R:
returncode = 0
monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
monkeypatch.setattr(
m,
"_install_python_dependencies_with_optional_fallback",
lambda *a, **k: seen.__setitem__("install", True),
)
def test_recovery_skips_when_lock_held(tmp_path, monkeypatch):
# Another process is mid-recovery (fresh lockfile) — this launch must skip
# the install entirely and leave both marker and lock untouched.
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
m._write_update_incomplete_marker()
lock = tmp_path / ".update-incomplete.lock"
lock.write_text("12345\n")
seen = {"install": False}
_stub_install_env(monkeypatch, m, seen)
m._recover_from_interrupted_install()
assert seen["install"] is False, "must not install while another holds the lock"
assert m._update_marker_path().exists(), "marker left for the lock holder"
assert lock.exists(), "fresh lock must not be broken"
def test_recovery_breaks_stale_lock(tmp_path, monkeypatch):
# A lock older than an hour is from a crashed holder — it gets removed so
# the NEXT launch can recover (this launch still skips).
import os as _os
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
m._write_update_incomplete_marker()
lock = tmp_path / ".update-incomplete.lock"
lock.write_text("12345\n")
stale = m._time.time() - 7200
_os.utime(lock, (stale, stale))
seen = {"install": False}
_stub_install_env(monkeypatch, m, seen)
m._recover_from_interrupted_install()
assert not lock.exists(), "stale lock must be broken"
assert m._update_marker_path().exists()
# Next launch proceeds normally.
m._recover_from_interrupted_install()
assert seen["install"] is True
assert not m._update_marker_path().exists()
assert not lock.exists(), "lock released after recovery"
def test_recovery_releases_lock_after_run(tmp_path, monkeypatch):
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
m._write_update_incomplete_marker()
seen = {"install": False}
_stub_install_env(monkeypatch, m, seen)
m._recover_from_interrupted_install()
assert seen["install"] is True
assert not (tmp_path / ".update-incomplete.lock").exists()
def test_recovery_output_goes_to_stderr(tmp_path, monkeypatch, capfd):
# ACP speaks JSON-RPC on stdout — recovery output (including the streamed
# install, which inherits fd 1) must land on stderr only.
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
m._write_update_incomplete_marker()
seen = {"install": False}
_stub_install_env(monkeypatch, m, seen)
m._recover_from_interrupted_install()
out, err = capfd.readouterr()
assert "interrupted mid-install" not in out
assert "interrupted mid-install" in err
assert "recovered" in err