fix(update): self-heal a venv left half-built by an interrupted install (#42172)

* fix(update): self-heal a venv left half-built by an interrupted install An update killed mid dependency-install (Ctrl-C, terminal close, WSL OOM) could leave the venv with pip wiped and core deps (e.g. Pillow) missing, with no automatic recovery — the user had to manually run ensurepip + reinstall. Drop an install-scoped .update-incomplete breadcrumb right before the dep install and clear it only after core-dependency verification passes. On the next launch (any command except 'update' itself), if the marker is present, unconditionally bootstrap pip via ensurepip then re-run the .[all] install + verification, then clear the marker. Failure leaves the marker for retry and prints the manual recovery command. Never raises — recovery cannot block launch. * fix(update): address review — stderr-only recovery output, single-flight lock, gitignore marker - Route all recovery output (status lines + streamed pip/uv install via fd-level dup2) to stderr so protocol-on-stdout launches (hermes acp) never get install noise on the JSON-RPC stream. - Single-flight O_EXCL lockfile (.update-incomplete.lock) so a gateway start + CLI launch (or two profiles) can't run concurrent installs into the shared venv; stale locks (>1h) are broken for the next launch. - gitignore .update-incomplete + lock so source-tree installs keep a clean git status and update's autostash skips them. - Document why the loose 'update' argv substring match is intentional (over-match defers one launch; under-match would race the real update). - 4 new tests: lock held → skip, stale lock broken, lock released, output lands on stderr only.
2026-06-14 09:11:54 +00:00 · 2026-06-10 02:57:05 -07:00 · 2026-06-10 02:57:05 -07:00 · a5c32cdf30
commit a5c32cdf30
parent 15813336cc
3 changed files with 414 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -114,6 +114,12 @@ docs/superpowers/*
 # treat it as a local edit and autostash it on every run (#38529).
 .hermes-bootstrap-complete

+# Interrupted-update breadcrumb + recovery lock written next to the shared venv
+# by `hermes update` / launch-time self-heal. Runtime state, never a code change
+# — ignore so `git status` stays clean and update's autostash skips them.
+.update-incomplete
+.update-incomplete.lock
+
 # Tool Search live-test harness output — non-deterministic model transcripts,
 # regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo.
 scripts/out/
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -6392,6 +6392,167 @@ def _load_installable_optional_extras(group: str = "all") -> list[str]:
    return referenced


+# Install-scoped breadcrumb dropped right before ``hermes update`` mutates the
+# venv and cleared only after the dependency install verifies clean.  If a user
+# kills the update mid-install (Ctrl-C, terminal close, WSL OOM), the marker
+# survives and the next ``hermes`` launch finishes the install instead of
+# limping along on a half-built venv (e.g. pip wiped, a core dep like Pillow
+# never landed).  Lives next to the venv (not under $HERMES_HOME) because the
+# venv is shared across all profiles, so a single marker covers every profile.
+def _update_marker_path() -> Path:
+    return PROJECT_ROOT / ".update-incomplete"
+
+
+def _write_update_incomplete_marker() -> None:
+    """Drop the interrupted-install breadcrumb. Never raises."""
+    try:
+        _update_marker_path().write_text(
+            f"started={_time.time()}\npid={os.getpid()}\n", encoding="utf-8"
+        )
+    except OSError as exc:
+        logger.debug("Could not write update-incomplete marker: %s", exc)
+
+
+def _clear_update_incomplete_marker() -> None:
+    """Remove the interrupted-install breadcrumb. Never raises."""
+    try:
+        _update_marker_path().unlink()
+    except FileNotFoundError:
+        pass
+    except OSError as exc:
+        logger.debug("Could not clear update-incomplete marker: %s", exc)
+
+
+def _recover_from_interrupted_install() -> None:
+    """Finish a dependency install that a prior ``hermes update`` left half-done.
+
+    Triggered on launch when ``.update-incomplete`` is present — meaning the
+    code was pulled but the dep install was killed before it verified clean.
+    Unconditionally bootstraps pip via ``ensurepip`` (a killed ``pip install``
+    can wipe pip from the venv entirely, which blocks the venv from recovering
+    on its own), then re-runs the editable ``.[all]`` install + core-dependency
+    verification, then clears the marker.
+
+    Never raises: a recovery failure must not block launch.  If it can't
+    self-heal it prints the one-line manual command and leaves the marker so
+    the next launch tries again.
+
+    Concurrency: the marker lives next to the shared venv, so a gateway start
+    plus a CLI launch (or two profiles starting at once) can both see it.  An
+    ``O_EXCL`` lockfile ensures only one process runs the reinstall; the
+    others skip and let the winner clear the marker.
+
+    Output: everything — our status lines AND the streamed pip/uv install
+    (which inherits fd 1) — is routed to stderr.  Launches whose stdout is a
+    protocol stream (``hermes acp`` speaks JSON-RPC on stdout) must never get
+    install noise on stdout.
+    """
+    if not _update_marker_path().exists():
+        return
+
+    # Skip in managed/Docker installs and on PyPI installs with no git checkout:
+    # those don't run the source-tree update path, so a stray marker is not ours
+    # to act on. Just clear it.
+    if not (PROJECT_ROOT / "pyproject.toml").is_file():
+        _clear_update_incomplete_marker()
+        return
+
+    # Single-flight guard: atomically claim the recovery lock. If another
+    # process holds it, skip — it is running the same reinstall into the same
+    # shared venv right now. A crashed holder leaves a stale lock; break it
+    # after an hour (well past any realistic install) so recovery can't be
+    # wedged forever.
+    lock_path = PROJECT_ROOT / ".update-incomplete.lock"
+    try:
+        fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+        os.write(fd, f"{os.getpid()}\n".encode())
+        os.close(fd)
+    except FileExistsError:
+        try:
+            if _time.time() - lock_path.stat().st_mtime > 3600:
+                lock_path.unlink()
+        except OSError:
+            pass
+        return
+    except OSError as exc:
+        # Couldn't create the lock (read-only fs, perms). Proceed unlocked —
+        # the install itself will surface the real problem.
+        logger.debug("Could not create install-recovery lock: %s", exc)
+
+    saved_stdout_fd = None
+    saved_sys_stdout = sys.stdout
+    try:
+        # Route Python-level prints AND subprocess-inherited fd 1 to stderr
+        # for the duration of recovery (see docstring: ACP stdout safety).
+        try:
+            saved_stdout_fd = os.dup(1)
+            os.dup2(2, 1)
+        except OSError:
+            saved_stdout_fd = None
+        sys.stdout = sys.stderr
+
+        print(
+            "⚠ A previous `hermes update` was interrupted mid-install — "
+            "finishing dependency installation now..."
+        )
+
+        try:
+            from hermes_cli.managed_uv import ensure_uv
+
+            # Always bootstrap pip first: a killed install can leave the venv with
+            # no pip module at all, and uv may also be gone. ensurepip restores a
+            # known-good pip so at least the plain-pip path below can proceed.
+            try:
+                subprocess.run(
+                    [sys.executable, "-m", "ensurepip", "--upgrade", "--default-pip"],
+                    cwd=PROJECT_ROOT,
+                    capture_output=True,
+                )
+            except Exception as exc:
+                logger.debug("ensurepip during install recovery failed: %s", exc)
+
+            uv_bin = ensure_uv()
+            if uv_bin:
+                uv_env = {**os.environ, "VIRTUAL_ENV": str(PROJECT_ROOT / "venv")}
+                if _is_termux_env(uv_env):
+                    uv_env.pop("PYTHONPATH", None)
+                    uv_env.pop("PYTHONHOME", None)
+                _install_python_dependencies_with_optional_fallback(
+                    [uv_bin, "pip"],
+                    env=uv_env,
+                    group="termux-all" if _is_termux_env(uv_env) else "all",
+                )
+            else:
+                _install_python_dependencies_with_optional_fallback(
+                    [sys.executable, "-m", "pip"],
+                    group="termux-all" if _is_termux_env() else "all",
+                )
+
+            _clear_update_incomplete_marker()
+            print("✓ Dependency installation recovered — your install is healthy again.")
+        except Exception as exc:
+            # Leave the marker in place so the next launch retries. Give the user
+            # the exact manual recovery command in the meantime.
+            logger.debug("Interrupted-install recovery failed: %s", exc)
+            print("✗ Could not auto-recover the interrupted install.")
+            print("  Recover manually with:")
+            print(f"    cd {PROJECT_ROOT}")
+            print(f"    {sys.executable} -m ensurepip --upgrade")
+            print(f"    {sys.executable} -m pip install -e '.[all]'")
+    finally:
+        sys.stdout = saved_sys_stdout
+        if saved_stdout_fd is not None:
+            try:
+                os.dup2(saved_stdout_fd, 1)
+                os.close(saved_stdout_fd)
+            except OSError:
+                pass
+        try:
+            lock_path.unlink()
+        except OSError:
+            pass
+
+
 def _run_install_with_heartbeat(
    cmd: list[str],
    *,
@ -8323,6 +8484,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
        # Reinstall Python dependencies. Prefer .[all], but if one optional extra
        # breaks on this machine, keep base deps and reinstall the remaining extras
        # individually so update does not silently strip working capabilities.
+        #
+        # Drop the interrupted-install breadcrumb BEFORE touching the venv. If
+        # the install is killed mid-flight (Ctrl-C, terminal close, WSL OOM),
+        # the marker survives and the next ``hermes`` launch finishes the
+        # install via ``_recover_from_interrupted_install``. Cleared only after
+        # the install + core-dependency verification completes below.
+        _write_update_incomplete_marker()
        print("→ Updating Python dependencies...")
        from hermes_cli.managed_uv import ensure_uv, update_managed_uv

@ -8376,6 +8544,12 @@ def _cmd_update_impl(args, gateway_mode: bool):
                _install_psutil_android_compat(pip_cmd)
            _install_python_dependencies_with_optional_fallback(pip_cmd, group=install_group)

+        # Core Python deps installed AND verified (the fallback helper runs
+        # _verify_core_dependencies_installed). Clear the interrupted-install
+        # breadcrumb now — the remaining steps (lazy refresh, node deps, web
+        # UI, desktop rebuild) are non-core and can't brick the venv.
+        _clear_update_incomplete_marker()
+
        _refresh_active_lazy_features()

        _update_node_dependencies()
@ -10690,6 +10864,22 @@ def main():
    except Exception:
        pass

+    # Self-heal a venv left half-built by an interrupted ``hermes update``
+    # (Ctrl-C, terminal close, WSL OOM mid-install). Skip when the user is
+    # *running* update — that flow writes and clears its own marker, and we
+    # don't want a recovery install racing the real one. Never raises.
+    #
+    # The substring match is deliberately loose: argv isn't parsed yet at this
+    # point, and the failure modes are asymmetric. Over-matching (e.g.
+    # ``hermes skills install update``) merely defers recovery one launch;
+    # under-matching (missing ``hermes -p work update``) would race a recovery
+    # install against the real one. Loose wins.
+    try:
+        if "update" not in sys.argv[1:]:
+            _recover_from_interrupted_install()
+    except Exception:
+        pass
+
    if _try_termux_fast_tui_launch():
        return
    if _try_termux_fast_cli_launch():
--- a/tests/hermes_cli/test_update_interrupted_recovery.py
+++ b/tests/hermes_cli/test_update_interrupted_recovery.py
@ -0,0 +1,218 @@
+"""Tests for interrupted-install self-heal (the ``.update-incomplete`` marker).
+
+Covers the breadcrumb lifecycle and the launch-time recovery guard added so a
+``hermes update`` killed mid-install (Ctrl-C, terminal close, WSL OOM) gets
+finished automatically on the next launch instead of leaving a half-built venv.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import hermes_cli.main as m
+
+
+def test_marker_round_trip(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    marker = m._update_marker_path()
+    assert marker == tmp_path / ".update-incomplete"
+    assert not marker.exists()
+
+    m._write_update_incomplete_marker()
+    assert marker.exists()
+    body = marker.read_text()
+    assert "started=" in body
+    assert "pid=" in body
+
+    m._clear_update_incomplete_marker()
+    assert not marker.exists()
+
+
+def test_clear_when_absent_is_noop(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    # Must not raise when the marker was never written.
+    m._clear_update_incomplete_marker()
+    assert not m._update_marker_path().exists()
+
+
+def test_recovery_noop_without_marker(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    called = {"install": False}
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: called.__setitem__("install", True),
+    )
+    m._recover_from_interrupted_install()
+    assert called["install"] is False, "recovery must not install when no marker"
+
+
+def test_recovery_clears_stray_marker_without_pyproject(tmp_path, monkeypatch):
+    # No pyproject.toml (PyPI/Docker install) — a stray marker is not ours to
+    # act on; recovery should just clear it without trying to install.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    m._write_update_incomplete_marker()
+    called = {"install": False}
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: called.__setitem__("install", True),
+    )
+    m._recover_from_interrupted_install()
+    assert called["install"] is False
+    assert not m._update_marker_path().exists()
+
+
+def test_recovery_runs_install_and_clears_marker(tmp_path, monkeypatch):
+    # Source-tree install (pyproject present) with marker set → recovery should
+    # run the dep install and clear the marker on success.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    seen = {"ensurepip": False, "install": False}
+
+    def fake_run(cmd, *a, **k):
+        if "ensurepip" in cmd:
+            seen["ensurepip"] = True
+
+        class R:
+            returncode = 0
+
+        return R()
+
+    monkeypatch.setattr(m.subprocess, "run", fake_run)
+    monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
+    monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: seen.__setitem__("install", True),
+    )
+
+    m._recover_from_interrupted_install()
+
+    assert seen["ensurepip"] is True, "ensurepip must run unconditionally first"
+    assert seen["install"] is True, "dep install must run"
+    assert not m._update_marker_path().exists(), "marker cleared on success"
+
+
+def test_recovery_keeps_marker_on_failure(tmp_path, monkeypatch):
+    # If the install itself blows up, the marker must survive so the next
+    # launch retries — and recovery must not raise.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    class R:
+        returncode = 0
+
+    monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
+    monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
+    monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
+
+    def boom(*a, **k):
+        raise RuntimeError("install died")
+
+    monkeypatch.setattr(
+        m, "_install_python_dependencies_with_optional_fallback", boom
+    )
+
+    # Must not raise.
+    m._recover_from_interrupted_install()
+    assert m._update_marker_path().exists(), "marker preserved for retry on failure"
+
+
+def _stub_install_env(monkeypatch, m, seen):
+    """Common stubs so recovery's install path is inert and observable."""
+
+    class R:
+        returncode = 0
+
+    monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
+    monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
+    monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: seen.__setitem__("install", True),
+    )
+
+
+def test_recovery_skips_when_lock_held(tmp_path, monkeypatch):
+    # Another process is mid-recovery (fresh lockfile) — this launch must skip
+    # the install entirely and leave both marker and lock untouched.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+    lock = tmp_path / ".update-incomplete.lock"
+    lock.write_text("12345\n")
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    assert seen["install"] is False, "must not install while another holds the lock"
+    assert m._update_marker_path().exists(), "marker left for the lock holder"
+    assert lock.exists(), "fresh lock must not be broken"
+
+
+def test_recovery_breaks_stale_lock(tmp_path, monkeypatch):
+    # A lock older than an hour is from a crashed holder — it gets removed so
+    # the NEXT launch can recover (this launch still skips).
+    import os as _os
+
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+    lock = tmp_path / ".update-incomplete.lock"
+    lock.write_text("12345\n")
+    stale = m._time.time() - 7200
+    _os.utime(lock, (stale, stale))
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    assert not lock.exists(), "stale lock must be broken"
+    assert m._update_marker_path().exists()
+
+    # Next launch proceeds normally.
+    m._recover_from_interrupted_install()
+    assert seen["install"] is True
+    assert not m._update_marker_path().exists()
+    assert not lock.exists(), "lock released after recovery"
+
+
+def test_recovery_releases_lock_after_run(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    assert seen["install"] is True
+    assert not (tmp_path / ".update-incomplete.lock").exists()
+
+
+def test_recovery_output_goes_to_stderr(tmp_path, monkeypatch, capfd):
+    # ACP speaks JSON-RPC on stdout — recovery output (including the streamed
+    # install, which inherits fd 1) must land on stderr only.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    out, err = capfd.readouterr()
+    assert "interrupted mid-install" not in out
+    assert "interrupted mid-install" in err
+    assert "recovered" in err