diff --git a/.gitignore b/.gitignore
index 1efce4b83f..fa4d64049b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,6 +114,12 @@ docs/superpowers/*
 # treat it as a local edit and autostash it on every run (#38529).
 .hermes-bootstrap-complete
 
+# Interrupted-update breadcrumb + recovery lock written next to the shared venv
+# by `hermes update` / launch-time self-heal. Runtime state, never a code change
+# — ignore so `git status` stays clean and update's autostash skips them.
+.update-incomplete
+.update-incomplete.lock
+
 # Tool Search live-test harness output — non-deterministic model transcripts,
 # regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo.
 scripts/out/
diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index ad0861609c..34d563a669 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -6392,6 +6392,183 @@ def _load_installable_optional_extras(group: str = "all") -> list[str]:
     return referenced
 
 
+# Install-scoped breadcrumb dropped right before ``hermes update`` mutates the
+# venv and cleared only after the dependency install verifies clean. If a user
+# kills the update mid-install (Ctrl-C, terminal close, WSL OOM), the marker
+# survives and the next ``hermes`` launch finishes the install instead of
+# limping along on a half-built venv (e.g. pip wiped, a core dep like Pillow
+# never landed). Lives next to the venv (not under $HERMES_HOME) because the
+# venv is shared across all profiles, so a single marker covers every profile.
+def _update_marker_path() -> Path:
+    return PROJECT_ROOT / ".update-incomplete"
+
+
+def _write_update_incomplete_marker() -> None:
+    """Drop the interrupted-install breadcrumb. Never raises."""
+    try:
+        _update_marker_path().write_text(
+            f"started={_time.time()}\npid={os.getpid()}\n", encoding="utf-8"
+        )
+    except OSError as exc:
+        logger.debug("Could not write update-incomplete marker: %s", exc)
+
+
+def _clear_update_incomplete_marker() -> None:
+    """Remove the interrupted-install breadcrumb. Never raises."""
+    try:
+        _update_marker_path().unlink()
+    except FileNotFoundError:
+        pass
+    except OSError as exc:
+        logger.debug("Could not clear update-incomplete marker: %s", exc)
+
+
+def _recover_from_interrupted_install() -> None:
+    """Finish a dependency install that a prior ``hermes update`` left half-done.
+
+    Triggered on launch when ``.update-incomplete`` is present — meaning the
+    code was pulled but the dep install was killed before it verified clean.
+    Unconditionally bootstraps pip via ``ensurepip`` (a killed ``pip install``
+    can wipe pip from the venv entirely, which blocks the venv from recovering
+    on its own), then re-runs the editable ``.[all]`` install + core-dependency
+    verification, then clears the marker.
+
+    Never raises: a recovery failure must not block launch. If it can't
+    self-heal it prints the one-line manual command and leaves the marker so
+    the next launch tries again.
+
+    Concurrency: the marker lives next to the shared venv, so a gateway start
+    plus a CLI launch (or two profiles starting at once) can both see it. An
+    ``O_EXCL`` lockfile ensures only one process runs the reinstall; the
+    others skip and let the winner clear the marker. Only the process that
+    claimed the lock removes it afterwards.
+
+    Output: everything — our status lines AND the streamed pip/uv install
+    (which inherits fd 1) — is routed to stderr. Launches whose stdout is a
+    protocol stream (``hermes acp`` speaks JSON-RPC on stdout) must never get
+    install noise on stdout.
+    """
+    if not _update_marker_path().exists():
+        return
+
+    # Skip in managed/Docker installs and on PyPI installs with no git checkout:
+    # those don't run the source-tree update path, so a stray marker is not ours
+    # to act on. Just clear it.
+    if not (PROJECT_ROOT / "pyproject.toml").is_file():
+        _clear_update_incomplete_marker()
+        return
+
+    # Single-flight guard: atomically claim the recovery lock. If another
+    # process holds it, skip — it is running the same reinstall into the same
+    # shared venv right now. A crashed holder leaves a stale lock; break it
+    # after an hour (well past any realistic install) so recovery can't be
+    # wedged forever.
+    lock_path = PROJECT_ROOT / ".update-incomplete.lock"
+    lock_acquired = False
+    try:
+        fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
+    except FileExistsError:
+        try:
+            if _time.time() - lock_path.stat().st_mtime > 3600:
+                lock_path.unlink()
+        except OSError:
+            pass
+        return
+    except OSError as exc:
+        # Couldn't create the lock (read-only fs, perms). Proceed unlocked —
+        # the install itself will surface the real problem.
+        logger.debug("Could not create install-recovery lock: %s", exc)
+    else:
+        # We own the lock from here on. The pid inside is informational only,
+        # so a failed write must neither leak the fd nor abandon the claim.
+        lock_acquired = True
+        try:
+            with os.fdopen(fd, "wb") as lock_file:
+                lock_file.write(f"{os.getpid()}\n".encode())
+        except OSError as exc:
+            logger.debug("Could not record pid in install-recovery lock: %s", exc)
+
+    # Extras group for the reinstall; also echoed in the manual-recovery hint.
+    install_group = "all"
+    saved_stdout_fd = None
+    saved_sys_stdout = sys.stdout
+    try:
+        # Route Python-level prints AND subprocess-inherited fd 1 to stderr
+        # for the duration of recovery (see docstring: ACP stdout safety).
+        try:
+            saved_stdout_fd = os.dup(1)
+            os.dup2(2, 1)
+        except OSError:
+            saved_stdout_fd = None
+        sys.stdout = sys.stderr
+
+        print(
+            "⚠ A previous `hermes update` was interrupted mid-install — "
+            "finishing dependency installation now..."
+        )
+
+        try:
+            from hermes_cli.managed_uv import ensure_uv
+
+            # Always bootstrap pip first: a killed install can leave the venv with
+            # no pip module at all, and uv may also be gone. ensurepip restores a
+            # known-good pip so at least the plain-pip path below can proceed.
+            try:
+                subprocess.run(
+                    [sys.executable, "-m", "ensurepip", "--upgrade", "--default-pip"],
+                    cwd=PROJECT_ROOT,
+                    capture_output=True,
+                )
+            except Exception as exc:
+                logger.debug("ensurepip during install recovery failed: %s", exc)
+
+            uv_bin = ensure_uv()
+            if uv_bin:
+                uv_env = {**os.environ, "VIRTUAL_ENV": str(PROJECT_ROOT / "venv")}
+                if _is_termux_env(uv_env):
+                    uv_env.pop("PYTHONPATH", None)
+                    uv_env.pop("PYTHONHOME", None)
+                install_group = "termux-all" if _is_termux_env(uv_env) else "all"
+                _install_python_dependencies_with_optional_fallback(
+                    [uv_bin, "pip"],
+                    env=uv_env,
+                    group=install_group,
+                )
+            else:
+                install_group = "termux-all" if _is_termux_env() else "all"
+                _install_python_dependencies_with_optional_fallback(
+                    [sys.executable, "-m", "pip"],
+                    group=install_group,
+                )
+
+            _clear_update_incomplete_marker()
+            print("✓ Dependency installation recovered — your install is healthy again.")
+        except Exception as exc:
+            # Leave the marker in place so the next launch retries. Give the user
+            # the exact manual recovery command in the meantime.
+            logger.debug("Interrupted-install recovery failed: %s", exc)
+            print("✗ Could not auto-recover the interrupted install.")
+            print(" Recover manually with:")
+            print(f" cd {PROJECT_ROOT}")
+            print(f" {sys.executable} -m ensurepip --upgrade")
+            print(f" {sys.executable} -m pip install -e '.[{install_group}]'")
+    finally:
+        sys.stdout = saved_sys_stdout
+        if saved_stdout_fd is not None:
+            try:
+                os.dup2(saved_stdout_fd, 1)
+                os.close(saved_stdout_fd)
+            except OSError:
+                pass
+        # Only release a lock this process actually claimed; when we ran
+        # unlocked, any lock on disk belongs to someone else.
+        if lock_acquired:
+            try:
+                lock_path.unlink()
+            except OSError:
+                pass
+
+
 def _run_install_with_heartbeat(
     cmd: list[str],
     *,
@@ -8323,6 +8500,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
         # Reinstall Python dependencies. Prefer .[all], but if one optional extra
         # breaks on this machine, keep base deps and reinstall the remaining extras
         # individually so update does not silently strip working capabilities.
+        #
+        # Drop the interrupted-install breadcrumb BEFORE touching the venv. If
+        # the install is killed mid-flight (Ctrl-C, terminal close, WSL OOM),
+        # the marker survives and the next ``hermes`` launch finishes the
+        # install via ``_recover_from_interrupted_install``. Cleared only after
+        # the install + core-dependency verification completes below.
+        _write_update_incomplete_marker()
         print("→ Updating Python dependencies...")
         from hermes_cli.managed_uv import ensure_uv, update_managed_uv
 
@@ -8376,6 +8560,12 @@ def _cmd_update_impl(args, gateway_mode: bool):
                 _install_psutil_android_compat(pip_cmd)
             _install_python_dependencies_with_optional_fallback(pip_cmd, group=install_group)
 
+        # Core Python deps installed AND verified (the fallback helper runs
+        # _verify_core_dependencies_installed). Clear the interrupted-install
+        # breadcrumb now — the remaining steps (lazy refresh, node deps, web
+        # UI, desktop rebuild) are non-core and can't brick the venv.
+        _clear_update_incomplete_marker()
+
         _refresh_active_lazy_features()
 
         _update_node_dependencies()
@@ -10690,6 +10880,22 @@ def main():
     except Exception:
         pass
 
+    # Self-heal a venv left half-built by an interrupted ``hermes update``
+    # (Ctrl-C, terminal close, WSL OOM mid-install). Skip when the user is
+    # *running* update — that flow writes and clears its own marker, and we
+    # don't want a recovery install racing the real one. Never raises.
+    #
+    # The token match (any argv element equal to ``update``) is deliberately
+    # loose: argv isn't parsed yet, and the failure modes are asymmetric.
+    # Over-matching (e.g. ``hermes skills install update``) merely defers
+    # recovery one launch; under-matching (missing ``hermes -p work update``)
+    # would race a recovery install against the real one. Loose wins.
+    try:
+        if "update" not in sys.argv[1:]:
+            _recover_from_interrupted_install()
+    except Exception:
+        pass
+
     if _try_termux_fast_tui_launch():
         return
     if _try_termux_fast_cli_launch():
diff --git a/tests/hermes_cli/test_update_interrupted_recovery.py b/tests/hermes_cli/test_update_interrupted_recovery.py
new file mode 100644
index 0000000000..aed84f71c8
--- /dev/null
+++ b/tests/hermes_cli/test_update_interrupted_recovery.py
@@ -0,0 +1,218 @@
+"""Tests for interrupted-install self-heal (the ``.update-incomplete`` marker).
+
+Covers the breadcrumb lifecycle and the launch-time recovery guard added so a
+``hermes update`` killed mid-install (Ctrl-C, terminal close, WSL OOM) gets
+finished automatically on the next launch instead of leaving a half-built venv.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import hermes_cli.main as m
+
+
+def test_marker_round_trip(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    marker = m._update_marker_path()
+    assert marker == tmp_path / ".update-incomplete"
+    assert not marker.exists()
+
+    m._write_update_incomplete_marker()
+    assert marker.exists()
+    body = marker.read_text()
+    assert "started=" in body
+    assert "pid=" in body
+
+    m._clear_update_incomplete_marker()
+    assert not marker.exists()
+
+
+def test_clear_when_absent_is_noop(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    # Must not raise when the marker was never written.
+    m._clear_update_incomplete_marker()
+    assert not m._update_marker_path().exists()
+
+
+def test_recovery_noop_without_marker(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    called = {"install": False}
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: called.__setitem__("install", True),
+    )
+    m._recover_from_interrupted_install()
+    assert called["install"] is False, "recovery must not install when no marker"
+
+
+def test_recovery_clears_stray_marker_without_pyproject(tmp_path, monkeypatch):
+    # No pyproject.toml (PyPI/Docker install) — a stray marker is not ours to
+    # act on; recovery should just clear it without trying to install.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    m._write_update_incomplete_marker()
+    called = {"install": False}
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: called.__setitem__("install", True),
+    )
+    m._recover_from_interrupted_install()
+    assert called["install"] is False
+    assert not m._update_marker_path().exists()
+
+
+def test_recovery_runs_install_and_clears_marker(tmp_path, monkeypatch):
+    # Source-tree install (pyproject present) with marker set → recovery should
+    # run the dep install and clear the marker on success.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    seen = {"ensurepip": False, "install": False}
+
+    def fake_run(cmd, *a, **k):
+        if "ensurepip" in cmd:
+            seen["ensurepip"] = True
+
+        class R:
+            returncode = 0
+
+        return R()
+
+    monkeypatch.setattr(m.subprocess, "run", fake_run)
+    monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
+    monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: seen.__setitem__("install", True),
+    )
+
+    m._recover_from_interrupted_install()
+
+    assert seen["ensurepip"] is True, "ensurepip must run unconditionally first"
+    assert seen["install"] is True, "dep install must run"
+    assert not m._update_marker_path().exists(), "marker cleared on success"
+
+
+def test_recovery_keeps_marker_on_failure(tmp_path, monkeypatch):
+    # If the install itself blows up, the marker must survive so the next
+    # launch retries — and recovery must not raise.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    class R:
+        returncode = 0
+
+    monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
+    monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
+    monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
+
+    def boom(*a, **k):
+        raise RuntimeError("install died")
+
+    monkeypatch.setattr(
+        m, "_install_python_dependencies_with_optional_fallback", boom
+    )
+
+    # Must not raise.
+    m._recover_from_interrupted_install()
+    assert m._update_marker_path().exists(), "marker preserved for retry on failure"
+
+
+def _stub_install_env(monkeypatch, m, seen):
+    """Common stubs so recovery's install path is inert and observable."""
+
+    class R:
+        returncode = 0
+
+    monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
+    monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
+    monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
+    monkeypatch.setattr(
+        m,
+        "_install_python_dependencies_with_optional_fallback",
+        lambda *a, **k: seen.__setitem__("install", True),
+    )
+
+
+def test_recovery_skips_when_lock_held(tmp_path, monkeypatch):
+    # Another process is mid-recovery (fresh lockfile) — this launch must skip
+    # the install entirely and leave both marker and lock untouched.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+    lock = tmp_path / ".update-incomplete.lock"
+    lock.write_text("12345\n")
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    assert seen["install"] is False, "must not install while another holds the lock"
+    assert m._update_marker_path().exists(), "marker left for the lock holder"
+    assert lock.exists(), "fresh lock must not be broken"
+
+
+def test_recovery_breaks_stale_lock(tmp_path, monkeypatch):
+    # A lock older than an hour is from a crashed holder — it gets removed so
+    # the NEXT launch can recover (this launch still skips).
+    import os as _os
+
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+    lock = tmp_path / ".update-incomplete.lock"
+    lock.write_text("12345\n")
+    stale = m._time.time() - 7200
+    _os.utime(lock, (stale, stale))
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    assert not lock.exists(), "stale lock must be broken"
+    assert m._update_marker_path().exists()
+
+    # Next launch proceeds normally.
+    m._recover_from_interrupted_install()
+    assert seen["install"] is True
+    assert not m._update_marker_path().exists()
+    assert not lock.exists(), "lock released after recovery"
+
+
+def test_recovery_releases_lock_after_run(tmp_path, monkeypatch):
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    assert seen["install"] is True
+    assert not (tmp_path / ".update-incomplete.lock").exists()
+
+
+def test_recovery_output_goes_to_stderr(tmp_path, monkeypatch, capfd):
+    # ACP speaks JSON-RPC on stdout — recovery output (including the streamed
+    # install, which inherits fd 1) must land on stderr only.
+    monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
+    (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
+    m._write_update_incomplete_marker()
+
+    seen = {"install": False}
+    _stub_install_env(monkeypatch, m, seen)
+
+    m._recover_from_interrupted_install()
+
+    out, err = capfd.readouterr()
+    assert "interrupted mid-install" not in out
+    assert "interrupted mid-install" in err
+    assert "recovered" in err