mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-14 09:11:54 +00:00
fix(update): self-heal a venv left half-built by an interrupted install (#42172)
* fix(update): self-heal a venv left half-built by an interrupted install An update killed mid dependency-install (Ctrl-C, terminal close, WSL OOM) could leave the venv with pip wiped and core deps (e.g. Pillow) missing, with no automatic recovery — the user had to manually run ensurepip + reinstall. Drop an install-scoped .update-incomplete breadcrumb right before the dep install and clear it only after core-dependency verification passes. On the next launch (any command except 'update' itself), if the marker is present, unconditionally bootstrap pip via ensurepip then re-run the .[all] install + verification, then clear the marker. Failure leaves the marker for retry and prints the manual recovery command. Never raises — recovery cannot block launch. * fix(update): address review — stderr-only recovery output, single-flight lock, gitignore marker - Route all recovery output (status lines + streamed pip/uv install via fd-level dup2) to stderr so protocol-on-stdout launches (hermes acp) never get install noise on the JSON-RPC stream. - Single-flight O_EXCL lockfile (.update-incomplete.lock) so a gateway start + CLI launch (or two profiles) can't run concurrent installs into the shared venv; stale locks (>1h) are broken for the next launch. - gitignore .update-incomplete + lock so source-tree installs keep a clean git status and update's autostash skips them. - Document why the loose 'update' argv substring match is intentional (over-match defers one launch; under-match would race the real update). - 4 new tests: lock held → skip, stale lock broken, lock released, output lands on stderr only.
This commit is contained in:
parent
15813336cc
commit
a5c32cdf30
3 changed files with 414 additions and 0 deletions
6
.gitignore
vendored
6
.gitignore
vendored
|
|
@ -114,6 +114,12 @@ docs/superpowers/*
|
|||
# treat it as a local edit and autostash it on every run (#38529).
|
||||
.hermes-bootstrap-complete
|
||||
|
||||
# Interrupted-update breadcrumb + recovery lock written next to the shared venv
|
||||
# by `hermes update` / launch-time self-heal. Runtime state, never a code change
|
||||
# — ignore so `git status` stays clean and update's autostash skips them.
|
||||
.update-incomplete
|
||||
.update-incomplete.lock
|
||||
|
||||
# Tool Search live-test harness output — non-deterministic model transcripts,
|
||||
# regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo.
|
||||
scripts/out/
|
||||
|
|
|
|||
|
|
@ -6392,6 +6392,167 @@ def _load_installable_optional_extras(group: str = "all") -> list[str]:
|
|||
return referenced
|
||||
|
||||
|
||||
# Install-scoped breadcrumb dropped right before ``hermes update`` mutates the
|
||||
# venv and cleared only after the dependency install verifies clean. If a user
|
||||
# kills the update mid-install (Ctrl-C, terminal close, WSL OOM), the marker
|
||||
# survives and the next ``hermes`` launch finishes the install instead of
|
||||
# limping along on a half-built venv (e.g. pip wiped, a core dep like Pillow
|
||||
# never landed). Lives next to the venv (not under $HERMES_HOME) because the
|
||||
# venv is shared across all profiles, so a single marker covers every profile.
|
||||
def _update_marker_path() -> Path:
|
||||
return PROJECT_ROOT / ".update-incomplete"
|
||||
|
||||
|
||||
def _write_update_incomplete_marker() -> None:
|
||||
"""Drop the interrupted-install breadcrumb. Never raises."""
|
||||
try:
|
||||
_update_marker_path().write_text(
|
||||
f"started={_time.time()}\npid={os.getpid()}\n", encoding="utf-8"
|
||||
)
|
||||
except OSError as exc:
|
||||
logger.debug("Could not write update-incomplete marker: %s", exc)
|
||||
|
||||
|
||||
def _clear_update_incomplete_marker() -> None:
|
||||
"""Remove the interrupted-install breadcrumb. Never raises."""
|
||||
try:
|
||||
_update_marker_path().unlink()
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
except OSError as exc:
|
||||
logger.debug("Could not clear update-incomplete marker: %s", exc)
|
||||
|
||||
|
||||
def _recover_from_interrupted_install() -> None:
|
||||
"""Finish a dependency install that a prior ``hermes update`` left half-done.
|
||||
|
||||
Triggered on launch when ``.update-incomplete`` is present — meaning the
|
||||
code was pulled but the dep install was killed before it verified clean.
|
||||
Unconditionally bootstraps pip via ``ensurepip`` (a killed ``pip install``
|
||||
can wipe pip from the venv entirely, which blocks the venv from recovering
|
||||
on its own), then re-runs the editable ``.[all]`` install + core-dependency
|
||||
verification, then clears the marker.
|
||||
|
||||
Never raises: a recovery failure must not block launch. If it can't
|
||||
self-heal it prints the one-line manual command and leaves the marker so
|
||||
the next launch tries again.
|
||||
|
||||
Concurrency: the marker lives next to the shared venv, so a gateway start
|
||||
plus a CLI launch (or two profiles starting at once) can both see it. An
|
||||
``O_EXCL`` lockfile ensures only one process runs the reinstall; the
|
||||
others skip and let the winner clear the marker.
|
||||
|
||||
Output: everything — our status lines AND the streamed pip/uv install
|
||||
(which inherits fd 1) — is routed to stderr. Launches whose stdout is a
|
||||
protocol stream (``hermes acp`` speaks JSON-RPC on stdout) must never get
|
||||
install noise on stdout.
|
||||
"""
|
||||
if not _update_marker_path().exists():
|
||||
return
|
||||
|
||||
# Skip in managed/Docker installs and on PyPI installs with no git checkout:
|
||||
# those don't run the source-tree update path, so a stray marker is not ours
|
||||
# to act on. Just clear it.
|
||||
if not (PROJECT_ROOT / "pyproject.toml").is_file():
|
||||
_clear_update_incomplete_marker()
|
||||
return
|
||||
|
||||
# Single-flight guard: atomically claim the recovery lock. If another
|
||||
# process holds it, skip — it is running the same reinstall into the same
|
||||
# shared venv right now. A crashed holder leaves a stale lock; break it
|
||||
# after an hour (well past any realistic install) so recovery can't be
|
||||
# wedged forever.
|
||||
lock_path = PROJECT_ROOT / ".update-incomplete.lock"
|
||||
try:
|
||||
fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
|
||||
os.write(fd, f"{os.getpid()}\n".encode())
|
||||
os.close(fd)
|
||||
except FileExistsError:
|
||||
try:
|
||||
if _time.time() - lock_path.stat().st_mtime > 3600:
|
||||
lock_path.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
return
|
||||
except OSError as exc:
|
||||
# Couldn't create the lock (read-only fs, perms). Proceed unlocked —
|
||||
# the install itself will surface the real problem.
|
||||
logger.debug("Could not create install-recovery lock: %s", exc)
|
||||
|
||||
saved_stdout_fd = None
|
||||
saved_sys_stdout = sys.stdout
|
||||
try:
|
||||
# Route Python-level prints AND subprocess-inherited fd 1 to stderr
|
||||
# for the duration of recovery (see docstring: ACP stdout safety).
|
||||
try:
|
||||
saved_stdout_fd = os.dup(1)
|
||||
os.dup2(2, 1)
|
||||
except OSError:
|
||||
saved_stdout_fd = None
|
||||
sys.stdout = sys.stderr
|
||||
|
||||
print(
|
||||
"⚠ A previous `hermes update` was interrupted mid-install — "
|
||||
"finishing dependency installation now..."
|
||||
)
|
||||
|
||||
try:
|
||||
from hermes_cli.managed_uv import ensure_uv
|
||||
|
||||
# Always bootstrap pip first: a killed install can leave the venv with
|
||||
# no pip module at all, and uv may also be gone. ensurepip restores a
|
||||
# known-good pip so at least the plain-pip path below can proceed.
|
||||
try:
|
||||
subprocess.run(
|
||||
[sys.executable, "-m", "ensurepip", "--upgrade", "--default-pip"],
|
||||
cwd=PROJECT_ROOT,
|
||||
capture_output=True,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("ensurepip during install recovery failed: %s", exc)
|
||||
|
||||
uv_bin = ensure_uv()
|
||||
if uv_bin:
|
||||
uv_env = {**os.environ, "VIRTUAL_ENV": str(PROJECT_ROOT / "venv")}
|
||||
if _is_termux_env(uv_env):
|
||||
uv_env.pop("PYTHONPATH", None)
|
||||
uv_env.pop("PYTHONHOME", None)
|
||||
_install_python_dependencies_with_optional_fallback(
|
||||
[uv_bin, "pip"],
|
||||
env=uv_env,
|
||||
group="termux-all" if _is_termux_env(uv_env) else "all",
|
||||
)
|
||||
else:
|
||||
_install_python_dependencies_with_optional_fallback(
|
||||
[sys.executable, "-m", "pip"],
|
||||
group="termux-all" if _is_termux_env() else "all",
|
||||
)
|
||||
|
||||
_clear_update_incomplete_marker()
|
||||
print("✓ Dependency installation recovered — your install is healthy again.")
|
||||
except Exception as exc:
|
||||
# Leave the marker in place so the next launch retries. Give the user
|
||||
# the exact manual recovery command in the meantime.
|
||||
logger.debug("Interrupted-install recovery failed: %s", exc)
|
||||
print("✗ Could not auto-recover the interrupted install.")
|
||||
print(" Recover manually with:")
|
||||
print(f" cd {PROJECT_ROOT}")
|
||||
print(f" {sys.executable} -m ensurepip --upgrade")
|
||||
print(f" {sys.executable} -m pip install -e '.[all]'")
|
||||
finally:
|
||||
sys.stdout = saved_sys_stdout
|
||||
if saved_stdout_fd is not None:
|
||||
try:
|
||||
os.dup2(saved_stdout_fd, 1)
|
||||
os.close(saved_stdout_fd)
|
||||
except OSError:
|
||||
pass
|
||||
try:
|
||||
lock_path.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _run_install_with_heartbeat(
|
||||
cmd: list[str],
|
||||
*,
|
||||
|
|
@ -8323,6 +8484,13 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
# Reinstall Python dependencies. Prefer .[all], but if one optional extra
|
||||
# breaks on this machine, keep base deps and reinstall the remaining extras
|
||||
# individually so update does not silently strip working capabilities.
|
||||
#
|
||||
# Drop the interrupted-install breadcrumb BEFORE touching the venv. If
|
||||
# the install is killed mid-flight (Ctrl-C, terminal close, WSL OOM),
|
||||
# the marker survives and the next ``hermes`` launch finishes the
|
||||
# install via ``_recover_from_interrupted_install``. Cleared only after
|
||||
# the install + core-dependency verification completes below.
|
||||
_write_update_incomplete_marker()
|
||||
print("→ Updating Python dependencies...")
|
||||
from hermes_cli.managed_uv import ensure_uv, update_managed_uv
|
||||
|
||||
|
|
@ -8376,6 +8544,12 @@ def _cmd_update_impl(args, gateway_mode: bool):
|
|||
_install_psutil_android_compat(pip_cmd)
|
||||
_install_python_dependencies_with_optional_fallback(pip_cmd, group=install_group)
|
||||
|
||||
# Core Python deps installed AND verified (the fallback helper runs
|
||||
# _verify_core_dependencies_installed). Clear the interrupted-install
|
||||
# breadcrumb now — the remaining steps (lazy refresh, node deps, web
|
||||
# UI, desktop rebuild) are non-core and can't brick the venv.
|
||||
_clear_update_incomplete_marker()
|
||||
|
||||
_refresh_active_lazy_features()
|
||||
|
||||
_update_node_dependencies()
|
||||
|
|
@ -10690,6 +10864,22 @@ def main():
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
# Self-heal a venv left half-built by an interrupted ``hermes update``
|
||||
# (Ctrl-C, terminal close, WSL OOM mid-install). Skip when the user is
|
||||
# *running* update — that flow writes and clears its own marker, and we
|
||||
# don't want a recovery install racing the real one. Never raises.
|
||||
#
|
||||
# The substring match is deliberately loose: argv isn't parsed yet at this
|
||||
# point, and the failure modes are asymmetric. Over-matching (e.g.
|
||||
# ``hermes skills install update``) merely defers recovery one launch;
|
||||
# under-matching (missing ``hermes -p work update``) would race a recovery
|
||||
# install against the real one. Loose wins.
|
||||
try:
|
||||
if "update" not in sys.argv[1:]:
|
||||
_recover_from_interrupted_install()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if _try_termux_fast_tui_launch():
|
||||
return
|
||||
if _try_termux_fast_cli_launch():
|
||||
|
|
|
|||
218
tests/hermes_cli/test_update_interrupted_recovery.py
Normal file
218
tests/hermes_cli/test_update_interrupted_recovery.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
"""Tests for interrupted-install self-heal (the ``.update-incomplete`` marker).
|
||||
|
||||
Covers the breadcrumb lifecycle and the launch-time recovery guard added so a
|
||||
``hermes update`` killed mid-install (Ctrl-C, terminal close, WSL OOM) gets
|
||||
finished automatically on the next launch instead of leaving a half-built venv.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import hermes_cli.main as m
|
||||
|
||||
|
||||
def test_marker_round_trip(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
marker = m._update_marker_path()
|
||||
assert marker == tmp_path / ".update-incomplete"
|
||||
assert not marker.exists()
|
||||
|
||||
m._write_update_incomplete_marker()
|
||||
assert marker.exists()
|
||||
body = marker.read_text()
|
||||
assert "started=" in body
|
||||
assert "pid=" in body
|
||||
|
||||
m._clear_update_incomplete_marker()
|
||||
assert not marker.exists()
|
||||
|
||||
|
||||
def test_clear_when_absent_is_noop(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
# Must not raise when the marker was never written.
|
||||
m._clear_update_incomplete_marker()
|
||||
assert not m._update_marker_path().exists()
|
||||
|
||||
|
||||
def test_recovery_noop_without_marker(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
called = {"install": False}
|
||||
monkeypatch.setattr(
|
||||
m,
|
||||
"_install_python_dependencies_with_optional_fallback",
|
||||
lambda *a, **k: called.__setitem__("install", True),
|
||||
)
|
||||
m._recover_from_interrupted_install()
|
||||
assert called["install"] is False, "recovery must not install when no marker"
|
||||
|
||||
|
||||
def test_recovery_clears_stray_marker_without_pyproject(tmp_path, monkeypatch):
|
||||
# No pyproject.toml (PyPI/Docker install) — a stray marker is not ours to
|
||||
# act on; recovery should just clear it without trying to install.
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
m._write_update_incomplete_marker()
|
||||
called = {"install": False}
|
||||
monkeypatch.setattr(
|
||||
m,
|
||||
"_install_python_dependencies_with_optional_fallback",
|
||||
lambda *a, **k: called.__setitem__("install", True),
|
||||
)
|
||||
m._recover_from_interrupted_install()
|
||||
assert called["install"] is False
|
||||
assert not m._update_marker_path().exists()
|
||||
|
||||
|
||||
def test_recovery_runs_install_and_clears_marker(tmp_path, monkeypatch):
|
||||
# Source-tree install (pyproject present) with marker set → recovery should
|
||||
# run the dep install and clear the marker on success.
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
|
||||
m._write_update_incomplete_marker()
|
||||
|
||||
seen = {"ensurepip": False, "install": False}
|
||||
|
||||
def fake_run(cmd, *a, **k):
|
||||
if "ensurepip" in cmd:
|
||||
seen["ensurepip"] = True
|
||||
|
||||
class R:
|
||||
returncode = 0
|
||||
|
||||
return R()
|
||||
|
||||
monkeypatch.setattr(m.subprocess, "run", fake_run)
|
||||
monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
|
||||
monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
|
||||
monkeypatch.setattr(
|
||||
m,
|
||||
"_install_python_dependencies_with_optional_fallback",
|
||||
lambda *a, **k: seen.__setitem__("install", True),
|
||||
)
|
||||
|
||||
m._recover_from_interrupted_install()
|
||||
|
||||
assert seen["ensurepip"] is True, "ensurepip must run unconditionally first"
|
||||
assert seen["install"] is True, "dep install must run"
|
||||
assert not m._update_marker_path().exists(), "marker cleared on success"
|
||||
|
||||
|
||||
def test_recovery_keeps_marker_on_failure(tmp_path, monkeypatch):
|
||||
# If the install itself blows up, the marker must survive so the next
|
||||
# launch retries — and recovery must not raise.
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
|
||||
m._write_update_incomplete_marker()
|
||||
|
||||
class R:
|
||||
returncode = 0
|
||||
|
||||
monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
|
||||
monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
|
||||
monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
|
||||
|
||||
def boom(*a, **k):
|
||||
raise RuntimeError("install died")
|
||||
|
||||
monkeypatch.setattr(
|
||||
m, "_install_python_dependencies_with_optional_fallback", boom
|
||||
)
|
||||
|
||||
# Must not raise.
|
||||
m._recover_from_interrupted_install()
|
||||
assert m._update_marker_path().exists(), "marker preserved for retry on failure"
|
||||
|
||||
|
||||
def _stub_install_env(monkeypatch, m, seen):
|
||||
"""Common stubs so recovery's install path is inert and observable."""
|
||||
|
||||
class R:
|
||||
returncode = 0
|
||||
|
||||
monkeypatch.setattr(m.subprocess, "run", lambda *a, **k: R())
|
||||
monkeypatch.setattr(m, "_is_termux_env", lambda *a, **k: False)
|
||||
monkeypatch.setattr("hermes_cli.managed_uv.ensure_uv", lambda: None)
|
||||
monkeypatch.setattr(
|
||||
m,
|
||||
"_install_python_dependencies_with_optional_fallback",
|
||||
lambda *a, **k: seen.__setitem__("install", True),
|
||||
)
|
||||
|
||||
|
||||
def test_recovery_skips_when_lock_held(tmp_path, monkeypatch):
|
||||
# Another process is mid-recovery (fresh lockfile) — this launch must skip
|
||||
# the install entirely and leave both marker and lock untouched.
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
|
||||
m._write_update_incomplete_marker()
|
||||
lock = tmp_path / ".update-incomplete.lock"
|
||||
lock.write_text("12345\n")
|
||||
|
||||
seen = {"install": False}
|
||||
_stub_install_env(monkeypatch, m, seen)
|
||||
|
||||
m._recover_from_interrupted_install()
|
||||
|
||||
assert seen["install"] is False, "must not install while another holds the lock"
|
||||
assert m._update_marker_path().exists(), "marker left for the lock holder"
|
||||
assert lock.exists(), "fresh lock must not be broken"
|
||||
|
||||
|
||||
def test_recovery_breaks_stale_lock(tmp_path, monkeypatch):
|
||||
# A lock older than an hour is from a crashed holder — it gets removed so
|
||||
# the NEXT launch can recover (this launch still skips).
|
||||
import os as _os
|
||||
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
|
||||
m._write_update_incomplete_marker()
|
||||
lock = tmp_path / ".update-incomplete.lock"
|
||||
lock.write_text("12345\n")
|
||||
stale = m._time.time() - 7200
|
||||
_os.utime(lock, (stale, stale))
|
||||
|
||||
seen = {"install": False}
|
||||
_stub_install_env(monkeypatch, m, seen)
|
||||
|
||||
m._recover_from_interrupted_install()
|
||||
|
||||
assert not lock.exists(), "stale lock must be broken"
|
||||
assert m._update_marker_path().exists()
|
||||
|
||||
# Next launch proceeds normally.
|
||||
m._recover_from_interrupted_install()
|
||||
assert seen["install"] is True
|
||||
assert not m._update_marker_path().exists()
|
||||
assert not lock.exists(), "lock released after recovery"
|
||||
|
||||
|
||||
def test_recovery_releases_lock_after_run(tmp_path, monkeypatch):
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
|
||||
m._write_update_incomplete_marker()
|
||||
|
||||
seen = {"install": False}
|
||||
_stub_install_env(monkeypatch, m, seen)
|
||||
|
||||
m._recover_from_interrupted_install()
|
||||
|
||||
assert seen["install"] is True
|
||||
assert not (tmp_path / ".update-incomplete.lock").exists()
|
||||
|
||||
|
||||
def test_recovery_output_goes_to_stderr(tmp_path, monkeypatch, capfd):
|
||||
# ACP speaks JSON-RPC on stdout — recovery output (including the streamed
|
||||
# install, which inherits fd 1) must land on stderr only.
|
||||
monkeypatch.setattr(m, "PROJECT_ROOT", tmp_path)
|
||||
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n")
|
||||
m._write_update_incomplete_marker()
|
||||
|
||||
seen = {"install": False}
|
||||
_stub_install_env(monkeypatch, m, seen)
|
||||
|
||||
m._recover_from_interrupted_install()
|
||||
|
||||
out, err = capfd.readouterr()
|
||||
assert "interrupted mid-install" not in out
|
||||
assert "interrupted mid-install" in err
|
||||
assert "recovered" in err
|
||||
Loading…
Add table
Add a link
Reference in a new issue