From cbd6ba1bdd916d8342f6c741cb290b62dd89dbc4 Mon Sep 17 00:00:00 2001 From: Ben Date: Tue, 23 Jun 2026 20:47:01 +1000 Subject: [PATCH] fix(docker): redirect lazy installs to a durable target so opt-in backends work in the immutable image (#51136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The published Docker image seals the agent venv (root-owned, read-only /opt/hermes) and sets HERMES_DISABLE_LAZY_INSTALLS=1 so a runtime install can't mutate and brick the core. But opt-in backends (Firecrawl web search, Exa, Feishu, ...) deliberately keep their SDKs in tools/lazy_deps.py and out of [all] (pyproject policy 2026-05-12: one quarantined release must not break every install). The two policies collided: the SDK isn't baked in AND can't lazy-install, so the default Firecrawl web_search/web_extract fail out of the box in Docker (#51136), as do Exa (#49445) and Feishu (#50205). Fix the whole class instead of baking in one backend: when HERMES_LAZY_INSTALL_TARGET is set, lazy installs are redirected to a writable dir on the durable /opt/data volume via `pip/uv install --target`, and that dir is APPENDED to the end of sys.path. Because the core venv always wins name collisions, a package installed this way can only ADD new modules — it can never shadow, downgrade, or break a module the core ships. The worst a bad/incompatible backend package can do is fail to import and report itself unavailable; the agent core stays healthy. That structural guarantee is what made it safe to seal the venv, and it is preserved here even with installs re-enabled. 
- tools/lazy_deps.py: durable-target mode — `--target` install + core-pinned `--constraint` file (shared deps resolve to core's versions, conflicts fail loudly at install time), append-only sys.path activation, ABI/Python-version stamp that wipes the store if an image rebuild bumps the interpreter, and a reworked gate so HERMES_DISABLE_LAZY_INSTALLS=1 redirects (rather than hard-blocks) when a target is set. security.allow_lazy_installs=false still disables installs in every mode. - hermes_bootstrap.py: activate the durable target on sys.path at first import (before any backend imports its SDK) so packages installed on a previous run are importable on this run. - Dockerfile: set HERMES_LAZY_INSTALL_TARGET=/opt/data/lazy-packages. - docker/stage2-hook.sh: seed + chown the dir on the data volume. - tests: real-install E2E proving installs land in the target, import cleanly, don't leak into the sealed venv, and that a core package is never shadowed; ABI-stamp wipe/preserve; gate matrix; Dockerfile/stage2 contract test. Fixes #51136 --- Dockerfile | 13 + docker/stage2-hook.sh | 16 +- hermes_bootstrap.py | 30 ++ .../test_dockerfile_immutable_install.py | 33 ++ tests/tools/test_lazy_deps_durable_target.py | 260 +++++++++++++ tools/lazy_deps.py | 368 +++++++++++++++--- 6 files changed, 660 insertions(+), 60 deletions(-) create mode 100644 tests/tools/test_lazy_deps_durable_target.py diff --git a/Dockerfile b/Dockerfile index b4ebd093697..c01de9857bb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -290,6 +290,19 @@ ENV HERMES_TUI_DIR=/opt/hermes/ui-tui ENV HERMES_HOME=/opt/data ENV HERMES_WRITE_SAFE_ROOT=/opt/data ENV HERMES_DISABLE_LAZY_INSTALLS=1 +# The published image seals /opt/hermes (root-owned, read-only) so a runtime +# lazy install can't mutate the agent's own venv and brick it. 
But opt-in +# backends (Firecrawl web search, Exa, Feishu, …) keep their SDKs in +# tools/lazy_deps.py — deliberately NOT baked into [all] (see pyproject.toml +# policy 2026-05-12: one quarantined release must not break every install). +# Redirect those lazy installs to a writable dir on the durable data volume. +# lazy_deps appends this dir to the END of sys.path, so a package installed +# here can only ADD modules — it can never shadow or downgrade a core module, +# so the sealed-venv guarantee holds even with installs re-enabled. The dir +# is seeded + chowned to the hermes user by docker/stage2-hook.sh and lives +# on the /opt/data volume, so it persists across container recreates / image +# updates (an ABI stamp invalidates it if a rebuild bumps the interpreter). +ENV HERMES_LAZY_INSTALL_TARGET=/opt/data/lazy-packages # `docker exec` privilege-drop shim. When operators run # `docker exec hermes ...` they default to root, and any file the diff --git a/docker/stage2-hook.sh b/docker/stage2-hook.sh index 54b3bb0ac96..8d7258991e1 100755 --- a/docker/stage2-hook.sh +++ b/docker/stage2-hook.sh @@ -199,7 +199,7 @@ if [ "$needs_chown" = true ]; then # Hermes-owned subdirs: recursive chown is safe here because these are # created and managed exclusively by hermes (see the s6-setuidgid mkdir # -p block below for the canonical list). - for sub in cron sessions logs hooks memories skills skins plans workspace home profiles pairing platforms/pairing; do + for sub in cron sessions logs hooks memories skills skins plans workspace home profiles pairing platforms/pairing lazy-packages; do if [ -e "$HERMES_HOME/$sub" ]; then chown -R hermes:hermes "$HERMES_HOME/$sub" 2>/dev/null || \ echo "[stage2] Warning: chown $HERMES_HOME/$sub failed (rootless container?) — continuing" @@ -214,6 +214,17 @@ fi # HERMES_DISABLE_LAZY_INSTALLS=1. 
Keeping /opt/hermes root-owned and # non-writable prevents an agent session from self-modifying the installed # source, venv, TUI bundle, or node_modules and bricking the gateway. +# +# Lazy-installable optional backends (Firecrawl, Exa, Feishu, etc.) cannot +# install into the sealed venv, so they are redirected to the writable +# $HERMES_HOME/lazy-packages dir on the data volume (Dockerfile sets +# HERMES_LAZY_INSTALL_TARGET). That dir is appended to the END of sys.path, +# so a package installed there can only ADD modules — it can never shadow or +# break a core module, which is what keeps the sealed-venv guarantee intact +# even though installs are re-enabled. The dir is seeded + chowned to hermes +# in the mkdir/chown blocks above so first-use installs succeed as the +# unprivileged runtime user, and it persists across container recreates / +# image updates (an ABI stamp wipes it if a rebuild bumps the interpreter). # Always reset ownership of $HERMES_HOME/profiles to hermes on every # boot. Profile dirs and files can land owned by root when commands @@ -289,7 +300,8 @@ as_hermes mkdir -p \ "$HERMES_HOME/workspace" \ "$HERMES_HOME/home" \ "$HERMES_HOME/pairing" \ - "$HERMES_HOME/platforms/pairing" + "$HERMES_HOME/platforms/pairing" \ + "$HERMES_HOME/lazy-packages" # --- Install-method stamp --- # The 'docker' stamp is baked into the immutable install tree at diff --git a/hermes_bootstrap.py b/hermes_bootstrap.py index e43d9db80b9..ae23cc97629 100644 --- a/hermes_bootstrap.py +++ b/hermes_bootstrap.py @@ -158,8 +158,38 @@ def harden_import_path(src_root: str | None = None) -> None: sys.path.insert(0, root) +def activate_durable_lazy_target() -> None: + """Put the durable lazy-install dir on ``sys.path`` if one is configured. + + On immutable Docker images the agent venv is sealed and lazy installs + are redirected to a writable dir on the data volume + (``HERMES_LAZY_INSTALL_TARGET``, e.g. ``/opt/data/lazy-packages``). 
+ Packages installed there on a previous run must be importable on this + run, so we activate the dir here — at the very first import, before any + backend module imports its SDK. + + The activation appends to the END of ``sys.path`` so the core venv + always wins name collisions (see ``tools.lazy_deps`` for the full + security rationale). Never raises; a missing/empty target is a no-op. + """ + if not os.environ.get("HERMES_LAZY_INSTALL_TARGET", "").strip(): + return + try: + from tools import lazy_deps + lazy_deps.activate_durable_lazy_target() + except Exception: + # Bootstrap must never crash an entry point. If activation fails the + # backend simply reports itself unavailable, exactly as before. + pass + + # Apply on import — entry points just need ``import hermes_bootstrap`` # (or ``from hermes_bootstrap import apply_windows_utf8_bootstrap``) at # the very top of their module, before importing anything else. The # import side effect does the right thing. apply_windows_utf8_bootstrap() + +# Activate the durable lazy-install target (immutable Docker images) so +# packages installed into the data volume on a previous run are importable +# this run, before any backend module imports its SDK. No-op when unset. +activate_durable_lazy_target() diff --git a/tests/tools/test_dockerfile_immutable_install.py b/tests/tools/test_dockerfile_immutable_install.py index fc7804f5d63..ffa039a854c 100644 --- a/tests/tools/test_dockerfile_immutable_install.py +++ b/tests/tools/test_dockerfile_immutable_install.py @@ -84,3 +84,36 @@ def test_dockerfile_bakes_code_scoped_install_method_stamp() -> None: "the code-scoped install-method stamp must be baked inside the " "immutable /opt/hermes block" ) + + +def test_dockerfile_redirects_lazy_installs_to_durable_target() -> None: + """Immutable image seals the venv but redirects lazy installs to the + writable data volume, so opt-in backends still install at first use + without being able to break the sealed core. 
+ + Guards the contract between the Dockerfile env var, the stage2-hook + seeding, and tools/lazy_deps.py — these three must agree on the path. + """ + text = _dockerfile_text() + target = "/opt/data/lazy-packages" + + # The redirect target must be set AND must live under the data volume, + # never under the immutable /opt/hermes tree. + assert f"ENV HERMES_LAZY_INSTALL_TARGET={target}" in text + assert target.startswith("/opt/data/"), "target must be on the durable volume" + assert "ENV HERMES_LAZY_INSTALL_TARGET=/opt/hermes" not in text + + # The seal flag must still be present — the redirect rides on top of it, + # it does not replace it. + assert "ENV HERMES_DISABLE_LAZY_INSTALLS=1" in text + + # stage2-hook must seed + chown the target dir so first-use installs + # succeed as the unprivileged hermes runtime user. + stage2 = (REPO_ROOT / "docker" / "stage2-hook.sh").read_text() + assert '"$HERMES_HOME/lazy-packages"' in stage2, ( + "stage2-hook.sh must create the lazy-packages dir on the data volume" + ) + assert "lazy-packages" in stage2.split("for sub in", 1)[1].split(";", 1)[0], ( + "lazy-packages must be in the per-boot chown subdir list so it stays " + "hermes-owned" + ) diff --git a/tests/tools/test_lazy_deps_durable_target.py b/tests/tools/test_lazy_deps_durable_target.py new file mode 100644 index 00000000000..4532c5e0f01 --- /dev/null +++ b/tests/tools/test_lazy_deps_durable_target.py @@ -0,0 +1,260 @@ +"""Tests for the durable lazy-install target (immutable Docker images). + +These cover the mechanism that lets opt-in backends lazy-install on the +sealed-venv Docker image without being able to break the agent core: +installs are redirected to a writable dir on the data volume, and that dir +is appended to the END of ``sys.path`` so the core venv always wins name +collisions. 
+ +The headline invariant — *a package in the durable store can never shadow +a core module* — is proved with a REAL install into a temp target (no +mocked pip), exercising the actual ``--target`` + sys.path-append path. +That E2E test is guarded by network availability; everything else is pure +unit logic with no network. +""" + +from __future__ import annotations + +import os +import subprocess +import sys +import sysconfig +from pathlib import Path + +import pytest + +from tools import lazy_deps as ld + + +# --------------------------------------------------------------------------- +# Target resolution + gating +# --------------------------------------------------------------------------- + + +class TestTargetResolution: + def test_no_target_when_env_unset(self, monkeypatch): + monkeypatch.delenv(ld._LAZY_TARGET_ENV, raising=False) + assert ld._lazy_install_target() is None + + def test_no_target_when_env_blank(self, monkeypatch): + monkeypatch.setenv(ld._LAZY_TARGET_ENV, " ") + assert ld._lazy_install_target() is None + + def test_target_resolved_when_set(self, monkeypatch, tmp_path): + monkeypatch.setenv(ld._LAZY_TARGET_ENV, str(tmp_path / "lazy")) + assert ld._lazy_install_target() == tmp_path / "lazy" + + +class TestGatingWithTarget: + """``HERMES_DISABLE_LAZY_INSTALLS=1`` must STOP blocking once a durable + target is configured — the redirect is the safe path — but the config + kill switch still wins in every mode.""" + + def test_disable_env_blocks_without_target(self, monkeypatch): + monkeypatch.setenv("HERMES_DISABLE_LAZY_INSTALLS", "1") + monkeypatch.delenv(ld._LAZY_TARGET_ENV, raising=False) + # config unreadable → fails open on the config check, but the sealed + # env var with no target still blocks. 
+ monkeypatch.setattr( + "hermes_cli.config.load_config", lambda: {}, raising=False + ) + assert ld._allow_lazy_installs() is False + + def test_disable_env_allows_with_target(self, monkeypatch, tmp_path): + monkeypatch.setenv("HERMES_DISABLE_LAZY_INSTALLS", "1") + monkeypatch.setenv(ld._LAZY_TARGET_ENV, str(tmp_path)) + monkeypatch.setattr( + "hermes_cli.config.load_config", lambda: {}, raising=False + ) + assert ld._allow_lazy_installs() is True + + def test_config_killswitch_wins_even_with_target(self, monkeypatch, tmp_path): + # Explicit opt-out must disable installs even when a target exists. + monkeypatch.setenv("HERMES_DISABLE_LAZY_INSTALLS", "1") + monkeypatch.setenv(ld._LAZY_TARGET_ENV, str(tmp_path)) + monkeypatch.setattr( + "hermes_cli.config.load_config", + lambda: {"security": {"allow_lazy_installs": False}}, + raising=False, + ) + assert ld._allow_lazy_installs() is False + + def test_normal_mode_unaffected(self, monkeypatch): + # No sealed env, no target → default allow (unchanged behaviour). + monkeypatch.delenv("HERMES_DISABLE_LAZY_INSTALLS", raising=False) + monkeypatch.delenv(ld._LAZY_TARGET_ENV, raising=False) + monkeypatch.setattr( + "hermes_cli.config.load_config", lambda: {}, raising=False + ) + assert ld._allow_lazy_installs() is True + + +# --------------------------------------------------------------------------- +# ABI stamp / durable-store rebuild safety +# --------------------------------------------------------------------------- + + +class TestAbiStamp: + def test_creates_dir_and_stamp(self, tmp_path): + target = tmp_path / "lazy" + err = ld._ensure_target_ready(target) + assert err is None + assert target.is_dir() + stamp = target / ld._TARGET_STAMP_NAME + assert stamp.read_text().strip() == ld._python_abi_tag() + + def test_matching_stamp_preserves_contents(self, tmp_path): + target = tmp_path / "lazy" + ld._ensure_target_ready(target) + # Drop a fake installed package. 
+ (target / "somepkg").mkdir() + (target / "somepkg" / "__init__.py").write_text("x = 1\n") + # Re-run with the SAME abi → contents must survive. + err = ld._ensure_target_ready(target) + assert err is None + assert (target / "somepkg" / "__init__.py").exists() + + def test_mismatched_stamp_wipes_contents(self, tmp_path): + target = tmp_path / "lazy" + ld._ensure_target_ready(target) + (target / "stalepkg").mkdir() + (target / "stalepkg" / "mod.py").write_text("x = 1\n") + # Simulate an image rebuild onto a different interpreter ABI. + (target / ld._TARGET_STAMP_NAME).write_text("2.7:old-abi-tag") + err = ld._ensure_target_ready(target) + assert err is None + # Stale package wiped; stamp refreshed to current ABI. + assert not (target / "stalepkg").exists() + assert (target / ld._TARGET_STAMP_NAME).read_text().strip() == ld._python_abi_tag() + + def test_readonly_target_reports_error(self, tmp_path): + # A path under a non-writable parent should surface a clean error, + # not raise. + ro_parent = tmp_path / "ro" + ro_parent.mkdir() + os.chmod(ro_parent, 0o500) + try: + err = ld._ensure_target_ready(ro_parent / "lazy") + assert err is not None + assert "not writable" in err + finally: + os.chmod(ro_parent, 0o700) # let pytest clean up + + +# --------------------------------------------------------------------------- +# sys.path append ordering (the core-wins invariant, unit level) +# --------------------------------------------------------------------------- + + +class TestSysPathAppend: + def test_target_appended_not_prepended(self, tmp_path, monkeypatch): + target = tmp_path / "lazy" + target.mkdir() + saved = list(sys.path) + try: + ld._activate_target_on_syspath(target) + assert str(target) in sys.path + # Must be at/after every pre-existing entry — i.e. core wins. 
+ idx = sys.path.index(str(target)) + assert idx >= len(saved), ( + "durable target must be appended after all core entries" + ) + finally: + sys.path[:] = saved + + def test_activation_idempotent(self, tmp_path, monkeypatch): + target = tmp_path / "lazy" + target.mkdir() + saved = list(sys.path) + try: + ld._activate_target_on_syspath(target) + ld._activate_target_on_syspath(target) + assert sys.path.count(str(target)) == 1 + finally: + sys.path[:] = saved + + +# --------------------------------------------------------------------------- +# E2E: a REAL install into a durable target cannot shadow core. +# --------------------------------------------------------------------------- + + +def _network_available() -> bool: + try: + import urllib.request + urllib.request.urlopen("https://pypi.org/simple/", timeout=5) + return True + except Exception: + return False + + +@pytest.mark.skipif(not _network_available(), reason="needs PyPI network access") +class TestRealInstallCoreWins: + """Install a real package into a durable target and prove: + + 1. It lands in the target dir, NOT the core venv. + 2. It is importable via the appended sys.path entry. + 3. A package name that ALSO exists in core resolves to the CORE copy, + never the durable-store copy (the structural anti-shadow guarantee). + """ + + def test_install_lands_in_target_and_imports(self, tmp_path, monkeypatch): + target = tmp_path / "lazy-packages" + monkeypatch.setenv(ld._LAZY_TARGET_ENV, str(target)) + # 'isodate' is tiny, pure-python, and not shipped in the core venv, + # so a successful import must resolve to the durable target. + result = ld._venv_pip_install(("isodate==0.7.2",)) + assert result.success, f"install failed: {result.stderr}" + # Landed in the durable target, not the core venv. + installed = list(target.glob("isodate*")) + assert installed, f"isodate not found under target {target}: {list(target.iterdir())}" + # Importable now that the target is on sys.path. 
+ import importlib + importlib.invalidate_caches() + mod = importlib.import_module("isodate") + assert mod.__file__ is not None + assert Path(mod.__file__).is_relative_to(target) + + def test_core_package_is_not_shadowed(self, tmp_path, monkeypatch): + """Force-install an OLD version of a package the core already ships + into the durable target, then assert the running interpreter still + imports the CORE version — proving append-ordering protects core. + + We use 'packaging', which is always present in the venv (transitive + of pip/build tooling). We install a deliberately old pin into the + target and check the resolved module path + version is core's. + """ + import packaging # core copy + core_path = Path(packaging.__file__).parent + core_version = __import__("importlib.metadata", fromlist=["version"]).version( + "packaging" + ) + + target = tmp_path / "lazy-packages" + monkeypatch.setenv(ld._LAZY_TARGET_ENV, str(target)) + # Install an old packaging into the target WITHOUT the core + # constraints file (bypass the tidy resolver) so a shadow copy + # genuinely exists on disk in the target — the worst case. + ld._ensure_target_ready(target) + subprocess.run( + [sys.executable, "-m", "pip", "install", "--target", str(target), + "--no-deps", "packaging==20.9"], + check=True, capture_output=True, text=True, + ) + assert list(target.glob("packaging*")), "shadow copy should exist on disk" + + # Activate the target (append) and re-resolve. + ld._activate_target_on_syspath(target) + import importlib + importlib.invalidate_caches() + importlib.reload(packaging) + # Core path + version must still win. 
+ assert Path(packaging.__file__).parent == core_path, ( + "durable-store copy shadowed the core module — append ordering broke" + ) + new_version = __import__("importlib.metadata", fromlist=["version"]).version( + "packaging" + ) + assert new_version == core_version, ( + f"metadata resolved to shadow version {new_version}, expected core {core_version}" + ) diff --git a/tools/lazy_deps.py b/tools/lazy_deps.py index b7883aabafb..cec730dcb2b 100644 --- a/tools/lazy_deps.py +++ b/tools/lazy_deps.py @@ -24,8 +24,24 @@ remediation hint pointing at ``hermes tools`` or the manual pip command. Security model: -* **Venv-scoped only.** Installs target ``sys.executable`` in the active - venv. We never touch the system Python. +* **Venv-scoped by default.** Installs target ``sys.executable`` in the + active venv. We never touch the system Python. +* **Durable-target mode (immutable images).** When the deployment seals the + agent's own venv (the Docker image sets ``HERMES_DISABLE_LAZY_INSTALLS=1`` + and makes ``/opt/hermes`` read-only), setting + ``HERMES_LAZY_INSTALL_TARGET`` redirects lazy installs to a writable + directory on the durable data volume (e.g. ``/opt/data/lazy-packages``). + That directory is **appended to the end of ``sys.path``** — never + prepended, never exported via ``PYTHONPATH`` — so the agent's own + site-packages wins every name collision. A package installed this way can + only ADD new importable modules; it can never shadow, downgrade, or break + a module the core already ships. The worst a bad/incompatible backend + package can do is fail to import and report itself unavailable — the agent + core stays healthy. This is the structural guarantee that a lazily + installed package cannot brick Hermes, which is what made it safe to seal + the venv in the first place. Compiled-wheel safety across image rebuilds + is handled by an ABI/Python-version stamp on the target subdir (see + :func:`_ensure_target_ready`). 
* **PyPI by package name only.** Specs may be ``"package>=1.0,<2"`` etc. We do NOT support ``--index-url`` overrides, ``git+https://``, file: paths, or any other input that could be hijacked by a malicious config. @@ -33,9 +49,9 @@ Security model: installed via this path. A typo in feature name doesn't get the user install-anything semantics. * **Opt-out.** Setting ``security.allow_lazy_installs: false`` in - ``config.yaml`` disables runtime installs. Users in restricted networks - or strict security postures can pin themselves to whatever was installed - at setup time. + ``config.yaml`` disables runtime installs in BOTH modes. Users in + restricted networks or strict security postures can pin themselves to + whatever was installed at setup time. * **Offline detection.** If the install fails (offline, mirror down, PyPI 404 / quarantine), we surface the failure as :class:`FeatureUnavailable` with the actual pip stderr — no silent @@ -55,8 +71,10 @@ import logging import os import re import shutil +import site import subprocess import sys +import sysconfig from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, Optional @@ -243,23 +261,171 @@ class _InstallResult: # ============================================================================= +# Environment variable that redirects lazy installs away from the (sealed) +# agent venv and into a writable directory on a durable volume. Set by the +# Docker image to /opt/data/lazy-packages. This is an internal bridge var, +# not user-facing config: the user-facing knob remains +# security.allow_lazy_installs in config.yaml. When unset, lazy installs go +# into the active venv as before. +_LAZY_TARGET_ENV = "HERMES_LAZY_INSTALL_TARGET" + +# Name of the stamp file written into the target dir recording the Python +# X.Y + ABI it was populated for. 
If a container rebuild bumps the +# interpreter, compiled wheels (.so) in the durable store would be ABI- +# incompatible; we detect the mismatch and wipe the store so packages get +# re-resolved against the new interpreter rather than importing a stale .so. +_TARGET_STAMP_NAME = ".python-abi" + + +def _python_abi_tag() -> str: + """A stable token identifying the running interpreter's ABI. + + Combines the X.Y version with the EXT_SUFFIX (which encodes the ABI + tag and platform, e.g. ``cpython-313-x86_64-linux-gnu``). Two + interpreters that can share compiled wheels produce the same token. + """ + ver = f"{sys.version_info.major}.{sys.version_info.minor}" + ext = sysconfig.get_config_var("EXT_SUFFIX") or "" + return f"{ver}:{ext}" + + +def _lazy_install_target() -> Optional[Path]: + """Return the durable install-target dir, or None for venv-scoped mode. + + Returns a path only when :data:`_LAZY_TARGET_ENV` is set to a non-empty + value. The directory is created on demand by :func:`_ensure_target_ready`. + """ + raw = os.environ.get(_LAZY_TARGET_ENV, "").strip() + if not raw: + return None + return Path(raw) + + +def _ensure_target_ready(target: Path) -> Optional[str]: + """Create the target dir and validate its ABI stamp. + + If the stamp is missing it is written. If it is present but records a + different interpreter ABI than the one now running (e.g. the container + image was rebuilt onto a newer Python), the directory's contents are + wiped and the stamp rewritten, so stale compiled wheels can't be + imported against an incompatible interpreter. + + Returns ``None`` on success, or an error string if the directory can't + be created / written (e.g. read-only mount, permission error). 
+ """ + want = _python_abi_tag() + stamp = target / _TARGET_STAMP_NAME + try: + if target.exists(): + have = "" + try: + have = stamp.read_text(encoding="utf-8").strip() + except (OSError, FileNotFoundError): + have = "" + if have and have != want: + logger.info( + "Lazy install target %s was built for ABI %r but running " + "ABI is %r; wiping stale packages.", + target, have, want, + ) + for child in target.iterdir(): + if child.is_dir() and not child.is_symlink(): + shutil.rmtree(child, ignore_errors=True) + else: + try: + child.unlink() + except OSError: + pass + target.mkdir(parents=True, exist_ok=True) + stamp.write_text(want, encoding="utf-8") + except OSError as e: + return f"lazy install target {target} is not writable: {e}" + return None + + +def _activate_target_on_syspath(target: Path) -> None: + """Append the durable target to ``sys.path`` so its packages import. + + Appended to the END (never prepended) so the agent's own venv + site-packages takes precedence on every name collision. Idempotent. + Uses :func:`site.addsitedir` so ``.pth`` files (namespace packages, + editable installs) inside the target are honoured, then enforces the + append ordering — ``addsitedir`` would otherwise insert near the front. + """ + target_str = str(target) + # Snapshot existing entries so we can restore precedence afterwards. + before = list(sys.path) + if target_str not in before: + site.addsitedir(target_str) + # site.addsitedir may have inserted target (and any .pth-added dirs) at + # the front. Move every newly-added entry to the end, preserving the + # core venv's precedence. New entries are those not present `before`. + new_entries = [p for p in sys.path if p not in before] + if new_entries: + sys.path[:] = [p for p in sys.path if p not in new_entries] + new_entries + # importlib.metadata caches the path-based distribution finder; clear it + # so a just-activated dir is visible to version() checks this process. 
+ try: + import importlib + importlib.invalidate_caches() + except Exception: + pass + + +def activate_durable_lazy_target() -> None: + """Public: wire the durable lazy-install target onto ``sys.path``. + + Safe no-op when :data:`_LAZY_TARGET_ENV` is unset or the directory does + not yet exist. Called once early in process startup (before backends + import) so packages installed into the durable store on a previous run + are importable on this run. Never raises. + """ + target = _lazy_install_target() + if target is None: + return + try: + if target.exists(): + _activate_target_on_syspath(target) + except Exception as e: # pragma: no cover - defensive + logger.debug("Failed to activate durable lazy target %s: %s", target, e) + + def _allow_lazy_installs() -> bool: - """Return the ``security.allow_lazy_installs`` config flag. + """Return whether lazy installs are permitted in this environment. + + Resolution order: + + 1. ``security.allow_lazy_installs: false`` in config.yaml is an absolute + opt-out — it disables installs in BOTH venv-scoped and durable-target + modes. This is the user-facing kill switch. + 2. ``HERMES_DISABLE_LAZY_INSTALLS=1`` seals the *agent venv* (set by the + immutable Docker image). It blocks venv-scoped installs — UNLESS a + durable install target is configured, in which case installs are + redirected there (a path that structurally cannot break the sealed + venv) and are therefore allowed. Defaults to True. If config is unreadable we fail open (allow), because refusing to install would lock people out of their own backends; the decision to block is an explicit user opt-in. """ - if os.environ.get("HERMES_DISABLE_LAZY_INSTALLS") == "1": - return False + # (1) Config kill switch wins in every mode. 
try: from hermes_cli.config import load_config cfg = load_config() except Exception: - return True - sec = cfg.get("security") or {} - val = sec.get("allow_lazy_installs", True) - return bool(val) + cfg = None + if cfg is not None: + sec = cfg.get("security") or {} + if not bool(sec.get("allow_lazy_installs", True)): + return False + + # (2) Sealed-venv env var: blocks ONLY when there is no safe durable + # target to redirect into. With a target set, the install goes to the + # data volume (append-only on sys.path), so the seal is preserved. + if os.environ.get("HERMES_DISABLE_LAZY_INSTALLS") == "1": + return _lazy_install_target() is not None + + return True def _spec_is_safe(spec: str) -> bool: @@ -361,8 +527,66 @@ def _is_present(spec: str) -> bool: return False +def _core_constraints_file() -> Optional[Path]: + """Write a pip constraints file pinning every package already importable + in the core environment to its installed version. + + Passed as ``--constraint`` for durable-target installs so the resolver + pins shared transitive deps (httpx, pydantic, aiohttp, …) to the exact + versions the core venv already ships, instead of pulling newer copies + into the durable store. Two payoffs: + + * The durable store stays minimal — only genuinely-new packages land + there; shared deps resolve to "already satisfied" against core. + * A backend that *requires* a version conflicting with core fails loudly + at install time (resolver conflict) rather than silently installing a + shadowed copy that can never win on sys.path anyway. + + Returns the path to a temp constraints file, or None if enumeration + failed (in which case the caller installs without constraints — still + safe, just less tidy). 
+ """ + try: + from importlib.metadata import distributions + except ImportError: + return None + try: + import tempfile + lines = [] + seen = set() + for dist in distributions(): + name = dist.metadata["Name"] if dist.metadata else None + ver = dist.version + if not name or not ver: + continue + key = name.lower() + if key in seen: + continue + seen.add(key) + lines.append(f"{name}=={ver}") + if not lines: + return None + fd, path = tempfile.mkstemp(prefix="hermes-core-constraints-", suffix=".txt") + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write("\n".join(sorted(lines)) + "\n") + return Path(path) + except Exception as e: + logger.debug("Could not build core constraints file: %s", e) + return None + + def _venv_pip_install(specs: tuple[str, ...], *, timeout: int = 300) -> _InstallResult: - """Install ``specs`` into the active venv using uv → pip → ensurepip ladder. + """Install ``specs`` using the uv → pip → ensurepip ladder. + + Two modes: + + * **Venv-scoped (default).** Installs into the active venv + (``sys.executable``). Used on normal installs. + * **Durable-target.** When :data:`_LAZY_TARGET_ENV` is set, installs into + that directory via ``--target`` and constrains shared deps to the + core venv's versions (see :func:`_core_constraints_file`). The target + is append-only on ``sys.path`` so it can never shadow core. Used by + the immutable Docker image to keep lazy installs off the sealed venv. Mirrors the strategy in ``hermes_cli.tools_config._pip_install`` but kept independent here so this module has no CLI dependency. 
@@ -370,56 +594,84 @@ def _venv_pip_install(specs: tuple[str, ...], *, timeout: int = 300) -> _Install if not specs: return _InstallResult(True, "", "") - venv_root = Path(sys.executable).parent.parent - uv_env = {**os.environ, "VIRTUAL_ENV": str(venv_root)} + target = _lazy_install_target() + constraints: Optional[Path] = None + + if target is not None: + err = _ensure_target_ready(target) + if err: + return _InstallResult(False, "", err) + constraints = _core_constraints_file() + + target_args: list[str] = [] + if target is not None: + # --target tells both uv and pip to install into an arbitrary dir. + target_args = ["--target", str(target)] + constraint_args: list[str] = [] + if constraints is not None: + constraint_args = ["--constraint", str(constraints)] + + try: + venv_root = Path(sys.executable).parent.parent + uv_env = {**os.environ, "VIRTUAL_ENV": str(venv_root)} + + # Tier 1: uv (preferred — fast, doesn't need pip in the venv) + uv_bin = shutil.which("uv") + if uv_bin: + try: + r = subprocess.run( + [uv_bin, "pip", "install", *target_args, *constraint_args, *specs], + capture_output=True, text=True, timeout=timeout, env=uv_env, + stdin=subprocess.DEVNULL, + ) + if r.returncode == 0: + if target is not None: + _activate_target_on_syspath(target) + return _InstallResult(True, r.stdout or "", r.stderr or "") + logger.debug("uv pip install failed: %s", r.stderr) + except (subprocess.TimeoutExpired, FileNotFoundError) as e: + logger.debug("uv invocation failed: %s", e) + + # Tier 2: python -m pip (with ensurepip bootstrap if needed) + pip_cmd = [sys.executable, "-m", "pip"] + try: + probe = subprocess.run( + pip_cmd + ["--version"], + capture_output=True, text=True, timeout=15, + stdin=subprocess.DEVNULL, + ) + if probe.returncode != 0: + raise FileNotFoundError("pip not in venv") + except (subprocess.TimeoutExpired, FileNotFoundError): + try: + subprocess.run( + [sys.executable, "-m", "ensurepip", "--upgrade", "--default-pip"], + capture_output=True, 
text=True, timeout=120, check=True, + stdin=subprocess.DEVNULL, + ) + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + return _InstallResult(False, "", + f"pip not available and ensurepip failed: {e}") - # Tier 1: uv (preferred — fast, doesn't need pip in the venv) - uv_bin = shutil.which("uv") - if uv_bin: try: r = subprocess.run( - [uv_bin, "pip", "install", *specs], - capture_output=True, text=True, timeout=timeout, env=uv_env, + pip_cmd + ["install", *target_args, *constraint_args, *specs], + capture_output=True, text=True, timeout=timeout, stdin=subprocess.DEVNULL, ) - if r.returncode == 0: - return _InstallResult(True, r.stdout or "", r.stderr or "") - logger.debug("uv pip install failed: %s", r.stderr) - except (subprocess.TimeoutExpired, FileNotFoundError) as e: - logger.debug("uv invocation failed: %s", e) - - # Tier 2: python -m pip (with ensurepip bootstrap if needed) - pip_cmd = [sys.executable, "-m", "pip"] - try: - probe = subprocess.run( - pip_cmd + ["--version"], - capture_output=True, text=True, timeout=15, - stdin=subprocess.DEVNULL, - ) - if probe.returncode != 0: - raise FileNotFoundError("pip not in venv") - except (subprocess.TimeoutExpired, FileNotFoundError): - try: - subprocess.run( - [sys.executable, "-m", "ensurepip", "--upgrade", "--default-pip"], - capture_output=True, text=True, timeout=120, check=True, - stdin=subprocess.DEVNULL, - ) - except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: - return _InstallResult(False, "", - f"pip not available and ensurepip failed: {e}") - - try: - r = subprocess.run( - pip_cmd + ["install", *specs], - capture_output=True, text=True, timeout=timeout, - stdin=subprocess.DEVNULL, - ) - return _InstallResult(r.returncode == 0, r.stdout or "", r.stderr or "") - except subprocess.TimeoutExpired as e: - return _InstallResult(False, "", f"pip install timed out: {e}") - except Exception as e: - return _InstallResult(False, "", f"pip install failed: {e}") + if 
r.returncode == 0 and target is not None: + _activate_target_on_syspath(target) + return _InstallResult(r.returncode == 0, r.stdout or "", r.stderr or "") + except subprocess.TimeoutExpired as e: + return _InstallResult(False, "", f"pip install timed out: {e}") + except Exception as e: + return _InstallResult(False, "", f"pip install failed: {e}") + finally: + if constraints is not None: + try: + constraints.unlink() + except OSError: + pass # =============================================================================