From d0b1ab48dc0c03adf40a7a83ff51f20b28770ad8 Mon Sep 17 00:00:00 2001 From: Ben Date: Sat, 23 May 2026 15:34:51 +1000 Subject: [PATCH] fix(container_boot): publish reconciled service dirs atomically PR #30136 review noted the asymmetry: `register_profile_gateway` used tmp_dir + rename to publish a new service slot atomically, but the boot-time reconciler wrote files into the slot directly. Same underlying concern (a concurrent s6-svscan rescan could observe a half-populated directory), different code path. Rewrite `container_boot._register_service` to mirror the manager: build everything in `/gateway-.tmp/`, then `Path.replace` into place. If a previous interrupted run left a `.tmp` sibling, it's cleaned up before the new build starts. If the target already exists, it's removed before the rename so `Path.replace` doesn't error on a non-empty target (Linux `rename` overwrites empty targets only). Three new tests: atomic publication leaves no .tmp leftovers, overwriting an existing slot still leaves no .tmp leftovers, and a stale .tmp from an interrupted run is cleaned up automatically. --- hermes_cli/container_boot.py | 76 ++++++++++++++++--------- tests/hermes_cli/test_container_boot.py | 75 ++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 26 deletions(-) diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py index 6013039dcb4..a40c72de361 100644 --- a/hermes_cli/container_boot.py +++ b/hermes_cli/container_boot.py @@ -180,7 +180,17 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None: directly because the cont-init.d phase runs as root before s6-svscan starts scanning the dynamic scandir — the manager's ``s6-svscanctl -a`` call would fail with no control socket. + + Atomicity: build the new layout in a sibling temp directory and + rename it into place via :meth:`Path.replace`. This matches + :meth:`S6ServiceManager.register_profile_gateway` (PR #30136 + review item O4) — even though cont-init.d runs before s6-svscan + starts scanning, an atomic publication keeps the contract uniform + between the two registration paths and protects against a + half-populated dir if the script is interrupted mid-write. """ + import shutil + from hermes_cli.service_manager import ( S6ServiceManager, validate_profile_name, @@ -188,36 +198,50 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None: validate_profile_name(profile) service_dir = scandir / f"gateway-{profile}" - service_dir.mkdir(parents=True, exist_ok=True) + tmp_dir = service_dir.with_name(service_dir.name + ".tmp") - (service_dir / "type").write_text("longrun\n") + # Wipe any leftover tmp from a previous interrupted run. + if tmp_dir.exists(): + shutil.rmtree(tmp_dir, ignore_errors=True) + tmp_dir.mkdir(parents=True) - # Reuse the manager's run-script rendering — single source of truth - # so register_profile_gateway and reconcile_profile_gateways stay - # consistent. extra_env is empty here; users who need per-profile - # env can set it via the profile's config.yaml (which the gateway - # itself loads). - run = service_dir / "run" - run.write_text(S6ServiceManager._render_run_script(profile, extra_env={})) - run.chmod(0o755) + try: + (tmp_dir / "type").write_text("longrun\n") - # Persistent log rotation (OQ8-C). - log_subdir = service_dir / "log" - log_subdir.mkdir(exist_ok=True) - log_run = log_subdir / "run" - log_run.write_text(S6ServiceManager._render_log_run(profile)) - log_run.chmod(0o755) + # Reuse the manager's run-script rendering — single source of + # truth so register_profile_gateway and reconcile_profile_gateways + # stay consistent. extra_env is empty here; users who need + # per-profile env can set it via the profile's config.yaml + # (which the gateway itself loads). + run = tmp_dir / "run" + run.write_text(S6ServiceManager._render_run_script(profile, extra_env={})) + run.chmod(0o755) - # The presence of a `down` file tells s6-supervise to NOT start - # the service when s6-svscan picks it up. User brings it up - # explicitly with `hermes -p gateway start` (which - # routes through the Phase 4 _dispatch_via_service_manager_if_s6 - # helper to `s6-svc -u`). - down_marker = service_dir / "down" - if start: - down_marker.unlink(missing_ok=True) - else: - down_marker.touch() + # Persistent log rotation (OQ8-C). + log_subdir = tmp_dir / "log" + log_subdir.mkdir() + log_run = log_subdir / "run" + log_run.write_text(S6ServiceManager._render_log_run(profile)) + log_run.chmod(0o755) + + # The presence of a `down` file tells s6-supervise to NOT + # start the service when s6-svscan picks it up. User brings + # it up explicitly with `hermes -p gateway start` + # (which routes through the Phase 4 + # _dispatch_via_service_manager_if_s6 helper to `s6-svc -u`). + if not start: + (tmp_dir / "down").touch() + + # Publish atomically. Path.replace handles the existing-target + # case the same way os.rename does on POSIX: the target is + # silently replaced, so a previous reconcile pass's slot is + # cleanly overwritten in one operation. + if service_dir.exists(): + shutil.rmtree(service_dir) + tmp_dir.replace(service_dir) + except Exception: + shutil.rmtree(tmp_dir, ignore_errors=True) + raise def _write_reconcile_log( diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py index 2f41f4f8e0f..58ad016f22e 100644 --- a/tests/hermes_cli/test_container_boot.py +++ b/tests/hermes_cli/test_container_boot.py @@ -354,6 +354,81 @@ def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None: ) +def test_register_service_publishes_atomically(tmp_path: Path) -> None: + """The reconciler should build the new service dir in a sibling + tmp directory and rename it into place — never leaving a half- + populated slot visible to a concurrent s6-svscan rescan. + + We verify the invariant indirectly: after a clean reconcile, the + target directory exists with all required files, and no sibling + .tmp leftovers remain. (Atomic publication is the only way to + achieve both with mkdir + write.) + """ + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # No leftover tmp dir. + leftover = list(scandir.glob("*.tmp")) + assert leftover == [], f"leftover tmp directories: {leftover}" + + # Target is fully populated. + svc = scandir / "gateway-coder" + assert (svc / "type").exists() + assert (svc / "run").exists() + assert (svc / "log" / "run").exists() + + +def test_register_service_overwrites_existing_slot(tmp_path: Path) -> None: + """A second reconciliation pass cleanly replaces an existing + slot (the tmp+rename publication overwrites the previous one).""" + scandir = tmp_path / "run-service"; scandir.mkdir() + profile = _make_profile(tmp_path, "coder", state="running") + + # First pass. + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + first_run = (scandir / "gateway-coder" / "run").read_text() + + # Mutate the profile state so the run-script changes (extra_env + # rendering would differ if we wired profile config through, but + # for now just exercise the overwrite path). + (profile / "gateway_state.json").write_text( + '{"gateway_state": "stopped"}', + ) + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # Slot still exists, no .tmp remnants. + assert (scandir / "gateway-coder" / "run").read_text() == first_run + assert list(scandir.glob("*.tmp")) == [] + # Down marker now present (state went from running → stopped). + assert (scandir / "gateway-coder" / "down").exists() + + +def test_register_service_cleans_up_stale_tmp_dir(tmp_path: Path) -> None: + """If a previous interrupted run left a .tmp sibling directory, + a fresh reconcile must clean it up rather than failing on mkdir.""" + scandir = tmp_path / "run-service"; scandir.mkdir() + # Simulate a leftover from an interrupted run. + stale_tmp = scandir / "gateway-coder.tmp" + stale_tmp.mkdir() + (stale_tmp / "stale-file").write_text("garbage") + + _make_profile(tmp_path, "coder", state="running") + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not stale_tmp.exists() + assert (scandir / "gateway-coder" / "run").exists() + + # --------------------------------------------------------------------------- # Default-profile slot — always registered (PR #30136 review item I1) # ---------------------------------------------------------------------------