mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(container_boot): publish reconciled service dirs atomically
PR #30136 review noted the asymmetry: `register_profile_gateway` used tmp_dir + rename to publish a new service slot atomically, but the boot-time reconciler wrote files into the slot directly. Same underlying concern (a concurrent s6-svscan rescan could observe a half-populated directory), different code path. Rewrite `container_boot._register_service` to mirror the manager: build everything in `<scandir>/gateway-<profile>.tmp/`, then `Path.replace` into place. If a previous interrupted run left a `.tmp` sibling, it's cleaned up before the new build starts. If the target already exists, it's removed before the rename so `Path.replace` doesn't error on a non-empty target (Linux `rename` overwrites empty targets only). Three new tests: atomic publication leaves no .tmp leftovers, overwriting an existing slot still leaves no .tmp leftovers, and a stale .tmp from an interrupted run is cleaned up automatically.
This commit is contained in:
parent
4443fb481d
commit
d0b1ab48dc
2 changed files with 125 additions and 26 deletions
|
|
@ -180,7 +180,17 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None:
|
|||
directly because the cont-init.d phase runs as root before
|
||||
s6-svscan starts scanning the dynamic scandir — the manager's
|
||||
``s6-svscanctl -a`` call would fail with no control socket.
|
||||
|
||||
Atomicity: build the new layout in a sibling temp directory and
|
||||
rename it into place via :meth:`Path.replace`. This matches
|
||||
:meth:`S6ServiceManager.register_profile_gateway` (PR #30136
|
||||
review item O4) — even though cont-init.d runs before s6-svscan
|
||||
starts scanning, an atomic publication keeps the contract uniform
|
||||
between the two registration paths and protects against a
|
||||
half-populated dir if the script is interrupted mid-write.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
from hermes_cli.service_manager import (
|
||||
S6ServiceManager,
|
||||
validate_profile_name,
|
||||
|
|
@ -188,36 +198,50 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None:
|
|||
|
||||
validate_profile_name(profile)
|
||||
service_dir = scandir / f"gateway-{profile}"
|
||||
service_dir.mkdir(parents=True, exist_ok=True)
|
||||
tmp_dir = service_dir.with_name(service_dir.name + ".tmp")
|
||||
|
||||
(service_dir / "type").write_text("longrun\n")
|
||||
# Wipe any leftover tmp from a previous interrupted run.
|
||||
if tmp_dir.exists():
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
tmp_dir.mkdir(parents=True)
|
||||
|
||||
# Reuse the manager's run-script rendering — single source of truth
|
||||
# so register_profile_gateway and reconcile_profile_gateways stay
|
||||
# consistent. extra_env is empty here; users who need per-profile
|
||||
# env can set it via the profile's config.yaml (which the gateway
|
||||
# itself loads).
|
||||
run = service_dir / "run"
|
||||
run.write_text(S6ServiceManager._render_run_script(profile, extra_env={}))
|
||||
run.chmod(0o755)
|
||||
try:
|
||||
(tmp_dir / "type").write_text("longrun\n")
|
||||
|
||||
# Persistent log rotation (OQ8-C).
|
||||
log_subdir = service_dir / "log"
|
||||
log_subdir.mkdir(exist_ok=True)
|
||||
log_run = log_subdir / "run"
|
||||
log_run.write_text(S6ServiceManager._render_log_run(profile))
|
||||
log_run.chmod(0o755)
|
||||
# Reuse the manager's run-script rendering — single source of
|
||||
# truth so register_profile_gateway and reconcile_profile_gateways
|
||||
# stay consistent. extra_env is empty here; users who need
|
||||
# per-profile env can set it via the profile's config.yaml
|
||||
# (which the gateway itself loads).
|
||||
run = tmp_dir / "run"
|
||||
run.write_text(S6ServiceManager._render_run_script(profile, extra_env={}))
|
||||
run.chmod(0o755)
|
||||
|
||||
# The presence of a `down` file tells s6-supervise to NOT start
|
||||
# the service when s6-svscan picks it up. User brings it up
|
||||
# explicitly with `hermes -p <profile> gateway start` (which
|
||||
# routes through the Phase 4 _dispatch_via_service_manager_if_s6
|
||||
# helper to `s6-svc -u`).
|
||||
down_marker = service_dir / "down"
|
||||
if start:
|
||||
down_marker.unlink(missing_ok=True)
|
||||
else:
|
||||
down_marker.touch()
|
||||
# Persistent log rotation (OQ8-C).
|
||||
log_subdir = tmp_dir / "log"
|
||||
log_subdir.mkdir()
|
||||
log_run = log_subdir / "run"
|
||||
log_run.write_text(S6ServiceManager._render_log_run(profile))
|
||||
log_run.chmod(0o755)
|
||||
|
||||
# The presence of a `down` file tells s6-supervise to NOT
|
||||
# start the service when s6-svscan picks it up. User brings
|
||||
# it up explicitly with `hermes -p <profile> gateway start`
|
||||
# (which routes through the Phase 4
|
||||
# _dispatch_via_service_manager_if_s6 helper to `s6-svc -u`).
|
||||
if not start:
|
||||
(tmp_dir / "down").touch()
|
||||
|
||||
# Publish atomically. Path.replace handles the existing-target
|
||||
# case the same way os.rename does on POSIX: the target is
|
||||
# silently replaced, so a previous reconcile pass's slot is
|
||||
# cleanly overwritten in one operation.
|
||||
if service_dir.exists():
|
||||
shutil.rmtree(service_dir)
|
||||
tmp_dir.replace(service_dir)
|
||||
except Exception:
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
raise
|
||||
|
||||
|
||||
def _write_reconcile_log(
|
||||
|
|
|
|||
|
|
@ -354,6 +354,81 @@ def test_invalid_profile_name_in_directory_raises(tmp_path: Path) -> None:
|
|||
)
|
||||
|
||||
|
||||
def test_register_service_publishes_atomically(tmp_path: Path) -> None:
|
||||
"""The reconciler should build the new service dir in a sibling
|
||||
tmp directory and rename it into place — never leaving a half-
|
||||
populated slot visible to a concurrent s6-svscan rescan.
|
||||
|
||||
We verify the invariant indirectly: after a clean reconcile, the
|
||||
target directory exists with all required files, and no sibling
|
||||
.tmp leftovers remain. (Atomic publication is the only way to
|
||||
achieve both with mkdir + write.)
|
||||
"""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
_make_profile(tmp_path, "coder", state="running")
|
||||
|
||||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
# No leftover tmp dir.
|
||||
leftover = list(scandir.glob("*.tmp"))
|
||||
assert leftover == [], f"leftover tmp directories: {leftover}"
|
||||
|
||||
# Target is fully populated.
|
||||
svc = scandir / "gateway-coder"
|
||||
assert (svc / "type").exists()
|
||||
assert (svc / "run").exists()
|
||||
assert (svc / "log" / "run").exists()
|
||||
|
||||
|
||||
def test_register_service_overwrites_existing_slot(tmp_path: Path) -> None:
|
||||
"""A second reconciliation pass cleanly replaces an existing
|
||||
slot (the tmp+rename publication overwrites the previous one)."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
profile = _make_profile(tmp_path, "coder", state="running")
|
||||
|
||||
# First pass.
|
||||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
first_run = (scandir / "gateway-coder" / "run").read_text()
|
||||
|
||||
# Mutate the profile state so the run-script changes (extra_env
|
||||
# rendering would differ if we wired profile config through, but
|
||||
# for now just exercise the overwrite path).
|
||||
(profile / "gateway_state.json").write_text(
|
||||
'{"gateway_state": "stopped"}',
|
||||
)
|
||||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
# Slot still exists, no .tmp remnants.
|
||||
assert (scandir / "gateway-coder" / "run").read_text() == first_run
|
||||
assert list(scandir.glob("*.tmp")) == []
|
||||
# Down marker now present (state went from running → stopped).
|
||||
assert (scandir / "gateway-coder" / "down").exists()
|
||||
|
||||
|
||||
def test_register_service_cleans_up_stale_tmp_dir(tmp_path: Path) -> None:
|
||||
"""If a previous interrupted run left a .tmp sibling directory,
|
||||
a fresh reconcile must clean it up rather than failing on mkdir."""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
# Simulate a leftover from an interrupted run.
|
||||
stale_tmp = scandir / "gateway-coder.tmp"
|
||||
stale_tmp.mkdir()
|
||||
(stale_tmp / "stale-file").write_text("garbage")
|
||||
|
||||
_make_profile(tmp_path, "coder", state="running")
|
||||
reconcile_profile_gateways(
|
||||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
assert not stale_tmp.exists()
|
||||
assert (scandir / "gateway-coder" / "run").exists()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default-profile slot — always registered (PR #30136 review item I1)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue