mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
fix(s6): dot-prefix gateway staging dir so svscan ignores it mid-build (#54834)
Some checks are pending
CI / Detect affected areas (push) Waiting to run
CI / Python tests (push) Blocked by required conditions
CI / Python lints (push) Blocked by required conditions
CI / TypeScript (push) Blocked by required conditions
CI / Docs Site (push) Blocked by required conditions
CI / Deny unrelated histories (push) Blocked by required conditions
CI / Check contributors (push) Blocked by required conditions
CI / Check uv.lock (push) Blocked by required conditions
CI / Lint Docker scripts (push) Blocked by required conditions
CI / Build&Test Docker image (push) Blocked by required conditions
CI / Supply-chain scan (push) Blocked by required conditions
CI / OSV scan (push) Waiting to run
CI / All required checks pass (push) Blocked by required conditions
Deploy Site / deploy-vercel (push) Waiting to run
Deploy Site / deploy-docs (push) Waiting to run
Some checks are pending
CI / Detect affected areas (push) Waiting to run
CI / Python tests (push) Blocked by required conditions
CI / Python lints (push) Blocked by required conditions
CI / TypeScript (push) Blocked by required conditions
CI / Docs Site (push) Blocked by required conditions
CI / Deny unrelated histories (push) Blocked by required conditions
CI / Check contributors (push) Blocked by required conditions
CI / Check uv.lock (push) Blocked by required conditions
CI / Lint Docker scripts (push) Blocked by required conditions
CI / Build&Test Docker image (push) Blocked by required conditions
CI / Supply-chain scan (push) Blocked by required conditions
CI / OSV scan (push) Waiting to run
CI / All required checks pass (push) Blocked by required conditions
Deploy Site / deploy-vercel (push) Waiting to run
Deploy Site / deploy-docs (push) Waiting to run
The register path builds each profile-gateway slot in a sibling staging dir under /run/service (the scandir s6-svscan watches), then atomically renames it to the live gateway-<profile> name. The staging dir was named gateway-<profile>.tmp — a NON-dotfile — so a concurrent `s6-svscanctl -a` rescan (fired by the cont-init reconciler registering gateway-default, or by a sibling register) would supervise the half-built slot the moment it had a valid type/run: s6-supervise spawns AS ROOT and mkdirs supervise/ root-owned 0700, then the in-flight _seed_supervise_skeleton early-returns on the now-existing supervise/ and the next `mkdir supervise/event` hits PermissionError. That is the arm64-only CI flake on test_s6_unregister_removes_service_dir_in_live_container (PermissionError: /run/service/gateway-phase3test.tmp/supervise/event) — arm64-only because the native-arm runner's wider scheduling jitter lets the rescan land inside the ~ms seed window; amd64 ran 30/30 clean. Fix: dot-prefix the staging dir (.gateway-<profile>.tmp) in both register paths (S6ServiceManager.register_profile_gateway and container_boot._register_service). s6-svscan skips any scandir entry whose name begins with '.', so the half-built slot can never be supervised mid-build. The atomic rename to the dotless live name is unchanged. Verified on a real s6 image (amd64): a non-dotted staging dir is picked up by an svscanctl -a rescan (SUPERVISED owner=root) while a dot-prefixed one is ignored (NOT-SUPERVISED). Added a docker-harness regression test that asserts both, plus a unit test that the staging dir is dot-prefixed.
This commit is contained in:
parent
dbad6d47d3
commit
f53ba9bb54
5 changed files with 175 additions and 9 deletions
|
|
@ -379,7 +379,16 @@ def _register_service(scandir: Path, profile: str, *, start: bool) -> None:
|
|||
|
||||
validate_profile_name(profile)
|
||||
service_dir = scandir / f"gateway-{profile}"
|
||||
tmp_dir = service_dir.with_name(service_dir.name + ".tmp")
|
||||
# Dot-prefix the staging dir so s6-svscan skips it while half-built
|
||||
# (s6-svscan ignores scandir entries whose name starts with ".").
|
||||
# A non-dotted ``.tmp`` staging name is supervised AS ROOT by any
|
||||
# concurrent ``s6-svscanctl -a`` rescan the moment it has a valid
|
||||
# ``type``/``run``, creating a root-owned ``supervise/`` that makes
|
||||
# ``_seed_supervise_skeleton`` EACCES — see the matching comment in
|
||||
# ``S6ServiceManager.register_profile_gateway``. The atomic
|
||||
# ``tmp_dir.replace(service_dir)`` below renames to the dotless live
|
||||
# name, so the published slot is unchanged.
|
||||
tmp_dir = service_dir.with_name("." + service_dir.name + ".tmp")
|
||||
|
||||
# Wipe any leftover tmp from a previous interrupted run.
|
||||
if tmp_dir.exists():
|
||||
|
|
|
|||
|
|
@ -965,9 +965,23 @@ class S6ServiceManager:
|
|||
)
|
||||
|
||||
# Build the service directory atomically: write to a sibling
|
||||
# temp dir, then rename. Avoids s6-svscan observing a half-
|
||||
# populated directory on a fast rescan.
|
||||
tmp_dir = svc_dir.with_name(svc_dir.name + ".tmp")
|
||||
# temp dir, then rename. The staging name is DOT-PREFIXED
|
||||
# (``.gateway-<profile>.tmp``) so s6-svscan ignores it while it
|
||||
# is half-built: s6-svscan skips any scandir entry whose name
|
||||
# begins with ``.``. Without the dot prefix, a concurrent
|
||||
# ``s6-svscanctl -a`` rescan (fired by the cont-init reconciler
|
||||
# registering ``gateway-default``, or by a sibling register)
|
||||
# would supervise the still-being-seeded ``.tmp`` slot: it has a
|
||||
# valid ``type``/``run`` by that point, so s6-supervise spawns
|
||||
# AS ROOT and mkdir's ``supervise/`` root-owned 0700 — then this
|
||||
# process's ``_seed_supervise_skeleton`` early-returns on the now-
|
||||
# existing ``supervise/`` and the next ``mkdir supervise/event``
|
||||
# hits EACCES. That is the arm64-only CI flake on
|
||||
# test_s6_unregister_removes_service_dir_in_live_container
|
||||
# (the wider scheduling jitter on the native arm64 runner lets the
|
||||
# rescan land inside the ~ms seed window). The atomic rename to
|
||||
# the dotless live name below is unaffected.
|
||||
tmp_dir = svc_dir.with_name("." + svc_dir.name + ".tmp")
|
||||
if tmp_dir.exists():
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
tmp_dir.mkdir(parents=True)
|
||||
|
|
|
|||
|
|
@ -109,3 +109,93 @@ def test_s6_unregister_removes_service_dir_in_live_container(
|
|||
"print(S6ServiceManager().list_profile_gateways())"
|
||||
))
|
||||
assert "phase3test" not in r.stdout
|
||||
|
||||
|
||||
# Shell probe: build a service-shaped staging dir under the live scandir
|
||||
# with a given NAME, fire a real `s6-svscanctl -a` rescan, wait, and
|
||||
# report whether s6-svscan supervised it (which would create a root-owned
|
||||
# supervise/ dir). Used to prove the dot-prefixed staging name is INVISIBLE
|
||||
# to a concurrent rescan while a non-dotted one is not.
|
||||
#
|
||||
# Echoes one of: SUPERVISED / NOT-SUPERVISED, plus the supervise/ owner.
|
||||
_SVSCAN_PICKUP_PROBE = r"""
|
||||
set -eu
|
||||
NAME="$1"
|
||||
SCANDIR=/run/service
|
||||
DIR="$SCANDIR/$NAME"
|
||||
rm -rf "$DIR"
|
||||
mkdir -p "$DIR"
|
||||
printf 'longrun\n' > "$DIR/type"
|
||||
printf '#!/command/execlineb -P\n/command/s6-sleep 600\n' > "$DIR/run"
|
||||
chmod 755 "$DIR/run"
|
||||
# Trigger a full rescan, exactly as register/reconcile do.
|
||||
/command/s6-svscanctl -a "$SCANDIR"
|
||||
# Give s6-svscan time to act (its scan is async; 200ms is the manager's
|
||||
# own settle delay, use 2s here to be comfortably past it on any arch).
|
||||
/command/s6-sleep 2
|
||||
if [ -d "$DIR/supervise" ]; then
|
||||
owner=$(stat -c '%U' "$DIR/supervise" 2>/dev/null || echo '?')
|
||||
echo "SUPERVISED owner=$owner"
|
||||
else
|
||||
echo "NOT-SUPERVISED"
|
||||
fi
|
||||
# Best-effort teardown so the probe leaves no live supervisor behind.
|
||||
/command/s6-svc -d "$DIR" 2>/dev/null || true
|
||||
/command/s6-svscanctl -an "$SCANDIR" 2>/dev/null || true
|
||||
/command/s6-sleep 1
|
||||
rm -rf "$DIR" 2>/dev/null || true
|
||||
"""
|
||||
|
||||
|
||||
def test_s6_dotfile_staging_dir_is_ignored_by_svscan_rescan(
|
||||
built_image: str, container_name: str,
|
||||
) -> None:
|
||||
"""Regression for the arm64 register-seed race.
|
||||
|
||||
The register path builds the slot in a sibling staging dir and then
|
||||
atomically renames it to the live ``gateway-<profile>`` name. That
|
||||
staging dir lives INSIDE the scandir s6-svscan watches, so its NAME
|
||||
decides whether a concurrent ``s6-svscanctl -a`` rescan (fired by the
|
||||
cont-init reconciler registering ``gateway-default``, or by another
|
||||
register) supervises the half-built slot.
|
||||
|
||||
- A NON-dotted name (the old ``gateway-<p>.tmp``) IS picked up: once it
|
||||
has a valid ``type``/``run``, s6-svscan spawns ``s6-supervise`` AS
|
||||
ROOT, creating a root-owned ``supervise/`` — which makes the in-flight
|
||||
``_seed_supervise_skeleton`` EACCES on ``mkdir supervise/event``. That
|
||||
is the arm64-only flake (the native-arm runner's wider scheduling
|
||||
jitter lets the rescan land inside the seed window).
|
||||
- A DOT-prefixed name (the fix, ``.gateway-<p>.tmp``) is SKIPPED by
|
||||
s6-svscan and never supervised, so no root-owned ``supervise/`` can
|
||||
appear under the staging dir.
|
||||
|
||||
This proves the mechanism directly and is arch-independent (it does not
|
||||
rely on hitting the narrow timing window — it forces the rescan and
|
||||
checks pickup), so it guards the fix on the amd64 job too.
|
||||
"""
|
||||
start_container(built_image, container_name, cmd="sleep 120")
|
||||
|
||||
# Control: a NON-dotted service-shaped dir IS supervised by the rescan
|
||||
# (root-owned supervise/). This is the pre-fix staging-name behaviour and
|
||||
# confirms the probe actually exercises s6-svscan pickup.
|
||||
r = docker_exec(
|
||||
container_name, "sh", "-c", _SVSCAN_PICKUP_PROBE, "probe",
|
||||
"gateway-raceprobe.tmp", user="root", timeout=30,
|
||||
)
|
||||
assert "SUPERVISED" in r.stdout and "NOT-SUPERVISED" not in r.stdout, (
|
||||
"control failed: a non-dotted staging dir should be picked up by "
|
||||
f"s6-svscan. stdout={r.stdout!r} stderr={r.stderr!r}"
|
||||
)
|
||||
|
||||
# The fix: a DOT-prefixed staging dir (the name register/reconcile now
|
||||
# use) must be IGNORED by the same rescan — no supervisor, no root-owned
|
||||
# supervise/, so the in-flight seed can never EACCES.
|
||||
r = docker_exec(
|
||||
container_name, "sh", "-c", _SVSCAN_PICKUP_PROBE, "probe",
|
||||
".gateway-raceprobe.tmp", user="root", timeout=30,
|
||||
)
|
||||
assert "NOT-SUPERVISED" in r.stdout, (
|
||||
"dot-prefixed staging dir was supervised by s6-svscan — the race "
|
||||
f"that EACCESes the seed is still reachable. stdout={r.stdout!r} "
|
||||
f"stderr={r.stderr!r}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -489,19 +489,26 @@ def test_register_service_overwrites_existing_slot(tmp_path: Path) -> None:
|
|||
hermes_home=tmp_path, scandir=scandir, dry_run=False,
|
||||
)
|
||||
|
||||
# Slot still exists, no .tmp remnants.
|
||||
# Slot still exists, no .tmp remnants (staging dir is dot-prefixed,
|
||||
# so match it explicitly — a leading-`*` glob won't catch dotfiles).
|
||||
assert (scandir / "gateway-coder" / "run").read_text() == first_run
|
||||
assert list(scandir.glob("*.tmp")) == []
|
||||
assert list(scandir.glob(".*.tmp")) == []
|
||||
# Down marker now present (state went from running → stopped).
|
||||
assert (scandir / "gateway-coder" / "down").exists()
|
||||
|
||||
|
||||
def test_register_service_cleans_up_stale_tmp_dir(tmp_path: Path) -> None:
|
||||
"""If a previous interrupted run left a .tmp sibling directory,
|
||||
a fresh reconcile must clean it up rather than failing on mkdir."""
|
||||
"""If a previous interrupted run left a staging sibling directory,
|
||||
a fresh reconcile must clean it up rather than failing on mkdir.
|
||||
|
||||
The staging dir is dot-prefixed (``.gateway-<profile>.tmp``) so a
|
||||
concurrent s6-svscan rescan can't supervise it half-built; the
|
||||
cleanup must target that same dot-prefixed name.
|
||||
"""
|
||||
scandir = tmp_path / "run-service"; scandir.mkdir()
|
||||
# Simulate a leftover from an interrupted run.
|
||||
stale_tmp = scandir / "gateway-coder.tmp"
|
||||
# Simulate a leftover from an interrupted run (current staging name).
|
||||
stale_tmp = scandir / ".gateway-coder.tmp"
|
||||
stale_tmp.mkdir()
|
||||
(stale_tmp / "stale-file").write_text("garbage")
|
||||
|
||||
|
|
|
|||
|
|
@ -586,6 +586,52 @@ def test_s6_register_creates_service_dir_and_triggers_scan(
|
|||
), f"s6-svscanctl -a not invoked; saw: {fake_subprocess_run}"
|
||||
|
||||
|
||||
def test_s6_register_staging_dir_is_dotfile_hidden_from_svscan(
|
||||
s6_scandir, fake_subprocess_run, monkeypatch: pytest.MonkeyPatch,
|
||||
) -> None:
|
||||
"""The mid-build staging dir MUST be dot-prefixed so s6-svscan
|
||||
ignores it while it is half-populated.
|
||||
|
||||
s6-svscan skips any scandir entry whose name begins with ``.``. If
|
||||
the staging dir were a plain ``gateway-<p>.tmp`` (a non-dotfile),
|
||||
a concurrent ``s6-svscanctl -a`` rescan would supervise it the
|
||||
moment it has a valid ``type``/``run`` — spawning s6-supervise AS
|
||||
ROOT, which mkdir's a root-owned ``supervise/`` and makes the
|
||||
in-flight ``_seed_supervise_skeleton`` EACCES on
|
||||
``mkdir supervise/event``. That was the arm64-only CI flake on
|
||||
``test_s6_unregister_removes_service_dir_in_live_container``.
|
||||
|
||||
We capture the directory passed to ``_seed_supervise_skeleton``
|
||||
(called mid-build, BEFORE the atomic rename to the live name) and
|
||||
assert its basename starts with ``.`` and still lives in the
|
||||
scandir as a sibling of the live slot.
|
||||
"""
|
||||
import hermes_cli.service_manager as sm
|
||||
|
||||
seen: list[str] = []
|
||||
real_seed = sm._seed_supervise_skeleton
|
||||
|
||||
def _capturing_seed(svc_dir, *a, **kw):
|
||||
seen.append(str(svc_dir))
|
||||
return real_seed(svc_dir, *a, **kw)
|
||||
|
||||
monkeypatch.setattr(sm, "_seed_supervise_skeleton", _capturing_seed)
|
||||
|
||||
S6ServiceManager(scandir=s6_scandir).register_profile_gateway("coder")
|
||||
|
||||
assert seen, "_seed_supervise_skeleton was never called during register"
|
||||
staging = seen[0]
|
||||
staging_name = staging.rsplit("/", 1)[-1]
|
||||
assert staging_name.startswith("."), (
|
||||
f"staging dir must be a dotfile so s6-svscan skips it mid-build; "
|
||||
f"got {staging_name!r}"
|
||||
)
|
||||
# Sibling of the live slot, in the same scandir.
|
||||
assert staging == str(s6_scandir / ".gateway-coder.tmp")
|
||||
# And the published (renamed) live slot is the dotless canonical name.
|
||||
assert (s6_scandir / "gateway-coder").is_dir()
|
||||
|
||||
|
||||
def test_s6_register_start_now_false_writes_down_marker(
|
||||
s6_scandir, fake_subprocess_run,
|
||||
) -> None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue