fix(container_boot): rotate container-boot.log when it exceeds 256 KiB

PR #30136 review noted: container-boot.log was append-only with no
rotation. On a long-lived container with frequent restarts and
many profiles it would grow unboundedly (~80 B per profile per
reconcile pass).

Add a soft cap: when the file size hits 256 KiB (`_LOG_ROTATE_BYTES`,
≈3000 reconcile lines, ≈1 year of daily reboots × 5 profiles), the
current file is renamed to `container-boot.log.1` (replacing any
existing one) before new entries are appended. Worst case is two
files at ~512 KiB — well within visibility limits for grep/cat.

Rotation is intentionally simple (no logrotate or s6-log machinery
for one append-only file). Failures during rotation are logged via
the module logger and treated as non-fatal — we keep appending to
the existing file rather than dropping the reconcile entry. Three
new unit tests cover above-threshold rotation, below-threshold
non-rotation, and overwrite of an existing .1 file.
This commit is contained in:
Ben 2026-05-23 15:33:11 +10:00 committed by teknium1
parent 9914bfc594
commit 4443fb481d
No known key found for this signature in database
2 changed files with 111 additions and 1 deletions

View file

@ -229,12 +229,32 @@ def _write_reconcile_log(
up". Keeping a separate log file (vs. mixing into agent.log) lets
troubleshooters grep for "profile=foo" without wading through
unrelated activity.
Size-bounded: when the file has reached ``_LOG_ROTATE_BYTES``
(defaults to 256 KiB ≈ 3000 reconcile lines), the current file
is renamed to ``container-boot.log.1`` (replacing any previous
rotation) before the new entries are appended. This gives long-
lived containers a soft cap of ~512 KiB across the two files
without pulling in logrotate or s6-log machinery just for this
one append-only file (PR #30136 review item O3).
"""
import time
log_dir = hermes_home / "logs"
log_dir.mkdir(parents=True, exist_ok=True)
log_path = log_dir / "container-boot.log"
# Rotate before opening to append, so the new entries land in a
# fresh file once an earlier pass has pushed the log past the cap.
try:
if log_path.exists() and log_path.stat().st_size >= _LOG_ROTATE_BYTES:
log_path.replace(log_dir / "container-boot.log.1")
except OSError as exc:
# Rotation failure is non-fatal — keep appending to the
# existing file rather than losing the entry entirely.
log.warning("could not rotate %s: %s", log_path, exc)
ts = time.strftime("%Y-%m-%dT%H:%M:%S%z")
with (log_dir / "container-boot.log").open("a", encoding="utf-8") as f:
with log_path.open("a", encoding="utf-8") as f:
for a in actions:
f.write(
f"{ts} profile={a.profile} prior_state={a.prior_state} "
@ -242,6 +262,14 @@ def _write_reconcile_log(
)
# Soft cap (in bytes) on container-boot.log: 256 KiB. The size is checked
# once per write, before appending; a file at or above this size is renamed
# to container-boot.log.1 first, so the active file can overshoot the cap
# by one pass's worth of entries.
# At ~80 B per reconcile-action line this is ~3000 lines, i.e. more than a
# year of daily reboots on a 5-profile container. Two files = ~512 KiB
# worst case. Tuned for visibility (small enough to grep / cat without
# scrolling forever) more than space (the persistent volume has GB).
_LOG_ROTATE_BYTES = 256 * 1024
def main() -> int:
"""Entry point invoked from /etc/cont-init.d/02-reconcile-profiles."""
hermes_home = Path(os.environ.get("HERMES_HOME", "/opt/data"))

View file

@ -223,6 +223,88 @@ def test_reconcile_log_is_written(tmp_path: Path) -> None:
assert "action=registered" in log
def test_reconcile_log_rotates_when_size_exceeded(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An over-threshold container-boot.log is moved to .1 before appending.

    Seeds a log that is already larger than the (patched) rotation
    threshold, runs one reconcile pass, and checks that the old content
    ended up in ``container-boot.log.1`` while the new entries landed in
    a fresh ``container-boot.log``.
    """
    from hermes_cli import container_boot

    # A 200-byte cap lets a 300-byte seed trip rotation without having
    # to write a full 256 KiB fixture.
    monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200)

    logs_dir = tmp_path / "logs"
    active_log = logs_dir / "container-boot.log"
    logs_dir.mkdir()
    filler = "X" * 300
    active_log.write_text(filler)  # already over the threshold

    service_dir = tmp_path / "run-service"
    service_dir.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_dir,
        dry_run=False,
    )

    archived_log = logs_dir / "container-boot.log.1"
    assert archived_log.exists(), "expected previous log to be rotated to .1"
    assert archived_log.read_text().startswith(filler)

    # The active log was recreated: only new entries, none of the filler.
    fresh_text = active_log.read_text()
    assert "X" not in fresh_text
    assert "profile=coder" in fresh_text
def test_reconcile_log_does_not_rotate_below_threshold(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A log below the threshold is appended to in place, with no .1 file.

    Patches the rotation threshold far above the seeded file's size,
    runs one reconcile pass, and checks that the earlier content is
    still at the start of the log with the new entry after it.
    """
    from hermes_cli import container_boot

    monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 10_000_000)

    logs_dir = tmp_path / "logs"
    active_log = logs_dir / "container-boot.log"
    logs_dir.mkdir()
    seed_line = "previous entry\n"
    active_log.write_text(seed_line)

    service_dir = tmp_path / "run-service"
    service_dir.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_dir,
        dry_run=False,
    )

    archived_log = logs_dir / "container-boot.log.1"
    assert not archived_log.exists()

    merged_text = active_log.read_text()
    assert merged_text.startswith(seed_line)
    assert "profile=coder" in merged_text
def test_reconcile_log_rotation_overwrites_existing_dot1(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A second rotation replaces the earlier .1 file.

    At most one rotated file is kept (soft cap of ~2 × threshold), so
    an existing ``container-boot.log.1`` must be overwritten by the
    current log's content rather than preserved.
    """
    from hermes_cli import container_boot

    monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200)

    logs_dir = tmp_path / "logs"
    logs_dir.mkdir()
    archived_log = logs_dir / "container-boot.log.1"
    active_log = logs_dir / "container-boot.log"
    filler = "Y" * 300
    archived_log.write_text("OLD ROTATION")
    active_log.write_text(filler)

    service_dir = tmp_path / "run-service"
    service_dir.mkdir()
    _make_profile(tmp_path, "coder", state="running")

    reconcile_profile_gateways(
        hermes_home=tmp_path,
        scandir=service_dir,
        dry_run=False,
    )

    # .1 now holds what was in the active log (the Ys), not OLD ROTATION.
    archived_text = archived_log.read_text()
    assert "OLD ROTATION" not in archived_text
    assert archived_text.startswith(filler)
def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None:
scandir = tmp_path / "run-service"; scandir.mkdir()
profile = _make_profile(tmp_path, "coder", state="running", with_pid=True)