diff --git a/hermes_cli/container_boot.py b/hermes_cli/container_boot.py index 66f8f51766e..6013039dcb4 100644 --- a/hermes_cli/container_boot.py +++ b/hermes_cli/container_boot.py @@ -229,12 +229,32 @@ def _write_reconcile_log( up". Keeping a separate log file (vs. mixing into agent.log) lets troubleshooters grep for "profile=foo" without wading through unrelated activity. + + Size-bounded: when the file reaches or exceeds ``_LOG_ROTATE_BYTES`` + (defaults to 256 KiB ≈ 3000 reconcile lines), the current file + is renamed to ``container-boot.log.1`` (replacing any previous + rotation) before the new entries are appended. This gives long- + lived containers a soft cap of ~512 KiB across the two files + without pulling in logrotate or s6-log machinery just for this + one append-only file (PR #30136 review item O3). """ import time log_dir = hermes_home / "logs" log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / "container-boot.log" + + # Rotate before opening to append, so the new entries always land + # in a fresh file when we crossed the threshold last time. + try: + if log_path.exists() and log_path.stat().st_size >= _LOG_ROTATE_BYTES: + log_path.replace(log_dir / "container-boot.log.1") + except OSError as exc: + # Rotation failure is non-fatal — keep appending to the + # existing file rather than losing the entry entirely. + log.warning("could not rotate %s: %s", log_path, exc) + ts = time.strftime("%Y-%m-%dT%H:%M:%S%z") - with (log_dir / "container-boot.log").open("a", encoding="utf-8") as f: + with log_path.open("a", encoding="utf-8") as f: for a in actions: f.write( f"{ts} profile={a.profile} prior_state={a.prior_state} " @@ -242,6 +262,14 @@ def _write_reconcile_log( ) +# 256 KiB soft cap on container-boot.log; rotated to .1 when crossed. +# At ~80 B per reconcile-action line this is ~3000 lines, or about a +# year of daily reboots on a 5-profile container. Two files = ~512 KiB +# worst case.
Tuned for visibility (small enough to grep / cat without +# scrolling forever) more than space (the persistent volume has GB). +_LOG_ROTATE_BYTES = 256 * 1024 + + def main() -> int: """Entry point invoked from /etc/cont-init.d/02-reconcile-profiles.""" hermes_home = Path(os.environ.get("HERMES_HOME", "/opt/data")) diff --git a/tests/hermes_cli/test_container_boot.py b/tests/hermes_cli/test_container_boot.py index 8272c090448..2f41f4f8e0f 100644 --- a/tests/hermes_cli/test_container_boot.py +++ b/tests/hermes_cli/test_container_boot.py @@ -223,6 +223,88 @@ def test_reconcile_log_is_written(tmp_path: Path) -> None: assert "action=registered" in log +def test_reconcile_log_rotates_when_size_exceeded( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When container-boot.log exceeds _LOG_ROTATE_BYTES, the existing + file is rotated to .1 before the new entries are appended.""" + from hermes_cli import container_boot + + # Tighten the threshold so we don't have to write 256 KiB. + monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200) + + log_path = tmp_path / "logs" / "container-boot.log" + log_path.parent.mkdir() + log_path.write_text("X" * 300) # already over the threshold + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + rotated = tmp_path / "logs" / "container-boot.log.1" + assert rotated.exists(), "expected previous log to be rotated to .1" + assert rotated.read_text().startswith("X" * 300) + # The new entries land in a fresh container-boot.log (no leftover Xs). 
+ new_contents = log_path.read_text() + assert "X" not in new_contents + assert "profile=coder" in new_contents + + +def test_reconcile_log_does_not_rotate_below_threshold( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """A small existing log is appended to in place; no .1 is created.""" + from hermes_cli import container_boot + monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 10_000_000) + + log_path = tmp_path / "logs" / "container-boot.log" + log_path.parent.mkdir() + log_path.write_text("previous entry\n") + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + assert not (tmp_path / "logs" / "container-boot.log.1").exists() + contents = log_path.read_text() + assert contents.startswith("previous entry\n") + assert "profile=coder" in contents + + +def test_reconcile_log_rotation_overwrites_existing_dot1( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Rotating again replaces the prior .1 — we keep at most one + rotated file (soft cap of ~2 × threshold).""" + from hermes_cli import container_boot + monkeypatch.setattr(container_boot, "_LOG_ROTATE_BYTES", 200) + + log_dir = tmp_path / "logs"; log_dir.mkdir() + (log_dir / "container-boot.log.1").write_text("OLD ROTATION") + (log_dir / "container-boot.log").write_text("Y" * 300) + + scandir = tmp_path / "run-service"; scandir.mkdir() + _make_profile(tmp_path, "coder", state="running") + + reconcile_profile_gateways( + hermes_home=tmp_path, scandir=scandir, dry_run=False, + ) + + # .1 now contains the previous .log (Ys), not OLD ROTATION. 
+ rotated = (log_dir / "container-boot.log.1").read_text() + assert "OLD ROTATION" not in rotated + assert rotated.startswith("Y" * 300) + + def test_dry_run_makes_no_filesystem_changes(tmp_path: Path) -> None: scandir = tmp_path / "run-service"; scandir.mkdir() profile = _make_profile(tmp_path, "coder", state="running", with_pid=True)