From e4ff4948604d8da69f95ed1e555678d23de1ca18 Mon Sep 17 00:00:00 2001 From: kshitij <82637225+kshitijk4poor@users.noreply.github.com> Date: Fri, 26 Jun 2026 04:30:13 +0530 Subject: [PATCH] fix(cron): add default retention to per-run job output (#52383) (#52646) * fix(cron): add default retention to per-run job output to bound disk usage (#52383) Per-run cron output (cron/output//.md) is written once per execution and was never pruned, so a frequently-scheduled job on a long-running deploy accumulates one file per run indefinitely and can fill the volume ('no space left on device'). save_job_output() now keeps the most recent N output files per job and removes older ones. N defaults to 50 and is configurable via cron.output_retention; a non-positive value disables pruning for operators who manage cleanup externally. Salvaged from #52402 by @0xDevNinja. Closes #52383 * fix(config): add cron.output_retention to DEFAULT_CONFIG Follow-up to #52383: the retention config key was functional via get()-with-default but missing from DEFAULT_CONFIG, so the deep-merge wouldn't auto-populate it for new installs. Add it explicitly. --------- Co-authored-by: 0xDevNinja --- cron/jobs.py | 57 ++++++++++++++++++++++++++++++-- hermes_cli/config.py | 4 +++ tests/cron/test_jobs.py | 72 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 3 deletions(-) diff --git a/cron/jobs.py b/cron/jobs.py index a3e5381c772..3cab8488d2f 100644 --- a/cron/jobs.py +++ b/cron/jobs.py @@ -1497,16 +1497,64 @@ def _get_due_jobs_locked() -> List[Dict[str, Any]]: return due +# Per-run cron output (`cron/output//.md`) is written once per +# execution. Unlike the quick-snapshot store (`hermes_cli.backup`, capped at 20) +# it had no retention, so a frequently-scheduled job on a long-running deploy +# accumulated one file per run forever and could fill the disk (#52383). Keep the +# most recent N files per job; a non-positive value disables pruning (opt-out). +_CRON_OUTPUT_DEFAULT_KEEP = 50 + + +def _cron_output_keep() -> int: + """Resolve the per-job output-file retention cap from config (``cron.output_retention``).""" + try: + from hermes_cli.config import load_config + cfg = load_config() or {} + cron_cfg = cfg.get("cron", {}) if isinstance(cfg, dict) else {} + return int(cron_cfg.get("output_retention", _CRON_OUTPUT_DEFAULT_KEEP)) + except Exception: + return _CRON_OUTPUT_DEFAULT_KEEP + + +def _prune_job_output(job_output_dir: Path, keep: int) -> int: + """Remove the oldest ``*.md`` run-output files beyond *keep*. Returns count deleted. + + Mirrors the quick-snapshot retention in ``hermes_cli.backup._prune_quick_snapshots``: + output filenames are timestamp-based (``%Y-%m-%d_%H-%M-%S.md``) so a reverse + lexical sort orders newest-first, and everything past *keep* is the tail to + drop. A non-positive *keep* disables pruning. Pruning failures are swallowed + so they can never break output saving. + """ + if keep <= 0: + return 0 + try: + files = sorted( + (f for f in job_output_dir.glob("*.md") if f.is_file()), + key=lambda f: f.name, + reverse=True, + ) + except OSError: + return 0 + deleted = 0 + for stale in files[keep:]: + try: + stale.unlink() + deleted += 1 + except OSError as exc: + logger.debug("Failed to prune cron output %s: %s", stale.name, exc) + return deleted + + def save_job_output(job_id: str, output: str): """Save job output to file.""" ensure_dirs() job_output_dir = _job_output_dir(job_id) job_output_dir.mkdir(parents=True, exist_ok=True) _secure_dir(job_output_dir) - + timestamp = _hermes_now().strftime("%Y-%m-%d_%H-%M-%S") output_file = job_output_dir / f"{timestamp}.md" - + fd, tmp_path = tempfile.mkstemp(dir=str(job_output_dir), suffix='.tmp', prefix='.output_') try: with os.fdopen(fd, 'w', encoding='utf-8') as f: @@ -1521,7 +1569,10 @@ def save_job_output(job_id: str, output: str): except OSError: pass raise - + + # Bound per-job output growth so long-running deploys don't fill the disk (#52383). + _prune_job_output(job_output_dir, _cron_output_keep()) + return output_file diff --git a/hermes_cli/config.py b/hermes_cli/config.py index ca522520c97..2d1985cabf0 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -2454,6 +2454,10 @@ DEFAULT_CONFIG = { # 1 = serial (pre-v0.9 behaviour). # Also overridable via HERMES_CRON_MAX_PARALLEL env var. "max_parallel_jobs": None, + # Per-job output-file retention: save_job_output keeps the N most + # recent .md files and prunes older ones. 0 or negative disables + # pruning (for operators who manage cleanup externally). Default 50. + "output_retention": 50, }, # Kanban multi-agent coordination — controls the dispatcher loop that diff --git a/tests/cron/test_jobs.py b/tests/cron/test_jobs.py index d7c4016cac9..9a182bf8cf2 100644 --- a/tests/cron/test_jobs.py +++ b/tests/cron/test_jobs.py @@ -1242,3 +1242,75 @@ class TestSaveJobOutput: with pytest.raises(ValueError, match="output path"): save_job_output(str(tmp_cron_dir / "outside"), "# Results") assert not (tmp_cron_dir / "outside").exists() + + +class TestCronOutputRetention: + """Per-run cron output must self-prune so long deploys don't fill the disk (#52383).""" + + @staticmethod + def _seed(d, count): + d.mkdir(parents=True, exist_ok=True) + names = [f"2026-06-25_10-00-{i:02d}.md" for i in range(count)] + for n in names: + (d / n).write_text("x") + return names + + def test_prune_keeps_newest_n(self, tmp_path): + from cron.jobs import _prune_job_output + d = tmp_path / "job" + names = self._seed(d, 10) + assert _prune_job_output(d, keep=3) == 7 + assert sorted(p.name for p in d.glob("*.md")) == names[-3:] + + def test_prune_noop_when_under_cap(self, tmp_path): + from cron.jobs import _prune_job_output + d = tmp_path / "job" + self._seed(d, 3) + assert _prune_job_output(d, keep=5) == 0 + assert len(list(d.glob("*.md"))) == 3 + + def test_prune_disabled_when_keep_non_positive(self, tmp_path): + from cron.jobs import _prune_job_output + d = tmp_path / "job" + self._seed(d, 5) + assert _prune_job_output(d, keep=0) == 0 + assert _prune_job_output(d, keep=-1) == 0 + assert len(list(d.glob("*.md"))) == 5 + + def test_prune_ignores_non_md_and_temp_files(self, tmp_path): + from cron.jobs import _prune_job_output + d = tmp_path / "job" + self._seed(d, 4) + (d / ".output_abc.tmp").write_text("partial") + (d / "manifest.json").write_text("{}") + _prune_job_output(d, keep=2) + assert (d / ".output_abc.tmp").exists() + assert (d / "manifest.json").exists() + assert len(list(d.glob("*.md"))) == 2 + + def test_save_job_output_prunes_old_runs(self, tmp_cron_dir, monkeypatch): + from cron.jobs import save_job_output, _job_output_dir + monkeypatch.setattr("cron.jobs._cron_output_keep", lambda: 3) + seq = iter( + datetime(2026, 6, 25, 10, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=i) + for i in range(8) + ) + monkeypatch.setattr("cron.jobs._hermes_now", lambda: next(seq)) + for _ in range(8): + save_job_output("job1", "report") + files = sorted(_job_output_dir("job1").glob("*.md")) + assert len(files) == 3 # only the 3 most-recent runs survive + + def test_cron_output_keep_reads_config(self, monkeypatch): + import cron.jobs as jobs + monkeypatch.setattr( + "hermes_cli.config.load_config", lambda: {"cron": {"output_retention": 7}} + ) + assert jobs._cron_output_keep() == 7 + + def test_cron_output_keep_defaults_on_bad_config(self, monkeypatch): + import cron.jobs as jobs + monkeypatch.setattr( + "hermes_cli.config.load_config", lambda: {"cron": {"output_retention": "oops"}} + ) + assert jobs._cron_output_keep() == jobs._CRON_OUTPUT_DEFAULT_KEEP